Skip to content

Commit d8c3900

Browse files
tswastgcf-owl-bot[bot]busunkim96parthea
authored
fix: correctly transform query job timeout configuration and exceptions (#492)
* fix: correctly transform query job timeout configuration and exceptions * unit tests for configuration transformations * add unit tests for timeout * link todo to relevant issue * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * use correct template for freezegun deps * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * decrease tick time in case multiple time calls are made * try no tick * add freezegun to conda deps * typo Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com> * typo Co-authored-by: Anthonios Partheniou <partheniou@google.com> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com> Co-authored-by: Anthonios Partheniou <partheniou@google.com>
1 parent 13010dd commit d8c3900

File tree

7 files changed

+141
-36
lines changed

7 files changed

+141
-36
lines changed

ci/requirements-3.7-0.24.2.conda

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ coverage
33
db-dtypes==0.3.1
44
fastavro
55
flake8
6+
freezegun
67
numpy==1.16.6
78
google-cloud-bigquery==1.27.2
89
google-cloud-bigquery-storage==1.1.0

ci/requirements-3.9-1.3.4.conda

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ coverage
33
db-dtypes
44
fastavro
55
flake8
6+
freezegun
67
google-cloud-bigquery
78
google-cloud-bigquery-storage
89
numpy

noxfile.py

+1
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def default(session):
9494
"-c",
9595
constraints_path,
9696
)
97+
session.install("freezegun", "-c", constraints_path)
9798

9899
if session.python == "3.9":
99100
extras = ""

owlbot.py

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
3939
system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
4040
cov_level=96,
41+
unit_test_external_dependencies=["freezegun"],
4142
unit_test_extras=extras,
4243
unit_test_extras_by_python=extras_by_python,
4344
system_test_extras=extras,

pandas_gbq/gbq.py

+68-23
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# Use of this source code is governed by a BSD-style
33
# license that can be found in the LICENSE file.
44

5+
import copy
6+
import concurrent.futures
57
from datetime import datetime
68
import logging
79
import re
@@ -378,6 +380,9 @@ def process_http_error(ex):
378380
# See `BigQuery Troubleshooting Errors
379381
# <https://cloud.google.com/bigquery/troubleshooting-errors>`__
380382

383+
if "cancelled" in ex.message:
384+
raise QueryTimeout("Reason: {0}".format(ex))
385+
381386
raise GenericGBQException("Reason: {0}".format(ex))
382387

383388
def download_table(
@@ -406,8 +411,41 @@ def download_table(
406411
user_dtypes=dtypes,
407412
)
408413

414+
def _wait_for_query_job(self, query_reply, timeout_ms):
415+
"""Wait for query to complete, pausing occasionally to update progress.
416+
417+
Args:
418+
query_reply (QueryJob):
419+
A query job which has started.
420+
421+
timeout_ms (Optional[int]):
422+
How long to wait before cancelling the query.
423+
"""
424+
# Wait at most 10 seconds so we can show progress.
425+
# TODO(https://github.com/googleapis/python-bigquery-pandas/issues/327):
426+
# Include a tqdm progress bar here instead of a stream of log messages.
427+
timeout_sec = 10.0
428+
if timeout_ms:
429+
timeout_sec = min(timeout_sec, timeout_ms / 1000.0)
430+
431+
while query_reply.state != "DONE":
432+
self.log_elapsed_seconds(" Elapsed", "s. Waiting...")
433+
434+
if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
435+
self.client.cancel_job(
436+
query_reply.job_id, location=query_reply.location
437+
)
438+
raise QueryTimeout("Query timeout: {} ms".format(timeout_ms))
439+
440+
try:
441+
query_reply.result(timeout=timeout_sec)
442+
except concurrent.futures.TimeoutError:
443+
# Use our own timeout logic
444+
pass
445+
except self.http_error as ex:
446+
self.process_http_error(ex)
447+
409448
def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
410-
from concurrent.futures import TimeoutError
411449
from google.auth.exceptions import RefreshError
412450
from google.cloud import bigquery
413451
import pandas
@@ -449,28 +487,11 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
449487
job_id = query_reply.job_id
450488
logger.debug("Job ID: %s" % job_id)
451489

452-
while query_reply.state != "DONE":
453-
self.log_elapsed_seconds(" Elapsed", "s. Waiting...")
454-
455-
timeout_ms = job_config.get("jobTimeoutMs") or job_config["query"].get(
456-
"timeoutMs"
457-
)
458-
timeout_ms = int(timeout_ms) if timeout_ms else None
459-
if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
460-
raise QueryTimeout("Query timeout: {} ms".format(timeout_ms))
461-
462-
timeout_sec = 1.0
463-
if timeout_ms:
464-
# Wait at most 1 second so we can show progress bar
465-
timeout_sec = min(1.0, timeout_ms / 1000.0)
466-
467-
try:
468-
query_reply.result(timeout=timeout_sec)
469-
except TimeoutError:
470-
# Use our own timeout logic
471-
pass
472-
except self.http_error as ex:
473-
self.process_http_error(ex)
490+
timeout_ms = job_config.get("jobTimeoutMs") or job_config["query"].get(
491+
"timeoutMs"
492+
)
493+
timeout_ms = int(timeout_ms) if timeout_ms else None
494+
self._wait_for_query_job(query_reply, timeout_ms)
474495

475496
if query_reply.cache_hit:
476497
logger.debug("Query done.\nCache hit.\n")
@@ -673,6 +694,28 @@ def _finalize_dtypes(
673694
return df
674695

675696

697+
def _transform_read_gbq_configuration(configuration):
698+
"""
699+
For backwards-compatibility, convert any previously client-side only
700+
parameters such as timeoutMs to the property name expected by the REST API.
701+
702+
Makes a copy of configuration if changes are needed.
703+
"""
704+
705+
if configuration is None:
706+
return None
707+
708+
timeout_ms = configuration.get("query", {}).get("timeoutMs")
709+
if timeout_ms is not None:
710+
# Transform timeoutMs to an actual server-side configuration.
711+
# https://github.com/googleapis/python-bigquery-pandas/issues/479
712+
configuration = copy.deepcopy(configuration)
713+
del configuration["query"]["timeoutMs"]
714+
configuration["jobTimeoutMs"] = timeout_ms
715+
716+
return configuration
717+
718+
676719
def read_gbq(
677720
query_or_table,
678721
project_id=None,
@@ -847,6 +890,8 @@ def read_gbq(
847890
if dialect not in ("legacy", "standard"):
848891
raise ValueError("'{0}' is not valid for dialect".format(dialect))
849892

893+
configuration = _transform_read_gbq_configuration(configuration)
894+
850895
if configuration and "query" in configuration and "query" in configuration["query"]:
851896
if query_or_table is not None:
852897
raise ValueError(

tests/system/test_gbq.py

+4-13
Original file line numberDiff line numberDiff line change
@@ -473,22 +473,13 @@ def test_configuration_raises_value_error_with_multiple_config(self, project_id)
473473

474474
def test_timeout_configuration(self, project_id):
475475
sql_statement = """
476-
SELECT
477-
SUM(bottles_sold) total_bottles,
478-
UPPER(category_name) category_name,
479-
magnitude,
480-
liquor.zip_code zip_code
481-
FROM `bigquery-public-data.iowa_liquor_sales.sales` liquor
482-
JOIN `bigquery-public-data.geo_us_boundaries.zip_codes` zip_codes
483-
ON liquor.zip_code = zip_codes.zip_code
484-
JOIN `bigquery-public-data.noaa_historic_severe_storms.tornado_paths` tornados
485-
ON liquor.date = tornados.storm_date
486-
WHERE ST_INTERSECTS(tornado_path_geom, zip_code_geom)
487-
GROUP BY category_name, magnitude, zip_code
488-
ORDER BY magnitude ASC, total_bottles DESC
476+
select count(*) from unnest(generate_array(1,1000000)), unnest(generate_array(1, 10000))
489477
"""
490478
configs = [
479+
# pandas-gbq timeout configuration. Transformed to REST API compatible version.
491480
{"query": {"useQueryCache": False, "timeoutMs": 1}},
481+
# REST API job timeout. See:
482+
# https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfiguration.FIELDS.job_timeout_ms
492483
{"query": {"useQueryCache": False}, "jobTimeoutMs": 1},
493484
]
494485
for config in configs:

tests/unit/test_gbq.py

+65
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44

55
# -*- coding: utf-8 -*-
66

7+
import concurrent.futures
78
import copy
89
import datetime
910
from unittest import mock
1011

12+
import freezegun
1113
import google.api_core.exceptions
1214
import numpy
1315
import pandas
@@ -114,6 +116,61 @@ def test__is_query(query_or_table, expected):
114116
assert result == expected
115117

116118

119+
@pytest.mark.parametrize(
120+
["original", "expected"],
121+
[
122+
(None, None),
123+
({}, {}),
124+
({"query": {"useQueryCache": False}}, {"query": {"useQueryCache": False}}),
125+
({"jobTimeoutMs": "1234"}, {"jobTimeoutMs": "1234"}),
126+
({"query": {"timeoutMs": "1234"}}, {"query": {}, "jobTimeoutMs": "1234"}),
127+
],
128+
)
129+
def test__transform_read_gbq_configuration_makes_copy(original, expected):
130+
should_change = original == expected
131+
got = gbq._transform_read_gbq_configuration(original)
132+
assert got == expected
133+
# Catch if we accidentally modified the original.
134+
did_change = original == got
135+
assert did_change == should_change
136+
137+
138+
def test__wait_for_query_job_exits_when_done(mock_bigquery_client):
139+
connector = _make_connector()
140+
connector.client = mock_bigquery_client
141+
connector.start = datetime.datetime(2020, 1, 1).timestamp()
142+
143+
mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob)
144+
type(mock_query).state = mock.PropertyMock(side_effect=("RUNNING", "DONE"))
145+
mock_query.result.side_effect = concurrent.futures.TimeoutError("fake timeout")
146+
147+
with freezegun.freeze_time("2020-01-01 00:00:00", tick=False):
148+
connector._wait_for_query_job(mock_query, 60)
149+
150+
mock_bigquery_client.cancel_job.assert_not_called()
151+
152+
153+
def test__wait_for_query_job_cancels_after_timeout(mock_bigquery_client):
154+
connector = _make_connector()
155+
connector.client = mock_bigquery_client
156+
connector.start = datetime.datetime(2020, 1, 1).timestamp()
157+
158+
mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob)
159+
mock_query.job_id = "a-random-id"
160+
mock_query.location = "job-location"
161+
mock_query.state = "RUNNING"
162+
mock_query.result.side_effect = concurrent.futures.TimeoutError("fake timeout")
163+
164+
with freezegun.freeze_time(
165+
"2020-01-01 00:00:00", auto_tick_seconds=15
166+
), pytest.raises(gbq.QueryTimeout):
167+
connector._wait_for_query_job(mock_query, 60)
168+
169+
mock_bigquery_client.cancel_job.assert_called_with(
170+
"a-random-id", location="job-location"
171+
)
172+
173+
117174
def test_GbqConnector_get_client_w_new_bq(mock_bigquery_client):
118175
gbq._test_google_api_imports()
119176
pytest.importorskip("google.api_core.client_info")
@@ -125,6 +182,14 @@ def test_GbqConnector_get_client_w_new_bq(mock_bigquery_client):
125182
assert kwargs["client_info"].user_agent == "pandas-{}".format(pandas.__version__)
126183

127184

185+
def test_GbqConnector_process_http_error_transforms_timeout():
186+
original = google.api_core.exceptions.GoogleAPICallError(
187+
"Job execution was cancelled: Job timed out after 0s"
188+
)
189+
with pytest.raises(gbq.QueryTimeout):
190+
gbq.GbqConnector.process_http_error(original)
191+
192+
128193
def test_to_gbq_should_fail_if_invalid_table_name_passed():
129194
with pytest.raises(gbq.NotFoundException):
130195
gbq.to_gbq(DataFrame([[1]]), "invalid_table_name", project_id="1234")

0 commit comments

Comments
 (0)