From 933d47067b23f62c25ba739a3a4722f3f28dae07 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Mon, 6 Dec 2021 14:13:48 -0600
Subject: [PATCH 01/35] fix: read out-of-bounds DATETIME values such as
 `0001-01-01 00:00:00`

deps: require google-cloud-bigquery 1.26.1 or later
---
 ci/requirements-3.7-0.24.2.conda |  5 ++--
 pandas_gbq/load.py               |  9 ++++--
 setup.py                         |  9 +++---
 testing/constraints-3.7.txt      |  6 ++--
 tests/system/test_to_gbq.py      | 48 ++++++++++++++++++++++++++++++++
 5 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/ci/requirements-3.7-0.24.2.conda b/ci/requirements-3.7-0.24.2.conda
index 82f4e7b9..430c742e 100644
--- a/ci/requirements-3.7-0.24.2.conda
+++ b/ci/requirements-3.7-0.24.2.conda
@@ -1,10 +1,11 @@
 codecov
 coverage
-db-dtypes==0.3.0
+db-dtypes==0.3.1
 fastavro
 flake8
 numpy==1.16.6
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.26.1
+google-cloud-bigquery-storage==1.1.0
 pyarrow==3.0.0
 pydata-google-auth
 pytest
diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py
index 5422402e..943c4f07 100644
--- a/pandas_gbq/load.py
+++ b/pandas_gbq/load.py
@@ -94,8 +94,13 @@ def cast_dataframe_for_parquet(
             # .astype() with DateDtype. With .astype(), I get the error:
             #
             # TypeError: Cannot interpret '' as a data type
-            cast_column = pandas.Series(
-                dataframe[column_name], dtype=db_dtypes.DateDtype()
+            cast_column = dataframe[column_name].astype(
+                dtype=db_dtypes.DateDtype(),
+                # Return the original column if there was an error converting
+                # to the dtype, such as if there is a date outside the
+                # supported range.
+                # https://github.com/googleapis/python-bigquery-pandas/issues/441
+                errors="ignore",
             )
         elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}:
             cast_column = dataframe[column_name].map(decimal.Decimal)
diff --git a/setup.py b/setup.py
index 28c81eee..a5b645e7 100644
--- a/setup.py
+++ b/setup.py
@@ -23,16 +23,17 @@
 release_status = "Development Status :: 4 - Beta"
 dependencies = [
     "setuptools",
-    "db-dtypes >=0.3.0,<2.0.0",
-    "numpy>=1.16.6",
-    "pandas>=0.24.2",
+    "db-dtypes >=0.3.1,<2.0.0",
+    "numpy >=1.16.6",
+    "pandas >=0.24.2",
     "pyarrow >=3.0.0, <7.0dev",
     "pydata-google-auth",
     "google-auth",
     "google-auth-oauthlib",
     # 2.4.* has a bug where waiting for the query can hang indefinitely.
     # https://github.com/pydata/pandas-gbq/issues/343
-    "google-cloud-bigquery[bqstorage,pandas]>=1.11.1,<3.0.0dev,!=2.4.*",
+    "google-cloud-bigquery >=1.26.1,<3.0.0dev,!=2.4.*",
+    "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev",
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",
diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt
index 7920656a..a5b04f0d 100644
--- a/testing/constraints-3.7.txt
+++ b/testing/constraints-3.7.txt
@@ -5,10 +5,10 @@
 #
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
-db-dtypes==0.3.0
-google-auth==1.4.1
+db-dtypes==0.3.1
+google-auth==1.18.0
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.26.1
 google-cloud-bigquery-storage==1.1.0
 numpy==1.16.6
 pandas==0.24.2
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index 4421f3be..046a2a86 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -188,6 +188,54 @@ def test_series_round_trip(
             {"name": "num_col", "type": "NUMERIC"},
         ],
     ),
+    pytest.param(
+        *DataFrameRoundTripTestCase(
+            input_df=pandas.DataFrame(
+                {
+                    "row_num": [1, 2, 3],
+                    # DATE values outside the pandas range for timestamp
+                    # aren't supported by the db-dtypes package.
+                    # https://github.com/googleapis/python-bigquery-pandas/issues/441
+                    "date_col": [
+                        datetime.date(1, 1, 1),
+                        datetime.date(1970, 1, 1),
+                        datetime.date(9999, 12, 31),
+                    ],
+                    # DATETIME values outside of the range for pandas timestamp
+                    # require `date_as_object` parameter in
+                    # google-cloud-bigquery versions 1.x and 2.x.
+                    # https://github.com/googleapis/python-bigquery-pandas/issues/365
+                    "datetime_col": [
+                        datetime.datetime(1, 1, 1),
+                        datetime.datetime(1970, 1, 1),
+                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
+                    ],
+                    "timestamp_col": [
+                        datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc),
+                        datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc),
+                        datetime.datetime(
+                            9999,
+                            12,
+                            31,
+                            23,
+                            59,
+                            59,
+                            999999,
+                            tzinfo=datetime.timezone.utc,
+                        ),
+                    ],
+                },
+                columns=["row_num", "date_col", "datetime_col", "timestamp_col"],
+            ),
+            table_schema=[
+                {"name": "row_num", "type": "INTEGER"},
+                {"name": "date_col", "type": "DATE"},
+                {"name": "datetime_col", "type": "DATETIME"},
+                {"name": "timestamp_col", "type": "TIMESTAMP"},
+            ],
+        ),
+        id="issue365-extreme-datetimes",
+    ),
 ]
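[Editor's note: the sketch below is not part of the patch series. It illustrates the `errors="ignore"` fallback that the `pandas_gbq/load.py` change above relies on. The sample values mirror the new test case; the exact exception behavior is an assumption about db-dtypes 0.3.x, so treat this as a sketch rather than repository code.]

    import datetime

    import db_dtypes  # registers the "dbdate" extension dtype with pandas
    import pandas

    # In-range dates cast cleanly to the db-dtypes DATE dtype.
    in_range = pandas.Series([datetime.date(1970, 1, 1)], dtype="object")
    assert str(in_range.astype(db_dtypes.DateDtype()).dtype) == "dbdate"

    # 0001-01-01 is outside the datetime64[ns] range that backs DateDtype, so
    # a strict cast raises. With errors="ignore", .astype() hands back the
    # original object column, and the load falls back to object serialization.
    out_of_range = pandas.Series(
        [datetime.date(1, 1, 1), datetime.date(9999, 12, 31)], dtype="object"
    )
    fallback = out_of_range.astype(db_dtypes.DateDtype(), errors="ignore")
    assert fallback.dtype == object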
From 9a9d3fda24d41457cb0ab1c803388e096ab6afcc Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Mon, 6 Dec 2021 16:22:01 -0600
Subject: [PATCH 02/35] feat: accepts a table ID, which downloads the table
 without a query
---
 pandas_gbq/gbq.py           | 38 +++++++++++++++++--------------------
 tests/system/conftest.py    | 19 +++++++++++++++++++
 tests/system/test_to_gbq.py | 19 +++++++------------
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 87c2327c..714c0995 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -374,7 +374,9 @@ def process_http_error(ex):
 
         raise GenericGBQException("Reason: {0}".format(ex))
 
-    def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
+    def run_query(
+        self, query_or_table, max_results=None, progress_bar_type=None, **kwargs
+    ):
         from concurrent.futures import TimeoutError
         from google.auth.exceptions import RefreshError
 
@@ -391,20 +393,20 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
             job_config.update(config)
 
             if "query" in config and "query" in config["query"]:
-                if query is not None:
+                if query_or_table is not None:
                     raise ValueError(
                         "Query statement can't be specified "
                         "inside config while it is specified "
                         "as parameter"
                     )
-                query = config["query"].pop("query")
+                query_or_table = config["query"].pop("query")
 
         self._start_timer()
 
         try:
             logger.debug("Requesting query... ")
             query_reply = self.client.query(
-                query,
+                query_or_table,
                 job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
                 location=self.location,
                 project=self.project_id,
@@ -639,7 +641,7 @@ def _cast_empty_df_dtypes(schema_fields, df):
 
 
 def read_gbq(
-    query,
+    query_or_table,
     project_id=None,
     index_col=None,
     col_order=None,
@@ -663,17 +665,18 @@ def read_gbq(
 
     This method uses the Google Cloud client library to make requests to
     Google BigQuery, documented `here `__.
 
     See the :ref:`How to authenticate with Google BigQuery ` guide for
     authentication instructions.
 
     Parameters
     ----------
-    query : str
-        SQL-Like Query to return data values.
+    query_or_table : str
+        SQL query to return data values. If the string is a table ID, fetch the
+        rows directly from the table without running a query.
     project_id : str, optional
-        Google BigQuery Account project ID. Optional when available from
+        Google Cloud Platform project ID. Optional when available from
         the environment.
     index_col : str, optional
         Name of result column to use for index in results DataFrame.
@@ -688,9 +691,9 @@ def read_gbq(
         when getting user credentials.
 
     .. _local webserver flow:
-        http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
+        https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
     .. _console flow:
-        http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
+        https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
 
         .. versionadded:: 0.2.0
     dialect : str, default 'standard'
@@ -740,13 +743,6 @@ def read_gbq(
         `__
         permission on the project you are billing queries to.
 
-    **Note:** Due to a `known issue in the ``google-cloud-bigquery``
-    package
-    `__
-    (fixed in version 1.11.0), you must write your query results to a
-    destination table. To do this with ``read_gbq``, supply a
-    ``configuration`` dictionary.
-
     This feature requires the ``google-cloud-bigquery-storage`` and
     ``pyarrow`` packages.
 
@@ -830,7 +826,7 @@ def read_gbq(
     )
 
     final_df = connector.run_query(
-        query,
+        query_or_table,
         configuration=configuration,
         max_results=max_results,
         progress_bar_type=progress_bar_type,
@@ -884,7 +880,7 @@ def to_gbq(
 
     This method uses the Google Cloud client library to make requests to
     Google BigQuery, documented `here `__.
 
     See the :ref:`How to authenticate with Google BigQuery ` guide for
     authentication instructions.
@@ -897,7 +893,7 @@ def to_gbq(
         Name of table to be written, in the form ``dataset.tablename`` or
         ``project.dataset.tablename``.
     project_id : str, optional
-        Google BigQuery Account project ID. Optional when available from
+        Google Cloud Platform project ID. Optional when available from
         the environment.
     chunksize : int, optional
         Number of rows to be inserted in each chunk from the dataframe.
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index 6ac55220..4ba8bf31 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -3,6 +3,7 @@
 # license that can be found in the LICENSE file.
 
 import os
+import functools
 import pathlib
 
 from google.cloud import bigquery
@@ -56,6 +57,24 @@ def project(project_id):
     return project_id
 
 
+@pytest.fixture
+def to_gbq(credentials, project_id):
+    import pandas_gbq
+
+    return functools.partial(
+        pandas_gbq.to_gbq, project_id=project_id, credentials=credentials
+    )
+
+
+@pytest.fixture
+def read_gbq(credentials, project_id):
+    import pandas_gbq
+
+    return functools.partial(
+        pandas_gbq.read_gbq, project_id=project_id, credentials=credentials
+    )
+
+
 @pytest.fixture()
 def random_dataset_id(bigquery_client: bigquery.Client, project_id: str):
     dataset_id = prefixer.create_prefix()
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index 4421f3be..f8d9c7f7 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -5,7 +5,6 @@
 import datetime
 import decimal
 import collections
-import functools
 import random
 
 import db_dtypes
@@ -23,12 +22,8 @@ def api_method(request):
 
 
 @pytest.fixture
-def method_under_test(credentials, project_id):
-    import pandas_gbq
-
-    return functools.partial(
-        pandas_gbq.to_gbq, project_id=project_id, credentials=credentials
-    )
+def method_under_test(to_gbq):
+    return to_gbq
 
 
 SeriesRoundTripTestCase = collections.namedtuple(
@@ -98,7 +93,7 @@ def method_under_test(credentials, project_id):
 def test_series_round_trip(
     method_under_test,
     random_dataset_id,
-    bigquery_client,
+    read_gbq,
     input_series,
     api_method,
     api_methods,
@@ -114,7 +109,7 @@ def test_series_round_trip(
     )
     method_under_test(df, table_id, api_method=api_method)
 
-    round_trip = bigquery_client.list_rows(table_id).to_dataframe()
+    round_trip = read_gbq(table_id)
     round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
     pandas.testing.assert_series_equal(
         round_trip_series, input_series, check_exact=True, check_names=False,
@@ -196,8 +191,8 @@ def test_series_round_trip(
 )
 def test_dataframe_round_trip_with_table_schema(
     method_under_test,
+    read_gbq,
     random_dataset_id,
-    bigquery_client,
     input_df,
     expected_df,
     table_schema,
@@ -212,8 +207,8 @@ def test_dataframe_round_trip_with_table_schema(
     method_under_test(
         input_df, table_id, table_schema=table_schema, api_method=api_method
     )
-    round_trip = bigquery_client.list_rows(table_id).to_dataframe(
-        dtypes=dict(zip(expected_df.columns, expected_df.dtypes))
+    round_trip = read_gbq(
+        table_id, dtypes=dict(zip(expected_df.columns, expected_df.dtypes)),
     )
     round_trip.sort_values("row_num", inplace=True)
     pandas.testing.assert_frame_equal(expected_df, round_trip)
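[Editor's note, not part of the patch: with the feature above in place, both calls below are intended to work. The project, dataset, and table names are placeholders.]

    import pandas_gbq

    # Runs a query job, exactly as before.
    df1 = pandas_gbq.read_gbq(
        "SELECT COUNT(*) AS row_count FROM `my-project.my_dataset.my_table`"
    )

    # A bare table ID (no whitespace) now downloads the table directly,
    # without starting a query job.
    df2 = pandas_gbq.read_gbq("my-project.my_dataset.my_table")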
From 2a76982df7cff48e58d8b1ad7eae19477665cb76 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Mon, 6 Dec 2021 16:28:43 -0600
Subject: [PATCH 03/35] revert tests for read_gbq fix which isn't yet resolved
---
 ci/requirements-3.7-0.24.2.conda |  3 +--
 setup.py                         |  3 +--
 testing/constraints-3.7.txt      |  4 +--
 tests/system/test_to_gbq.py      | 42 ++++++++++++++++----------------
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/ci/requirements-3.7-0.24.2.conda b/ci/requirements-3.7-0.24.2.conda
index 430c742e..e0323d92 100644
--- a/ci/requirements-3.7-0.24.2.conda
+++ b/ci/requirements-3.7-0.24.2.conda
@@ -4,8 +4,7 @@ db-dtypes==0.3.1
 fastavro
 flake8
 numpy==1.16.6
-google-cloud-bigquery==1.26.1
-google-cloud-bigquery-storage==1.1.0
+google-cloud-bigquery==1.11.1
 pyarrow==3.0.0
 pydata-google-auth
 pytest
diff --git a/setup.py b/setup.py
index a5b645e7..283e5ea8 100644
--- a/setup.py
+++ b/setup.py
@@ -32,8 +32,7 @@
     "google-auth-oauthlib",
     # 2.4.* has a bug where waiting for the query can hang indefinitely.
     # https://github.com/pydata/pandas-gbq/issues/343
-    "google-cloud-bigquery >=1.26.1,<3.0.0dev,!=2.4.*",
-    "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev",
+    "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<3.0.0dev,!=2.4.*",
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",
diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt
index a5b04f0d..6c3080dc 100644
--- a/testing/constraints-3.7.txt
+++ b/testing/constraints-3.7.txt
@@ -6,9 +6,9 @@
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
 db-dtypes==0.3.1
-google-auth==1.18.0
+google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==1.26.1
+google-cloud-bigquery==1.11.1
 google-cloud-bigquery-storage==1.1.0
 numpy==1.16.6
 pandas==0.24.2
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index 046a2a86..f7184024 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -201,29 +201,29 @@ def test_series_round_trip(
                         datetime.date(1970, 1, 1),
                         datetime.date(9999, 12, 31),
                     ],
-                    # DATETIME values outside of the range for pandas timestamp
-                    # require `date_as_object` parameter in
+                    # TODO: DATETIME/TIMESTAMP values outside of the range for
+                    # pandas timestamp require `date_as_object` parameter in
                     # google-cloud-bigquery versions 1.x and 2.x.
                     # https://github.com/googleapis/python-bigquery-pandas/issues/365
-                    "datetime_col": [
-                        datetime.datetime(1, 1, 1),
-                        datetime.datetime(1970, 1, 1),
-                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
-                    ],
-                    "timestamp_col": [
-                        datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc),
-                        datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc),
-                        datetime.datetime(
-                            9999,
-                            12,
-                            31,
-                            23,
-                            59,
-                            59,
-                            999999,
-                            tzinfo=datetime.timezone.utc,
-                        ),
-                    ],
+                    # "datetime_col": [
+                    #     datetime.datetime(1, 1, 1),
+                    #     datetime.datetime(1970, 1, 1),
+                    #     datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
+                    # ],
+                    # "timestamp_col": [
+                    #     datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc),
+                    #     datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc),
+                    #     datetime.datetime(
+                    #         9999,
+                    #         12,
+                    #         31,
+                    #         23,
+                    #         59,
+                    #         59,
+                    #         999999,
+                    #         tzinfo=datetime.timezone.utc,
+                    #     ),
+                    # ],
                 },
                 columns=["row_num", "date_col", "datetime_col", "timestamp_col"],
            ),

From 4695c5fb18337db4f5e94671b0682dddcfb921e5 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Mon, 6 Dec 2021 16:47:45 -0600
Subject: [PATCH 04/35] Revert "revert tests for read_gbq fix which isn't yet
 resolved"

This reverts commit 2a76982df7cff48e58d8b1ad7eae19477665cb76.
---
 ci/requirements-3.7-0.24.2.conda |  3 ++-
 setup.py                         |  3 ++-
 testing/constraints-3.7.txt      |  4 +--
 tests/system/test_to_gbq.py      | 42 ++++++++++++++++----------------
 4 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/ci/requirements-3.7-0.24.2.conda b/ci/requirements-3.7-0.24.2.conda
index e0323d92..430c742e 100644
--- a/ci/requirements-3.7-0.24.2.conda
+++ b/ci/requirements-3.7-0.24.2.conda
@@ -4,7 +4,8 @@ db-dtypes==0.3.1
 fastavro
 flake8
 numpy==1.16.6
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.26.1
+google-cloud-bigquery-storage==1.1.0
 pyarrow==3.0.0
 pydata-google-auth
 pytest
diff --git a/setup.py b/setup.py
index 283e5ea8..a5b645e7 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,8 @@
     "google-auth-oauthlib",
     # 2.4.* has a bug where waiting for the query can hang indefinitely.
     # https://github.com/pydata/pandas-gbq/issues/343
-    "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<3.0.0dev,!=2.4.*",
+    "google-cloud-bigquery >=1.26.1,<3.0.0dev,!=2.4.*",
+    "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev",
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",
diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt
index 6c3080dc..a5b04f0d 100644
--- a/testing/constraints-3.7.txt
+++ b/testing/constraints-3.7.txt
@@ -6,9 +6,9 @@
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
 db-dtypes==0.3.1
-google-auth==1.4.1
+google-auth==1.18.0
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.26.1
 google-cloud-bigquery-storage==1.1.0
 numpy==1.16.6
 pandas==0.24.2
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index f7184024..046a2a86 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -201,29 +201,29 @@ def test_series_round_trip(
                         datetime.date(1970, 1, 1),
                         datetime.date(9999, 12, 31),
                     ],
-                    # TODO: DATETIME/TIMESTAMP values outside of the range for
-                    # pandas timestamp require `date_as_object` parameter in
+                    # DATETIME values outside of the range for pandas timestamp
+                    # require `date_as_object` parameter in
                     # google-cloud-bigquery versions 1.x and 2.x.
                     # https://github.com/googleapis/python-bigquery-pandas/issues/365
-                    # "datetime_col": [
-                    #     datetime.datetime(1, 1, 1),
-                    #     datetime.datetime(1970, 1, 1),
-                    #     datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
-                    # ],
-                    # "timestamp_col": [
-                    #     datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc),
-                    #     datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc),
-                    #     datetime.datetime(
-                    #         9999,
-                    #         12,
-                    #         31,
-                    #         23,
-                    #         59,
-                    #         59,
-                    #         999999,
-                    #         tzinfo=datetime.timezone.utc,
-                    #     ),
-                    # ],
+                    "datetime_col": [
+                        datetime.datetime(1, 1, 1),
+                        datetime.datetime(1970, 1, 1),
+                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
+                    ],
+                    "timestamp_col": [
+                        datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc),
+                        datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc),
+                        datetime.datetime(
+                            9999,
+                            12,
+                            31,
+                            23,
+                            59,
+                            59,
+                            999999,
+                            tzinfo=datetime.timezone.utc,
+                        ),
+                    ],
                 },
                 columns=["row_num", "date_col", "datetime_col", "timestamp_col"],
            ),

From 6adf2332fa7726532872a68e3283e004f9c3c1db Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Mon, 6 Dec 2021 17:12:17 -0600
Subject: [PATCH 05/35] add todo for next steps
---
 pandas_gbq/gbq.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 714c0995..d2cc38f9 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -506,13 +506,18 @@ def _download_results(
             to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client
 
         try:
+            # TODO: This is the only difference between table ID and query job.
+            # But should I refactor for
+            # https://github.com/googleapis/python-bigquery-pandas/issues/339
+            # now?
             query_job.result()
             # Get the table schema, so that we can list rows.
             destination = self.client.get_table(query_job.destination)
             rows_iter = self.client.list_rows(destination, max_results=max_results)
-
             schema_fields = [field.to_api_repr() for field in rows_iter.schema]
             conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
+            # ENDTODO: This is the only difference between table ID and
+
             conversion_dtypes.update(user_dtypes)
             df = rows_iter.to_dataframe(
                 dtypes=conversion_dtypes,

From 9b1eb0dc709beccdd59058874d6b9a7339da5864 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 9 Dec 2021 14:28:34 -0600
Subject: [PATCH 06/35] add unit test for table ID read_gbq
---
 tests/unit/conftest.py | 17 ++++++++++++++---
 tests/unit/test_gbq.py | 19 ++++++++++++++++++-
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index cfa1e819..513df4b9 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -26,18 +26,29 @@ def mock_bigquery_client(monkeypatch):
     # Constructor returns the mock itself, so this mock can be treated as the
     # constructor or the instance.
     mock_client.return_value = mock_client
-    mock_schema = [google.cloud.bigquery.SchemaField("_f0", "INTEGER")]
-    # Mock out SELECT 1 query results.
+
     mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob)
     mock_query.job_id = "some-random-id"
     mock_query.state = "DONE"
     mock_rows = mock.create_autospec(google.cloud.bigquery.table.RowIterator)
     mock_rows.total_rows = 1
-    mock_rows.schema = mock_schema
+
     mock_rows.__iter__.return_value = [(1,)]
     mock_query.result.return_value = mock_rows
+    mock_client.list_rows.return_value = mock_rows
     mock_client.query.return_value = mock_query
     # Mock table creation.
     monkeypatch.setattr(google.cloud.bigquery, "Client", mock_client)
     mock_client.reset_mock()
+
+    # Mock out SELECT 1 query results.
+    def generate_schema():
+        query = mock_client.query.call_args[0][0]
+        if query == "SELECT 1 AS int_col":
+            return [google.cloud.bigquery.SchemaField("int_col", "INTEGER")]
+        else:
+            return [google.cloud.bigquery.SchemaField("_f0", "INTEGER")]
+
+    type(mock_rows).schema = mock.PropertyMock(side_effect=generate_schema)
+
     return mock_client
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index 8784a98b..bc12c47c 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -292,9 +292,10 @@ def test_read_gbq_with_no_project_id_given_should_fail(monkeypatch):
         gbq.read_gbq("SELECT 1", dialect="standard")
 
 
-def test_read_gbq_with_inferred_project_id(monkeypatch):
+def test_read_gbq_with_inferred_project_id(mock_bigquery_client):
     df = gbq.read_gbq("SELECT 1", dialect="standard")
     assert df is not None
+    mock_bigquery_client.query.assert_called_once()
 
 
 def test_read_gbq_with_inferred_project_id_from_service_account_credentials(
@@ -505,3 +506,19 @@ def test_read_gbq_calls_tqdm(mock_bigquery_client, mock_service_account_credenti
 
     _, to_dataframe_kwargs = mock_list_rows.to_dataframe.call_args
     assert to_dataframe_kwargs["progress_bar_type"] == "foobar"
+
+
+def test_read_gbq_bypasses_query_with_table_id(
+    mock_bigquery_client, mock_service_account_credentials
+):
+    mock_service_account_credentials.project_id = "service_account_project_id"
+    df = gbq.read_gbq(
+        "my-project.my_dataset.read_gbq_table",
+        credentials=mock_service_account_credentials,
+    )
+    assert df is not None
+
+    mock_bigquery_client.query.assert_not_called()
+    mock_bigquery_client.list_rows.assert_called_with(
+        "my-project.my_dataset.read_gbq_table"
+    )
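[Editor's note, not part of the patch: the conftest fixture above attaches `mock.PropertyMock(side_effect=...)` to the type so that the mocked `schema` is recomputed on every attribute access, letting it depend on whichever query the test most recently issued. A minimal standalone sketch of that pattern, with hypothetical names:]

    from unittest import mock

    class FakeRowIterator:
        schema = None

    accesses = []

    def generate_schema():
        # Called on each attribute access; its return value becomes the
        # property value, so it can inspect recent mock call state.
        accesses.append(1)
        return ["fake-schema-field"]

    iterator = FakeRowIterator()
    # A PropertyMock must be attached to the type, not the instance.
    type(iterator).schema = mock.PropertyMock(side_effect=generate_schema)

    assert iterator.schema == ["fake-schema-field"]
    assert len(accesses) == 1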
From ec9ddaff46e1911e24fbba4cfc66e743515dde63 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 9 Dec 2021 15:28:44 -0600
Subject: [PATCH 07/35] add helper for is_query
---
 pandas_gbq/gbq.py      |  5 +++++
 tests/unit/test_gbq.py | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index bba98f57..07fc6852 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -3,6 +3,7 @@
 # license that can be found in the LICENSE file.
 
 import logging
+import re
 import time
 import warnings
 from datetime import datetime
@@ -64,6 +65,10 @@ def _test_google_api_imports():
         raise ImportError("pandas-gbq requires google-cloud-bigquery") from ex
 
 
+def _is_query(query_or_table: str) -> bool:
+    return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None
+
+
 class DatasetCreationError(ValueError):
     """
     Raised when the create dataset method fails
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index bc12c47c..496486ef 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -82,6 +82,19 @@ def test__bqschema_to_nullsafe_dtypes(type_, expected):
     assert result == {"x": expected}
 
 
+@pytest.mark.parametrize(
+    ["query_or_table", "expected"],
+    [
+        ("SELECT 1", True),
+        ("dataset.table", False),
+        ("project-id.dataset.table", False),
+    ],
+)
+def test__is_query(query_or_table, expected):
+    result = gbq._is_query(query_or_table)
+    assert result == expected
+
+
 def test_GbqConnector_get_client_w_old_bq(monkeypatch, mock_bigquery_client):
     gbq._test_google_api_imports()
     connector = _make_connector()
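[Editor's note, not part of the patch: the `_is_query` helper added above treats any whitespace in the trimmed string as evidence of SQL, since a BigQuery table ID cannot contain whitespace. A quick demonstration of the heuristic:]

    import re

    def _is_query(query_or_table: str) -> bool:
        # Mirrors the helper above: whitespace anywhere in the trimmed
        # string means it cannot be a table ID, so treat it as SQL.
        return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None

    assert _is_query("SELECT 1")
    assert _is_query("select *\nfrom my_dataset.my_table")
    assert not _is_query("my_dataset.my_table")
    assert not _is_query("project-id.dataset.table")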
From 9cc7c74c3d2f76de9ba1beb6fd15156101717be6 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 10 Dec 2021 15:07:44 -0600
Subject: [PATCH 08/35] implement read_gbq with table id
---
 pandas_gbq/gbq.py      | 85 ++++++++++++++++++++++++++++--------------
 tests/unit/conftest.py |  8 +++-
 tests/unit/test_gbq.py | 37 +++++++++++++++++-
 3 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 07fc6852..6a8b6788 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -379,9 +379,26 @@ def process_http_error(ex):
 
         raise GenericGBQException("Reason: {0}".format(ex))
 
-    def run_query(
-        self, query_or_table, max_results=None, progress_bar_type=None, **kwargs
+    def download_table(
+        self, table_id, max_results=None, progress_bar_type=None, dtypes=None
     ):
+        self._start_timer()
+
+        try:
+            # Get the table schema, so that we can list rows.
+            destination = self.client.get_table(table_id)
+            rows_iter = self.client.list_rows(destination, max_results=max_results)
+        except self.http_error as ex:
+            self.process_http_error(ex)
+
+        return self._download_results(
+            rows_iter,
+            max_results=max_results,
+            progress_bar_type=progress_bar_type,
+            user_dtypes=dtypes,
+        )
+
+    def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
         from concurrent.futures import TimeoutError
         from google.auth.exceptions import RefreshError
 
@@ -397,21 +414,12 @@ def run_query(
         if config is not None:
             job_config.update(config)
 
-            if "query" in config and "query" in config["query"]:
-                if query_or_table is not None:
-                    raise ValueError(
-                        "Query statement can't be specified "
-                        "inside config while it is specified "
-                        "as parameter"
-                    )
-                query_or_table = config["query"].pop("query")
-
         self._start_timer()
 
         try:
             logger.debug("Requesting query... ")
             query_reply = self.client.query(
-                query_or_table,
+                query,
                 job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
                 location=self.location,
                 project=self.project_id,
@@ -471,15 +479,25 @@ def run_query(
         )
 
         dtypes = kwargs.get("dtypes")
+
+        # Ensure destination is populated.
+        try:
+            query_reply.result()
+        except self.http_error as ex:
+            self.process_http_error(ex)
+
+        # Get the table schema, so that we can list rows.
+        destination = self.client.get_table(query_reply.destination)
+        rows_iter = self.client.list_rows(destination, max_results=max_results)
         return self._download_results(
-            query_reply,
+            rows_iter,
             max_results=max_results,
             progress_bar_type=progress_bar_type,
             user_dtypes=dtypes,
        )
 
     def _download_results(
-        self, query_job, max_results=None, progress_bar_type=None, user_dtypes=None,
+        self, rows_iter, max_results=None, progress_bar_type=None, user_dtypes=None,
     ):
         # No results are desired, so don't bother downloading anything.
         if max_results == 0:
@@ -511,14 +529,6 @@ def _download_results(
             to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client
 
         try:
-            # TODO: This is the only difference between table ID and query job.
-            # But should I refactor for
-            # https://github.com/googleapis/python-bigquery-pandas/issues/339
-            # now?
-            query_job.result()
-            # Get the table schema, so that we can list rows.
-            destination = self.client.get_table(query_job.destination)
-            rows_iter = self.client.list_rows(destination, max_results=max_results)
             schema_fields = [field.to_api_repr() for field in rows_iter.schema]
             conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
             # ENDTODO: This is the only difference between table ID and
@@ -836,6 +839,15 @@ def read_gbq(
     if dialect not in ("legacy", "standard"):
         raise ValueError("'{0}' is not valid for dialect".format(dialect))
 
+    if configuration and "query" in configuration and "query" in configuration["query"]:
+        if query_or_table is not None:
+            raise ValueError(
+                "Query statement can't be specified "
+                "inside config while it is specified "
+                "as parameter"
+            )
+        query_or_table = configuration["query"].pop("query")
+
     connector = GbqConnector(
         project_id,
         reauth=reauth,
@@ -849,13 +861,21 @@ def read_gbq(
         use_bqstorage_api=use_bqstorage_api,
     )
 
-    final_df = connector.run_query(
-        query_or_table,
-        configuration=configuration,
-        max_results=max_results,
-        progress_bar_type=progress_bar_type,
-        dtypes=dtypes,
-    )
+    if _is_query(query_or_table):
+        final_df = connector.run_query(
+            query_or_table,
+            configuration=configuration,
+            max_results=max_results,
+            progress_bar_type=progress_bar_type,
+            dtypes=dtypes,
+        )
+    else:
+        final_df = connector.download_table(
+            query_or_table,
+            max_results=max_results,
+            progress_bar_type=progress_bar_type,
+            dtypes=dtypes,
+        )
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 513df4b9..3f0c5e53 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -43,7 +43,7 @@ def mock_bigquery_client(monkeypatch):
 
     # Mock out SELECT 1 query results.
     def generate_schema():
-        query = mock_client.query.call_args[0][0]
+        query = mock_client.query.call_args[0][0] if mock_client.query.call_args else ""
         if query == "SELECT 1 AS int_col":
             return [google.cloud.bigquery.SchemaField("int_col", "INTEGER")]
         else:
@@ -51,4 +51,10 @@ def generate_schema():
 
     type(mock_rows).schema = mock.PropertyMock(side_effect=generate_schema)
 
+    # Mock out get_table.
+    def get_table(table_ref_or_id, **kwargs):
+        return google.cloud.bigquery.Table(table_ref_or_id)
+
+    mock_client.get_table.side_effect = get_table
+
     return mock_client
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index 496486ef..480f0000 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -521,13 +521,14 @@ def test_read_gbq_calls_tqdm(mock_bigquery_client, mock_service_account_credenti
 
 
-def test_read_gbq_bypasses_query_with_table_id(
+def test_read_gbq_with_full_table_id(
     mock_bigquery_client, mock_service_account_credentials
 ):
     mock_service_account_credentials.project_id = "service_account_project_id"
     df = gbq.read_gbq(
         "my-project.my_dataset.read_gbq_table",
         credentials=mock_service_account_credentials,
+        project_id="param-project",
     )
     assert df is not None
 
@@ -535,3 +536,37 @@ def test_read_gbq_with_full_table_id(
     mock_bigquery_client.query.assert_not_called()
     mock_bigquery_client.list_rows.assert_called_with(
         "my-project.my_dataset.read_gbq_table"
     )
+
+
+def test_read_gbq_with_partial_table_id(
+    mock_bigquery_client, mock_service_account_credentials
+):
+    mock_service_account_credentials.project_id = "service_account_project_id"
+    df = gbq.read_gbq(
+        "my_dataset.read_gbq_table",
+        credentials=mock_service_account_credentials,
+        project_id="param-project",
+    )
+    assert df is not None
+
+    mock_bigquery_client.query.assert_not_called()
+    mock_bigquery_client.list_rows.assert_called_with(
+        "param-project.my_dataset.read_gbq_table"
+    )
+
+
+def test_read_gbq_bypasses_query_with_table_id_and_max_results(
+    mock_bigquery_client, mock_service_account_credentials
+):
+    mock_service_account_credentials.project_id = "service_account_project_id"
+    df = gbq.read_gbq(
+        "my-project.my_dataset.read_gbq_table",
+        credentials=mock_service_account_credentials,
+        max_results=11,
+    )
+    assert df is not None
+
+    mock_bigquery_client.query.assert_not_called()
+    mock_bigquery_client.list_rows.assert_called_with(
+        "my-project.my_dataset.read_gbq_table", max_results=11
+    )
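[Editor's note, not part of the patch: the partial-table-ID expectation above ("param-project.my_dataset.read_gbq_table") is what the following patch implements via `TableReference.from_string`. For illustration, with placeholder names:]

    from google.cloud import bigquery

    # TableReference.from_string fills in a default project for partial IDs,
    # which is how "my_dataset.read_gbq_table" resolves against "param-project".
    ref = bigquery.TableReference.from_string(
        "my_dataset.read_gbq_table", default_project="param-project"
    )
    assert ref.project == "param-project"
    assert ref.dataset_id == "my_dataset"
    assert ref.table_id == "read_gbq_table"

    # Fully-qualified IDs keep their own project.
    full = bigquery.TableReference.from_string("my-project.my_dataset.read_gbq_table")
    assert full.project == "my-project"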
@@ -38,7 +40,11 @@ def localize_df(df, schema_fields): if "mode" in field and field["mode"].upper() == "REPEATED": continue - if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None: + if ( + field["type"].upper() == "TIMESTAMP" + and pandas.api.types.is_datetime64_ns_dtype(df.dtypes[column]) + and df[column].dt.tz is None + ): df[column] = df[column].dt.tz_localize("UTC") return df diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 480f0000..7593eea5 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -8,6 +8,7 @@ import datetime from unittest import mock +from google.cloud import bigquery import numpy import pandas from pandas import DataFrame @@ -534,7 +535,7 @@ def test_read_gbq_with_full_table_id( mock_bigquery_client.query.assert_not_called() mock_bigquery_client.list_rows.assert_called_with( - "my-project.my_dataset.read_gbq_table" + bigquery.Table("my-project.my_dataset.read_gbq_table"), max_results=None, ) @@ -551,7 +552,7 @@ def test_read_gbq_with_partial_table_id( mock_bigquery_client.query.assert_not_called() mock_bigquery_client.list_rows.assert_called_with( - "param-project.my_dataset.read_gbq_table" + bigquery.Table("param-project.my_dataset.read_gbq_table"), max_results=None, ) @@ -568,5 +569,5 @@ def test_read_gbq_bypasses_query_with_table_id_and_max_results( mock_bigquery_client.query.assert_not_called() mock_bigquery_client.list_rows.assert_called_with( - "my-project.my_dataset.read_gbq_table", max_results=11 + bigquery.Table("my-project.my_dataset.read_gbq_table"), max_results=11 ) From e1ad679671f920b5964b7b987d9ceb3b36dca10e Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 10 Dec 2021 15:52:12 -0600 Subject: [PATCH 10/35] Update pandas_gbq/gbq.py --- pandas_gbq/gbq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 247df17c..fdd4dcc6 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -534,8 +534,6 @@ def _download_results( try: schema_fields = [field.to_api_repr() for field in rows_iter.schema] conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields) - # ENDTODO: This is the only difference between table ID and - conversion_dtypes.update(user_dtypes) df = rows_iter.to_dataframe( dtypes=conversion_dtypes, From d29bc2ac072f0c1673944557b0dc53c12487a99a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 10 Dec 2021 16:30:35 -0600 Subject: [PATCH 11/35] fix 3.7 unit tests --- noxfile.py | 2 +- tests/unit/test_gbq.py | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/noxfile.py b/noxfile.py index df3378bf..7530c68a 100644 --- a/noxfile.py +++ b/noxfile.py @@ -259,7 +259,7 @@ def cover(session): test runs (not system test runs), and then erases coverage data. 
""" session.install("coverage", "pytest-cov") - session.run("coverage", "report", "--show-missing", "--fail-under=88") + session.run("coverage", "report", "--show-missing", "--fail-under=91") session.run("coverage", "erase") diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 7593eea5..142771d1 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -8,7 +8,6 @@ import datetime from unittest import mock -from google.cloud import bigquery import numpy import pandas from pandas import DataFrame @@ -534,9 +533,10 @@ def test_read_gbq_with_full_table_id( assert df is not None mock_bigquery_client.query.assert_not_called() - mock_bigquery_client.list_rows.assert_called_with( - bigquery.Table("my-project.my_dataset.read_gbq_table"), max_results=None, - ) + sent_table = mock_bigquery_client.list_rows.call_args[0][0] + assert sent_table.project == "my-project" + assert sent_table.dataset_id == "my_dataset" + assert sent_table.table_id == "read_gbq_table" def test_read_gbq_with_partial_table_id( @@ -551,9 +551,10 @@ def test_read_gbq_with_partial_table_id( assert df is not None mock_bigquery_client.query.assert_not_called() - mock_bigquery_client.list_rows.assert_called_with( - bigquery.Table("param-project.my_dataset.read_gbq_table"), max_results=None, - ) + sent_table = mock_bigquery_client.list_rows.call_args[0][0] + assert sent_table.project == "param-project" + assert sent_table.dataset_id == "my_dataset" + assert sent_table.table_id == "read_gbq_table" def test_read_gbq_bypasses_query_with_table_id_and_max_results( @@ -568,6 +569,9 @@ def test_read_gbq_bypasses_query_with_table_id_and_max_results( assert df is not None mock_bigquery_client.query.assert_not_called() - mock_bigquery_client.list_rows.assert_called_with( - bigquery.Table("my-project.my_dataset.read_gbq_table"), max_results=11 - ) + sent_table = mock_bigquery_client.list_rows.call_args[0][0] + assert sent_table.project == "my-project" + assert sent_table.dataset_id == "my_dataset" + assert sent_table.table_id == "read_gbq_table" + sent_max_results = mock_bigquery_client.list_rows.call_args[1]["max_results"] + assert sent_max_results == 11 From cb8f24f5153535fdff344f2b3837b10222b4e322 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 10 Dec 2021 16:32:56 -0600 Subject: [PATCH 12/35] correct coverage --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 7530c68a..398b4dc2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -259,7 +259,7 @@ def cover(session): test runs (not system test runs), and then erases coverage data. 
""" session.install("coverage", "pytest-cov") - session.run("coverage", "report", "--show-missing", "--fail-under=91") + session.run("coverage", "report", "--show-missing", "--fail-under=89") session.run("coverage", "erase") From 56b73b213444955b28041f1822c6ceccee93916c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 10 Dec 2021 16:34:30 -0600 Subject: [PATCH 13/35] skip coverage for optional test skip --- tests/unit/test_gbq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 142771d1..0c27dd76 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -487,7 +487,7 @@ def test_read_gbq_passes_dtypes(mock_bigquery_client, mock_service_account_crede def test_read_gbq_use_bqstorage_api( mock_bigquery_client, mock_service_account_credentials ): - if not FEATURES.bigquery_has_bqstorage: + if not FEATURES.bigquery_has_bqstorage: # pragma: NO COVER pytest.skip("requires BigQuery Storage API") mock_service_account_credentials.project_id = "service_account_project_id" From 8a61e97e31d5fd5a29898554f52cb66c422f12e9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 10 Dec 2021 16:47:34 -0600 Subject: [PATCH 14/35] fix docs build --- pandas_gbq/gbq.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index fdd4dcc6..41cb2f5b 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -712,14 +712,14 @@ def read_gbq( reauth : boolean, default False Force Google BigQuery to re-authenticate the user. This is useful if multiple accounts are used. - auth_local_webserver : boolean, default False - Use the `local webserver flow`_ instead of the `console flow`_ - when getting user credentials. - - .. _local webserver flow: - https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server - .. _console flow: - https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + auth_local_webserver : bool, default False + Use the `local webserver flow + `_ + instead of the `console flow + `_ + when getting user credentials. Your code must run on the same machine + as your web browser and your web browser can access your application + via ``localhost:808X``. .. versionadded:: 0.2.0 dialect : str, default 'standard' @@ -954,13 +954,13 @@ def to_gbq( ``'append'`` If table exists, insert data. Create if does not exist. auth_local_webserver : bool, default False - Use the `local webserver flow`_ instead of the `console flow`_ - when getting user credentials. - - .. _local webserver flow: - http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server - .. _console flow: - http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + Use the `local webserver flow + `_ + instead of the `console flow + `_ + when getting user credentials. Your code must run on the same machine + as your web browser and your web browser can access your application + via ``localhost:808X``. .. 
versionadded:: 0.2.0 table_schema : list of dicts, optional From 3f7900bf184a10337c9bab19fa703211650da1df Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 10 Dec 2021 16:56:20 -0600 Subject: [PATCH 15/35] improve test coverage for error case --- tests/unit/test_gbq.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 0c27dd76..9a0e8ce3 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -8,6 +8,7 @@ import datetime from unittest import mock +import google.api_core.exceptions import numpy import pandas from pandas import DataFrame @@ -575,3 +576,17 @@ def test_read_gbq_bypasses_query_with_table_id_and_max_results( assert sent_table.table_id == "read_gbq_table" sent_max_results = mock_bigquery_client.list_rows.call_args[1]["max_results"] assert sent_max_results == 11 + + +def test_read_gbq_with_list_rows_error_translates_exception( + mock_bigquery_client, mock_service_account_credentials +): + mock_bigquery_client.list_rows.side_effect = ( + google.api_core.exceptions.NotFound("table not found"), + ) + + with pytest.raises(gbq.GenericGBQException, match="table not found"): + gbq.read_gbq( + "my-project.my_dataset.read_gbq_table", + credentials=mock_service_account_credentials, + ) From 3c53f1f697265a9034b00fadfe99f525100f8eae Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 13 Dec 2021 10:21:46 -0600 Subject: [PATCH 16/35] as of google-cloud-bigquery 1.11.0, get_table before list_rows is unnecessary --- pandas_gbq/gbq.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 41cb2f5b..1ba64057 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -385,12 +385,10 @@ def download_table( self._start_timer() try: - # Get the table schema, so that we can list rows. table_ref = bigquery.TableReference.from_string( table_id, default_project=self.project_id ) - destination = self.client.get_table(table_ref) - rows_iter = self.client.list_rows(destination, max_results=max_results) + rows_iter = self.client.list_rows(table_ref, max_results=max_results) except self.http_error as ex: self.process_http_error(ex) @@ -489,9 +487,9 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs): except self.http_error as ex: self.process_http_error(ex) - # Get the table schema, so that we can list rows. 
- destination = self.client.get_table(query_reply.destination) - rows_iter = self.client.list_rows(destination, max_results=max_results) + rows_iter = self.client.list_rows( + query_reply.destination, max_results=max_results + ) return self._download_results( rows_iter, max_results=max_results, From f0acde6dc47a4fa8da326f226b702ea8d4c19d9a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 13 Dec 2021 17:07:35 -0600 Subject: [PATCH 17/35] refactor tests --- ci/requirements-3.7-0.24.2.conda | 2 +- pandas_gbq/features.py | 8 + pandas_gbq/gbq.py | 6 +- setup.py | 2 +- testing/constraints-3.7.txt | 2 +- tests/system/test_gbq.py | 316 +-------------------------- tests/system/test_read_gbq.py | 359 +++++++++++++++++++++++++++++++ tests/system/test_to_gbq.py | 7 +- 8 files changed, 381 insertions(+), 321 deletions(-) create mode 100644 tests/system/test_read_gbq.py diff --git a/ci/requirements-3.7-0.24.2.conda b/ci/requirements-3.7-0.24.2.conda index 430c742e..a99bd59e 100644 --- a/ci/requirements-3.7-0.24.2.conda +++ b/ci/requirements-3.7-0.24.2.conda @@ -4,7 +4,7 @@ db-dtypes==0.3.1 fastavro flake8 numpy==1.16.6 -google-cloud-bigquery==1.26.1 +google-cloud-bigquery==1.27.2 google-cloud-bigquery-storage==1.1.0 pyarrow==3.0.0 pydata-google-auth diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py index 4259eaf1..cfaa0d1d 100644 --- a/pandas_gbq/features.py +++ b/pandas_gbq/features.py @@ -9,6 +9,7 @@ BIGQUERY_CLIENT_INFO_VERSION = "1.12.0" BIGQUERY_BQSTORAGE_VERSION = "1.24.0" BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0" +BIGQUERY_NO_DATE_AS_OBJECT_VERSION = "3.0.0dev" PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0" PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0" PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0" @@ -69,6 +70,13 @@ def bigquery_has_from_dataframe_with_csv(self): ) return self.bigquery_installed_version >= bigquery_from_dataframe_version + @property + def bigquery_needs_date_as_object(self): + import pkg_resources + + max_version = pkg_resources.parse_version(BIGQUERY_NO_DATE_AS_OBJECT_VERSION) + return self.bigquery_installed_version < max_version + @property def pandas_installed_version(self): import pandas diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 1ba64057..ee5c3589 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -528,6 +528,8 @@ def _download_results( to_dataframe_kwargs = {} if FEATURES.bigquery_has_bqstorage: to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client + if FEATURES.bigquery_needs_date_as_object: + to_dataframe_kwargs["date_as_object"] = True try: schema_fields = [field.to_api_repr() for field in rows_iter.schema] @@ -544,6 +546,8 @@ def _download_results( if df.empty: df = _cast_empty_df_dtypes(schema_fields, df) + # TODO: DATETIME/DATE column casts + # Ensure any TIMESTAMP columns are tz-aware. df = pandas_gbq.timestamp.localize_df(df, schema_fields) @@ -602,8 +606,6 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): # If you update this mapping, also update the table at # `docs/reading.rst`. dtype_map = { - "DATE": "datetime64[ns]", - "DATETIME": "datetime64[ns]", "FLOAT": np.dtype(float), "GEOMETRY": "object", "INTEGER": "Int64", diff --git a/setup.py b/setup.py index 9b9194f8..df89400b 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "google-auth-oauthlib", # 2.4.* has a bug where waiting for the query can hang indefinitely. 
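[Editor's note, not part of the patch: the simplification above works because `Client.list_rows()` accepts a table reference and fetches the schema itself, removing a `tables.get` round trip. A minimal sketch, with placeholder project and table names:]

    from google.cloud import bigquery

    client = bigquery.Client()
    table_ref = bigquery.TableReference.from_string("my-project.my_dataset.my_table")

    # Since google-cloud-bigquery 1.11.0, list_rows() can take a reference
    # directly, so the separate client.get_table() call is unnecessary.
    rows = client.list_rows(table_ref, max_results=10)
    df = rows.to_dataframe()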
# https://github.com/pydata/pandas-gbq/issues/343 - "google-cloud-bigquery >=1.26.1,<4.0.0dev,!=2.4.*", + "google-cloud-bigquery >=1.27.2,<4.0.0dev,!=2.4.*", "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev", ] extras = { diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index a5b04f0d..4c2d77da 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -8,7 +8,7 @@ db-dtypes==0.3.1 google-auth==1.18.0 google-auth-oauthlib==0.0.1 -google-cloud-bigquery==1.26.1 +google-cloud-bigquery==1.27.2 google-cloud-bigquery-storage==1.1.0 numpy==1.16.6 pandas==0.24.2 diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 812d2089..f2fda75e 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -11,7 +11,7 @@ import pandas import pandas.api.types import pandas.testing as tm -from pandas import DataFrame, NaT +from pandas import DataFrame try: import pkg_resources # noqa @@ -21,7 +21,6 @@ import pytz from pandas_gbq import gbq -from pandas_gbq.features import FEATURES import pandas_gbq.schema @@ -153,319 +152,6 @@ def setup(self, project, credentials): self.gbq_connector = gbq.GbqConnector(project, credentials=credentials) self.credentials = credentials - def test_should_properly_handle_empty_strings(self, project_id): - query = 'SELECT "" AS empty_string' - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal(df, DataFrame({"empty_string": [""]})) - - def test_should_properly_handle_null_strings(self, project_id): - query = "SELECT STRING(NULL) AS null_string" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal(df, DataFrame({"null_string": [None]})) - - def test_should_properly_handle_valid_integers(self, project_id): - query = "SELECT CAST(3 AS INT64) AS valid_integer" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="standard", - ) - tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}, dtype="Int64")) - - def test_should_properly_handle_nullable_integers(self, project_id): - query = """SELECT * FROM - UNNEST([1, NULL]) AS nullable_integer - """ - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="standard", - dtypes={"nullable_integer": "Int64"}, - ) - tm.assert_frame_equal( - df, - DataFrame({"nullable_integer": pandas.Series([1, None], dtype="Int64")}), - ) - - def test_should_properly_handle_valid_longs(self, project_id): - query = "SELECT 1 << 62 AS valid_long" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="standard", - ) - tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}, dtype="Int64")) - - def test_should_properly_handle_nullable_longs(self, project_id): - query = """SELECT * FROM - UNNEST([1 << 62, NULL]) AS nullable_long - """ - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="standard", - dtypes={"nullable_long": "Int64"}, - ) - tm.assert_frame_equal( - df, - DataFrame({"nullable_long": pandas.Series([1 << 62, None], dtype="Int64")}), - ) - - def test_should_properly_handle_null_integers(self, project_id): - query = "SELECT CAST(NULL AS INT64) AS null_integer" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="standard", - dtypes={"null_integer": "Int64"}, - ) - tm.assert_frame_equal( - df, 
DataFrame({"null_integer": pandas.Series([None], dtype="Int64")}), - ) - - def test_should_properly_handle_valid_floats(self, project_id): - from math import pi - - query = "SELECT PI() AS valid_float" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal(df, DataFrame({"valid_float": [pi]})) - - def test_should_properly_handle_nullable_floats(self, project_id): - from math import pi - - query = """SELECT * FROM - (SELECT PI() AS nullable_float), - (SELECT NULL AS nullable_float)""" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal(df, DataFrame({"nullable_float": [pi, None]})) - - def test_should_properly_handle_valid_doubles(self, project_id): - from math import pi - - query = "SELECT PI() * POW(10, 307) AS valid_double" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal(df, DataFrame({"valid_double": [pi * 10 ** 307]})) - - def test_should_properly_handle_nullable_doubles(self, project_id): - from math import pi - - query = """SELECT * FROM - (SELECT PI() * POW(10, 307) AS nullable_double), - (SELECT NULL AS nullable_double)""" - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal( - df, DataFrame({"nullable_double": [pi * 10 ** 307, None]}) - ) - - def test_should_properly_handle_null_floats(self, project_id): - query = """SELECT null_float - FROM UNNEST(ARRAY[NULL, 1.0]) AS null_float - """ - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="standard", - ) - tm.assert_frame_equal(df, DataFrame({"null_float": [np.nan, 1.0]})) - - def test_should_properly_handle_date(self, project_id): - query = "SELECT DATE(2003, 1, 4) AS date_col" - df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,) - expected = DataFrame( - { - "date_col": pandas.Series( - [datetime.date(2003, 1, 4)], dtype="datetime64[ns]" - ) - }, - ) - tm.assert_frame_equal(df, expected) - - def test_should_properly_handle_time(self, project_id): - query = ( - "SELECT TIME_ADD(TIME(3, 14, 15), INTERVAL 926589 MICROSECOND) AS time_col" - ) - df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,) - expected = DataFrame( - { - "time_col": pandas.Series( - [datetime.time(3, 14, 15, 926589)], dtype="object" - ) - }, - ) - tm.assert_frame_equal(df, expected) - - def test_should_properly_handle_timestamp_unix_epoch(self, project_id): - query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - expected = DataFrame( - {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]}, dtype="datetime64[ns]", - ) - if expected["unix_epoch"].dt.tz is None: - expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize("UTC") - tm.assert_frame_equal(df, expected) - - def test_should_properly_handle_arbitrary_timestamp(self, project_id): - query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - expected = DataFrame( - {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]}, - dtype="datetime64[ns]", - ) - if expected["valid_timestamp"].dt.tz is None: - expected["valid_timestamp"] = 
expected["valid_timestamp"].dt.tz_localize( - "UTC" - ) - tm.assert_frame_equal(df, expected) - - def test_should_properly_handle_datetime_unix_epoch(self, project_id): - query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch' - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal( - df, - DataFrame({"unix_epoch": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]"), - ) - - def test_should_properly_handle_arbitrary_datetime(self, project_id): - query = 'SELECT DATETIME("2004-09-15 05:00:00") AS valid_timestamp' - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal( - df, DataFrame({"valid_timestamp": [np.datetime64("2004-09-15T05:00:00")]}), - ) - - @pytest.mark.parametrize( - "expression, is_expected_dtype", - [ - ("current_date()", pandas.api.types.is_datetime64_ns_dtype), - ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype), - ("current_datetime()", pandas.api.types.is_datetime64_ns_dtype), - ("TRUE", pandas.api.types.is_bool_dtype), - ("FALSE", pandas.api.types.is_bool_dtype), - ], - ) - def test_return_correct_types(self, project_id, expression, is_expected_dtype): - """ - All type checks can be added to this function using additional - parameters, rather than creating additional functions. - We can consolidate the existing functions here in time - - TODO: time doesn't currently parse - ("time(12,30,00)", "[NULL, 1.0]) AS null_float +# """ +# df = gbq.read_gbq( +# query, +# project_id=project_id, +# credentials=self.credentials, +# dialect="standard", +# ) +# tm.assert_frame_equal(df, DataFrame({"null_float": [np.nan, 1.0]})) +# +# def test_should_properly_handle_date(self, project_id): +# query = "SELECT DATE(2003, 1, 4) AS date_col" +# df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,) +# expected = DataFrame( +# { +# "date_col": pandas.Series( +# [datetime.date(2003, 1, 4)], dtype="datetime64[ns]" +# ) +# }, +# ) +# tm.assert_frame_equal(df, expected) +# +# def test_should_properly_handle_time(self, project_id): +# query = ( +# "SELECT TIME_ADD(TIME(3, 14, 15), INTERVAL 926589 MICROSECOND) AS time_col" +# ) +# df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,) +# expected = DataFrame( +# { +# "time_col": pandas.Series( +# [datetime.time(3, 14, 15, 926589)], dtype="object" +# ) +# }, +# ) +# tm.assert_frame_equal(df, expected) +# +# def test_should_properly_handle_timestamp_unix_epoch(self, project_id): +# query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' +# df = gbq.read_gbq( +# query, +# project_id=project_id, +# credentials=self.credentials, +# dialect="legacy", +# ) +# expected = DataFrame( +# {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]}, dtype="datetime64[ns]", +# ) +# if expected["unix_epoch"].dt.tz is None: +# expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize("UTC") +# tm.assert_frame_equal(df, expected) +# +# def test_should_properly_handle_arbitrary_timestamp(self, project_id): +# query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' +# df = gbq.read_gbq( +# query, +# project_id=project_id, +# credentials=self.credentials, +# dialect="legacy", +# ) +# expected = DataFrame( +# {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]}, +# dtype="datetime64[ns]", +# ) +# if expected["valid_timestamp"].dt.tz is None: +# expected["valid_timestamp"] = expected["valid_timestamp"].dt.tz_localize( +# "UTC" +# ) 
+#     tm.assert_frame_equal(df, expected)
+#
+# def test_should_properly_handle_datetime_unix_epoch(self, project_id):
+#     query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch'
+#     df = gbq.read_gbq(
+#         query,
+#         project_id=project_id,
+#         credentials=self.credentials,
+#         dialect="legacy",
+#     )
+#     tm.assert_frame_equal(
+#         df,
+#         DataFrame({"unix_epoch": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]"),
+#     )
+#
+# def test_should_properly_handle_arbitrary_datetime(self, project_id):
+#     query = 'SELECT DATETIME("2004-09-15 05:00:00") AS valid_timestamp'
+#     df = gbq.read_gbq(
+#         query,
+#         project_id=project_id,
+#         credentials=self.credentials,
+#         dialect="legacy",
+#     )
+#     tm.assert_frame_equal(
+#         df, DataFrame({"valid_timestamp": [np.datetime64("2004-09-15T05:00:00")]}),
+#     )
+#
+# def test_should_properly_handle_null_timestamp(self, project_id):
+#     query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
+#     df = gbq.read_gbq(
+#         query,
+#         project_id=project_id,
+#         credentials=self.credentials,
+#         dialect="legacy",
+#     )
+#     expected = DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
+#     expected["null_timestamp"] = expected["null_timestamp"].dt.tz_localize("UTC")
+#     tm.assert_frame_equal(df, expected)
+#
+# def test_should_properly_handle_null_datetime(self, project_id):
+#     query = "SELECT CAST(NULL AS DATETIME) AS null_datetime"
+#     df = gbq.read_gbq(
+#         query,
+#         project_id=project_id,
+#         credentials=self.credentials,
+#         dialect="standard",
+#     )
+#     tm.assert_frame_equal(df, DataFrame({"null_datetime": [NaT]}))
+#
+# def test_should_properly_handle_null_boolean(self, project_id):
+#     query = "SELECT BOOLEAN(NULL) AS null_boolean"
+#     df = gbq.read_gbq(
+#         query,
+#         project_id=project_id,
+#         credentials=self.credentials,
+#         dialect="legacy",
+#     )
+#     expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None
+#     tm.assert_frame_equal(
+#         df, DataFrame({"null_boolean": [None]}, dtype=expected_dtype)
+#     )
+#
+# def test_should_properly_handle_nullable_booleans(self, project_id):
+#     query = """SELECT * FROM
+#         (SELECT BOOLEAN(TRUE) AS nullable_boolean),
+#         (SELECT NULL AS nullable_boolean)"""
+#     df = gbq.read_gbq(
+#         query,
+#         project_id=project_id,
+#         credentials=self.credentials,
+#         dialect="legacy",
+#     )
+#     expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None
+#     tm.assert_frame_equal(
+#         df, DataFrame({"nullable_boolean": [True, None]}, dtype=expected_dtype)
+#     )
+#

diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index d65bb3e4..f92da9e7 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -256,7 +256,12 @@ def test_dataframe_round_trip_with_table_schema(
         input_df, table_id, table_schema=table_schema, api_method=api_method
     )
     round_trip = read_gbq(
-        table_id, dtypes=dict(zip(expected_df.columns, expected_df.dtypes)),
+        table_id,
+        dtypes=dict(zip(expected_df.columns, expected_df.dtypes)),
+        # The BigQuery Storage API is required here to avoid an out-of-bounds
+        # date: a rounding error in older versions could add an extra day,
+        # which was fixed in google-cloud-bigquery 2.6.0.
+        # https://github.com/googleapis/python-bigquery/pull/402
+        use_bqstorage_api=True,
     )
     round_trip.sort_values("row_num", inplace=True)
     pandas.testing.assert_frame_equal(expected_df, round_trip)

From 362a26db51526e071252bdd029733bec878103d5 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 14 Dec 2021 10:48:14 -0600
Subject: [PATCH 18/35] add more scalars

---
 tests/system/test_read_gbq.py | 61 ++++++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py
index 89026495..b4438d7b 100644
--- a/tests/system/test_read_gbq.py
+++ b/tests/system/test_read_gbq.py
@@ -4,11 +4,12 @@

 import datetime

+import db_dtypes
 import pandas
 import pandas.testing
 import pytest

-# from pandas_gbq.features import FEATURES
+from pandas_gbq.features import FEATURES


 @pytest.mark.parametrize(["use_bqstorage_api"], [(True,), (False,)])
@@ -17,25 +18,55 @@
    [
        pytest.param(
            """
-    SELECT
-        date_col
-    FROM
-        UNNEST([
-            STRUCT(DATE(1998, 9, 4) AS date_col),
-            STRUCT(DATE(2011, 10, 1) AS date_col),
-            STRUCT(DATE(2018, 4, 11) AS date_col)
-        ])
+SELECT
+    bools.row_num AS row_num,
+    bool_col,
+    bytes_col,
+    date_col
+FROM
+    UNNEST([
+        STRUCT(1 AS row_num, TRUE AS bool_col),
+        STRUCT(2 AS row_num, FALSE AS bool_col),
+        STRUCT(3 AS row_num, TRUE AS bool_col) ]) AS `bools`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, CAST('C00010FF' AS BYTES FORMAT 'HEX') AS bytes_col),
+        STRUCT(2 AS row_num, CAST('F1AC' AS BYTES FORMAT 'HEX') AS bytes_col),
+        STRUCT(3 AS row_num, CAST('FFBADD11' AS BYTES FORMAT 'HEX') AS bytes_col) ]) AS `bytes`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, DATE(1998, 9, 4) AS date_col),
+        STRUCT(2 AS row_num, DATE(2011, 10, 1) AS date_col),
+        STRUCT(3 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates`
+WHERE
+    `bools`.row_num = `dates`.row_num
+    AND `bools`.row_num = `bytes`.row_num
 """,
            pandas.DataFrame(
                {
-                    "date_col": [
-                        datetime.date(1998, 9, 4),
-                        datetime.date(2011, 10, 1),
-                        datetime.date(2018, 4, 11),
-                    ]
+                    "row_num": pandas.Series([1, 2, 3], dtype="Int64"),
+                    "bool_col": pandas.Series(
+                        [True, False, True],
+                        dtype="boolean"
+                        if FEATURES.pandas_has_boolean_dtype
+                        else "bool",
+                    ),
+                    "bytes_col": [
+                        bytes.fromhex("C00010FF"),
+                        bytes.fromhex("F1AC"),
+                        bytes.fromhex("FFBADD11"),
+                    ],
+                    "date_col": pandas.Series(
+                        [
+                            datetime.date(1998, 9, 4),
+                            datetime.date(2011, 10, 1),
+                            datetime.date(2018, 4, 11),
+                        ],
+                        dtype=db_dtypes.DateDtype(),
+                    ),
                }
            ),
-            id="alltypes-nonnull-normal-range",
+            id="scalar-types-nonnull-normal-range",
        )
    ],
 )

From 752d67ce376019f701ee9c2f5022a69398e1bcbf Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 14 Dec 2021 14:03:53 -0600
Subject: [PATCH 19/35] add more types

---
 tests/system/test_read_gbq.py | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py
index b4438d7b..b917ff9e 100644
--- a/tests/system/test_read_gbq.py
+++ b/tests/system/test_read_gbq.py
@@ -3,6 +3,7 @@
 # license that can be found in the LICENSE file.
import datetime +import decimal import db_dtypes import pandas @@ -22,7 +23,9 @@ bools.row_num AS row_num, bool_col, bytes_col, - date_col + date_col, + datetime_col, + numeric_col FROM UNNEST([ STRUCT(1 AS row_num, TRUE AS bool_col), @@ -38,9 +41,21 @@ STRUCT(1 AS row_num, DATE(1998, 9, 4) AS date_col), STRUCT(2 AS row_num, DATE(2011, 10, 1) AS date_col), STRUCT(3 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, DATETIME('1998-09-04 12:34:56.789101') AS datetime_col), + STRUCT(2 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col), + STRUCT(3 AS row_num, DATETIME('2018-04-11 23:59:59.999999') AS datetime_col) ]) AS `datetimes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col), + STRUCT(2 AS row_num, CAST('-123.456789' AS NUMERIC) AS numeric_col), + STRUCT(3 AS row_num, CAST('999.999999' AS NUMERIC) AS numeric_col) ]) AS `numerics` WHERE `bools`.row_num = `dates`.row_num AND `bools`.row_num = `bytes`.row_num + AND `bools`.row_num = `datetimes`.row_num + AND `bools`.row_num = `numerics`.row_num """, pandas.DataFrame( { @@ -64,6 +79,19 @@ ], dtype=db_dtypes.DateDtype(), ), + "datetime_col": pandas.Series( + [ + "1998-09-04 12:34:56.789101", + "2011-10-01 00:01:02.345678", + "2018-04-11 23:59:59.999999", + ], + dtype="datetime64[ns]", + ), + "numeric_col": [ + decimal.Decimal("123.456789"), + decimal.Decimal("-123.456789"), + decimal.Decimal("999.999999"), + ], } ), id="scalar-types-nonnull-normal-range", @@ -81,8 +109,6 @@ def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): # ("current_date()", pandas.api.types.is_datetime64_ns_dtype), # ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype), # ("current_datetime()", pandas.api.types.is_datetime64_ns_dtype), -# ("TRUE", pandas.api.types.is_bool_dtype), -# ("FALSE", pandas.api.types.is_bool_dtype), # ], # ) # def test_return_correct_types(self, project_id, expression, is_expected_dtype): From 5b46127b95928519d116d90b08eb5832fdc8e76d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 15 Dec 2021 11:13:29 -0600 Subject: [PATCH 20/35] add failing time test --- tests/system/test_read_gbq.py | 85 +++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index b917ff9e..468829da 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -25,7 +25,10 @@ bytes_col, date_col, datetime_col, - numeric_col + float_col, + numeric_col, + string_col, + time_col FROM UNNEST([ STRUCT(1 AS row_num, TRUE AS bool_col), @@ -46,16 +49,35 @@ STRUCT(1 AS row_num, DATETIME('1998-09-04 12:34:56.789101') AS datetime_col), STRUCT(2 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col), STRUCT(3 AS row_num, DATETIME('2018-04-11 23:59:59.999999') AS datetime_col) ]) AS `datetimes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, 1.125 AS float_col), + STRUCT(2 AS row_num, -2.375 AS float_col), + STRUCT(3 AS row_num, 0.0 AS float_col) ]) AS `floats` INNER JOIN UNNEST([ STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col), STRUCT(2 AS row_num, CAST('-123.456789' AS NUMERIC) AS numeric_col), STRUCT(3 AS row_num, CAST('999.999999' AS NUMERIC) AS numeric_col) ]) AS `numerics` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, 'abcdefghijklmnopqrstuvwxyz' AS string_col), + STRUCT(2 AS row_num, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' AS string_col), + STRUCT(3 AS row_num, 'こんにちは' AS string_col) ]) AS 
`strings` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('00:00:00.000000' AS TIME) AS time_col), + STRUCT(2 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col), + STRUCT(3 AS row_num, CAST('23:59:59.999999' AS TIME) AS time_col) ]) AS `times` WHERE `bools`.row_num = `dates`.row_num AND `bools`.row_num = `bytes`.row_num AND `bools`.row_num = `datetimes`.row_num + AND `bools`.row_num = `floats`.row_num AND `bools`.row_num = `numerics`.row_num + AND `bools`.row_num = `strings`.row_num + AND `bools`.row_num = `times`.row_num +ORDER BY row_num ASC """, pandas.DataFrame( { @@ -87,15 +109,48 @@ ], dtype="datetime64[ns]", ), + "float_col": [1.125, -2.375, 0.0], "numeric_col": [ decimal.Decimal("123.456789"), decimal.Decimal("-123.456789"), decimal.Decimal("999.999999"), ], + "string_col": [ + "abcdefghijklmnopqrstuvwxyz", + "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + "こんにちは", + ], + "time_col": pandas.Series( + ["00:00:00.000000", "09:08:07.654321", "23:59:59.999999"], + dtype=db_dtypes.TimeDtype(), + ), } ), id="scalar-types-nonnull-normal-range", - ) + ), + pytest.param( + """ +SELECT + row_num, + time_col +FROM + UNNEST([ + STRUCT(1 AS row_num, CAST('00:00:00.000000' AS TIME) AS time_col), + STRUCT(2 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col), + STRUCT(3 AS row_num, CAST('23:59:59.999999' AS TIME) AS time_col) ]) AS `times` +ORDER BY row_num ASC + """, + pandas.DataFrame( + { + "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + "time_col": pandas.Series( + ["00:00:00.000000", "09:08:07.654321", "23:59:59.999999"], + dtype=db_dtypes.TimeDtype(), + ), + } + ), + id="times-nonnull-normal-range", + ), ], ) def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): @@ -103,12 +158,24 @@ def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): pandas.testing.assert_frame_equal(result, expected) +# TODO: skip BIGNUMERIC on versions of google-cloud-bigquery that don't support it +# pytest.param(..., marks=skipif...) 
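+# One workable shape (names here are illustrative; a feature flag like
+# this is added to FEATURES in the next commit of this series):
+#
+#     pytest.param(
+#         ...,
+#         marks=pytest.mark.skipif(
+#             not FEATURES.bigquery_has_bignumeric,
+#             reason="BIGNUMERIC requires google-cloud-bigquery 2.10.0+",
+#         ),
+#     )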
+# UNNEST([ +# STRUCT(1 AS row_num, CAST('123456789.123456789' AS BIGNUMERIC) AS bignumeric_col), +# STRUCT(2 AS row_num, CAST('-123456789.123456789' AS BIGNUMERIC) AS bignumeric_col), +# STRUCT(3 AS row_num, CAST('987654321.987654321' AS BIGNUMERIC) AS bignumeric_col) ]) AS `bignumerics` +# INNER JOIN +# "bignumeric_col": [ +# decimal.Decimal("123456789.123456789"), +# decimal.Decimal("-123456789.123456789"), +# decimal.Decimal("987654321.987654321"), +# ], + + # @pytest.mark.parametrize( # "expression, is_expected_dtype", # [ -# ("current_date()", pandas.api.types.is_datetime64_ns_dtype), # ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype), -# ("current_datetime()", pandas.api.types.is_datetime64_ns_dtype), # ], # ) # def test_return_correct_types(self, project_id, expression, is_expected_dtype): @@ -149,16 +216,6 @@ def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): # ) # tm.assert_frame_equal(df, DataFrame({"null_string": [None]})) # -# def test_should_properly_handle_valid_integers(self, project_id): -# query = "SELECT CAST(3 AS INT64) AS valid_integer" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="standard", -# ) -# tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}, dtype="Int64")) -# # def test_should_properly_handle_nullable_integers(self, project_id): # query = """SELECT * FROM # UNNEST([1, NULL]) AS nullable_integer From 254f6a0a18c981cdee32ca78bd0ecd1450fe7610 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 15 Dec 2021 16:09:23 -0600 Subject: [PATCH 21/35] add test for bignumeric --- pandas_gbq/features.py | 8 ++++++ pandas_gbq/gbq.py | 10 ------- tests/system/test_read_gbq.py | 54 +++++++++++++++++++---------------- tests/unit/test_features.py | 29 +++++++++++++++++++ 4 files changed, 67 insertions(+), 34 deletions(-) diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py index cfaa0d1d..34e23785 100644 --- a/pandas_gbq/features.py +++ b/pandas_gbq/features.py @@ -9,6 +9,7 @@ BIGQUERY_CLIENT_INFO_VERSION = "1.12.0" BIGQUERY_BQSTORAGE_VERSION = "1.24.0" BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0" +BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION = "2.10.0" BIGQUERY_NO_DATE_AS_OBJECT_VERSION = "3.0.0dev" PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0" PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0" @@ -52,6 +53,13 @@ def bigquery_has_client_info(self): ) return self.bigquery_installed_version >= bigquery_client_info_version + @property + def bigquery_has_bignumeric(self): + import pkg_resources + + min_version = pkg_resources.parse_version(BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION) + return self.bigquery_installed_version >= min_version + @property def bigquery_has_bqstorage(self): import pkg_resources diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index ee5c3589..7e0e1dde 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -607,17 +607,7 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): # `docs/reading.rst`. dtype_map = { "FLOAT": np.dtype(float), - "GEOMETRY": "object", "INTEGER": "Int64", - "RECORD": "object", - "STRING": "object", - # datetime.time objects cannot be case to datetime64. - # https://github.com/pydata/pandas-gbq/issues/328 - "TIME": "object", - # pandas doesn't support timezone-aware dtype in DataFrame/Series - # constructors. It's more idiomatic to localize after construction. - # https://github.com/pandas-dev/pandas/issues/25843 - "TIMESTAMP": "datetime64[ns]", } # Amend dtype_map with newer extension types if pandas version allows. 
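For context, the ``bigquery_has_bignumeric`` gate above boils down to a
parse-and-compare on the installed google-cloud-bigquery version, the same
pattern the other ``bigquery_has_*`` properties use. A minimal, standalone
sketch of that check follows; the helper name is illustrative and not part
of the patch:

    import pkg_resources

    import google.cloud.bigquery


    def supports_bignumeric():
        # BIGNUMERIC handling landed in google-cloud-bigquery 2.10.0.
        installed = pkg_resources.parse_version(google.cloud.bigquery.__version__)
        return installed >= pkg_resources.parse_version("2.10.0")
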
diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index 468829da..92da2232 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -28,7 +28,8 @@ float_col, numeric_col, string_col, - time_col + time_col, + timestamp_col FROM UNNEST([ STRUCT(1 AS row_num, TRUE AS bool_col), @@ -69,6 +70,11 @@ STRUCT(1 AS row_num, CAST('00:00:00.000000' AS TIME) AS time_col), STRUCT(2 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col), STRUCT(3 AS row_num, CAST('23:59:59.999999' AS TIME) AS time_col) ]) AS `times` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col), + STRUCT(2 AS row_num, TIMESTAMP('2011-10-01 00:01:02.345678') AS timestamp_col), + STRUCT(3 AS row_num, TIMESTAMP('2018-04-11 23:59:59.999999') AS timestamp_col) ]) AS `timestamps` WHERE `bools`.row_num = `dates`.row_num AND `bools`.row_num = `bytes`.row_num @@ -77,6 +83,7 @@ AND `bools`.row_num = `numerics`.row_num AND `bools`.row_num = `strings`.row_num AND `bools`.row_num = `times`.row_num + AND `bools`.row_num = `timestamps`.row_num ORDER BY row_num ASC """, pandas.DataFrame( @@ -124,6 +131,14 @@ ["00:00:00.000000", "09:08:07.654321", "23:59:59.999999"], dtype=db_dtypes.TimeDtype(), ), + "timestamp_col": pandas.Series( + [ + "1998-09-04 12:34:56.789101", + "2011-10-01 00:01:02.345678", + "2018-04-11 23:59:59.999999", + ], + dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), } ), id="scalar-types-nonnull-normal-range", @@ -132,24 +147,29 @@ """ SELECT row_num, - time_col + bignumeric_col FROM UNNEST([ - STRUCT(1 AS row_num, CAST('00:00:00.000000' AS TIME) AS time_col), - STRUCT(2 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col), - STRUCT(3 AS row_num, CAST('23:59:59.999999' AS TIME) AS time_col) ]) AS `times` + STRUCT(1 AS row_num, CAST('123456789.123456789' AS BIGNUMERIC) AS bignumeric_col), + STRUCT(2 AS row_num, CAST('-123456789.123456789' AS BIGNUMERIC) AS bignumeric_col), + STRUCT(3 AS row_num, CAST('987654321.987654321' AS BIGNUMERIC) AS bignumeric_col) ]) AS `bignumerics` ORDER BY row_num ASC """, pandas.DataFrame( { "row_num": pandas.Series([1, 2, 3], dtype="Int64"), - "time_col": pandas.Series( - ["00:00:00.000000", "09:08:07.654321", "23:59:59.999999"], - dtype=db_dtypes.TimeDtype(), - ), + "bignumeric_col": [ + decimal.Decimal("123456789.123456789"), + decimal.Decimal("-123456789.123456789"), + decimal.Decimal("987654321.987654321"), + ], } ), - id="times-nonnull-normal-range", + id="bignumeric-nonnull-normal-range", + marks=pytest.mark.skipif( + not FEATURES.bigquery_has_bignumeric, + reason="BIGNUMERIC not supported in this version of google-cloud-bigquery", + ), ), ], ) @@ -158,20 +178,6 @@ def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): pandas.testing.assert_frame_equal(result, expected) -# TODO: skip BIGNUMERIC on versions of google-cloud-bigquery that don't support it -# pytest.param(..., marks=skipif...) 
-#     UNNEST([
-#         STRUCT(1 AS row_num, CAST('123456789.123456789' AS BIGNUMERIC) AS bignumeric_col),
-#         STRUCT(2 AS row_num, CAST('-123456789.123456789' AS BIGNUMERIC) AS bignumeric_col),
-#         STRUCT(3 AS row_num, CAST('987654321.987654321' AS BIGNUMERIC) AS bignumeric_col) ]) AS `bignumerics`
-# INNER JOIN
-#     "bignumeric_col": [
-#         decimal.Decimal("123456789.123456789"),
-#         decimal.Decimal("-123456789.123456789"),
-#         decimal.Decimal("987654321.987654321"),
-#     ],
-
-
diff --git a/tests/unit/test_features.py b/tests/unit/test_features.py
index b10b0fa8..a64aea02 100644
--- a/tests/unit/test_features.py
+++ b/tests/unit/test_features.py
@@ -12,6 +12,24 @@ def fresh_bigquery_version(monkeypatch):
     monkeypatch.setattr(FEATURES, "_bigquery_installed_version", None)


+@pytest.mark.parametrize(
+    ["bigquery_version", "expected"],
+    [
+        ("1.11.1", False),
+        ("1.26.0", False),
+        ("2.9.999", False),
+        ("2.10.0", True),
+        ("2.12.0", True),
+        ("3.0.0", True),
+    ],
+)
+def test_bigquery_has_bignumeric(monkeypatch, bigquery_version, expected):
+    import google.cloud.bigquery
+
+    monkeypatch.setattr(google.cloud.bigquery, "__version__", bigquery_version)
+    assert FEATURES.bigquery_has_bignumeric == expected
+
+
 @pytest.mark.parametrize(
     ["bigquery_version", "expected"],
     [
@@ -28,3 +46,14 @@ def test_bigquery_has_from_dataframe_with_csv(monkeypatch, bigquery_version, exp

     monkeypatch.setattr(google.cloud.bigquery, "__version__", bigquery_version)
     assert FEATURES.bigquery_has_from_dataframe_with_csv == expected
+
+
+@pytest.mark.parametrize(
+    ["bigquery_version", "expected"],
+    [("1.26.0", True), ("2.12.0", True), ("3.0.0", False), ("3.1.0", False)],
+)
+def test_bigquery_needs_date_as_object(monkeypatch, bigquery_version, expected):
+    import google.cloud.bigquery
+
+    monkeypatch.setattr(google.cloud.bigquery, "__version__", bigquery_version)
+    assert FEATURES.bigquery_needs_date_as_object == expected

From c0780b61db6ce93bdb6d20fb45fefb96e86f6402 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 15 Dec 2021 16:48:57 -0600
Subject: [PATCH 22/35] add test for null values

---
 tests/system/test_read_gbq.py | 254 ++++++++++++++++++++++++++++++----
 1 file changed, 228 insertions(+), 26 deletions(-)

diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py
index 92da2232..441b1556 100644
--- a/tests/system/test_read_gbq.py
+++ b/tests/system/test_read_gbq.py
@@ -146,26 +146,251 @@
        pytest.param(
            """
 SELECT
-    row_num,
-    bignumeric_col
+    bools.row_num AS row_num,
+    bool_col,
+    bytes_col,
+    date_col,
+    datetime_col,
+    float_col,
+    int64_col,
+    numeric_col,
+    string_col,
+    time_col,
+    timestamp_col
 FROM
     UNNEST([
+        STRUCT(1 AS row_num, TRUE AS bool_col),
+        STRUCT(2 AS row_num, FALSE AS bool_col),
+        STRUCT(3 AS row_num, NULL AS bool_col) ]) AS `bools`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, NULL AS bytes_col),
+        STRUCT(2 AS row_num, CAST('F1AC' AS BYTES FORMAT 'HEX') AS bytes_col),
+        STRUCT(3 AS row_num, CAST('' AS BYTES FORMAT 'HEX') AS bytes_col) ]) AS `bytes`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, DATE(1998, 9, 4) AS date_col),
+        STRUCT(2 AS row_num, NULL AS date_col),
+        STRUCT(3 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, DATETIME('1998-09-04 12:34:56.789101') AS datetime_col),
+        STRUCT(2 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col),
+        STRUCT(3 AS row_num, NULL AS datetime_col) ]) AS `datetimes`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num,
NULL AS float_col), + STRUCT(2 AS row_num, -2.375 AS float_col), + STRUCT(3 AS row_num, 0.0 AS float_col) ]) AS `floats` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, -1 AS int64_col), + STRUCT(2 AS row_num, NULL AS int64_col), + STRUCT(3 AS row_num, 0 AS int64_col) ]) AS `int64s` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col), + STRUCT(2 AS row_num, NULL AS numeric_col), + STRUCT(3 AS row_num, CAST('999.999999' AS NUMERIC) AS numeric_col) ]) AS `numerics` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, '' AS string_col), + STRUCT(2 AS row_num, 'こんにちは' AS string_col), + STRUCT(3 AS row_num, NULL AS string_col) ]) AS `strings` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, NULL AS time_col), + STRUCT(2 AS row_num, CAST('00:00:00.000000' AS TIME) AS time_col), + STRUCT(3 AS row_num, CAST('23:59:59.999999' AS TIME) AS time_col) ]) AS `times` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col), + STRUCT(2 AS row_num, NULL AS timestamp_col), + STRUCT(3 AS row_num, TIMESTAMP('2018-04-11 23:59:59.999999') AS timestamp_col) ]) AS `timestamps` +WHERE + `bools`.row_num = `dates`.row_num + AND `bools`.row_num = `bytes`.row_num + AND `bools`.row_num = `datetimes`.row_num + AND `bools`.row_num = `floats`.row_num + AND `bools`.row_num = `int64s`.row_num + AND `bools`.row_num = `numerics`.row_num + AND `bools`.row_num = `strings`.row_num + AND `bools`.row_num = `times`.row_num + AND `bools`.row_num = `timestamps`.row_num +ORDER BY row_num ASC + """, + pandas.DataFrame( + { + "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + "bool_col": pandas.Series( + [True, False, None], + dtype="boolean" + if FEATURES.pandas_has_boolean_dtype + else "object", + ), + "bytes_col": [None, b"", bytes.fromhex("FFBADD11")], + "date_col": pandas.Series( + [datetime.date(1998, 9, 4), None, datetime.date(2018, 4, 11)], + dtype=db_dtypes.DateDtype(), + ), + "datetime_col": pandas.Series( + [ + "1998-09-04 12:34:56.789101", + "2011-10-01 00:01:02.345678", + None, + ], + dtype="datetime64[ns]", + ), + "float_col": [None, -2.375, 0.0], + "int64_col": pandas.Series([-1, None, 0], dtype="Int64"), + "numeric_col": [ + decimal.Decimal("123.456789"), + None, + decimal.Decimal("999.999999"), + ], + "string_col": ["", "こんにちは", None], + "time_col": pandas.Series( + [None, "00:00:00", "23:59:59.999999"], + dtype=db_dtypes.TimeDtype(), + ), + "timestamp_col": pandas.Series( + [ + "1998-09-04 12:34:56.789101", + None, + "2018-04-11 23:59:59.999999", + ], + dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + } + ), + id="scalar-types-nullable-normal-range", + ), + pytest.param( + """ +SELECT + bools.row_num AS row_num, + bool_col, + bytes_col, + date_col, + datetime_col, + float_col, + int64_col, + numeric_col, + string_col, + time_col, + timestamp_col +FROM + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS BOOL) AS bool_col) ]) AS `bools` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS BYTES) AS bytes_col) ]) AS `bytes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS DATE) AS date_col) ]) AS `dates` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS DATETIME) AS datetime_col) ]) AS `datetimes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS FLOAT64) AS float_col) ]) AS `floats` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS INT64) AS int64_col) ]) AS `int64s` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS NUMERIC) AS numeric_col) ]) AS `numerics` +INNER JOIN + 
UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS STRING) AS string_col) ]) AS `strings` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS TIME) AS time_col) ]) AS `times` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS TIMESTAMP) AS timestamp_col) ]) AS `timestamps` +WHERE + `bools`.row_num = `dates`.row_num + AND `bools`.row_num = `bytes`.row_num + AND `bools`.row_num = `datetimes`.row_num + AND `bools`.row_num = `floats`.row_num + AND `bools`.row_num = `int64s`.row_num + AND `bools`.row_num = `numerics`.row_num + AND `bools`.row_num = `strings`.row_num + AND `bools`.row_num = `times`.row_num + AND `bools`.row_num = `timestamps`.row_num +ORDER BY row_num ASC + """, + pandas.DataFrame( + { + "row_num": pandas.Series([1], dtype="Int64"), + "bool_col": pandas.Series( + [None], + dtype="boolean" + if FEATURES.pandas_has_boolean_dtype + else "object", + ), + "bytes_col": [None], + "date_col": pandas.Series([None], dtype=db_dtypes.DateDtype(),), + "datetime_col": pandas.Series([None], dtype="datetime64[ns]",), + "float_col": pandas.Series([None], dtype="float64"), + "int64_col": pandas.Series([None], dtype="Int64"), + "numeric_col": [None], + "string_col": [None], + "time_col": pandas.Series([None], dtype=db_dtypes.TimeDtype(),), + "timestamp_col": pandas.Series( + [None], dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + } + ), + id="scalar-types-null", + ), + pytest.param( + """ +SELECT + bignumerics.row_num AS row_num, + bignumeric_col, + nullable_col, + null_col FROM UNNEST([ STRUCT(1 AS row_num, CAST('123456789.123456789' AS BIGNUMERIC) AS bignumeric_col), STRUCT(2 AS row_num, CAST('-123456789.123456789' AS BIGNUMERIC) AS bignumeric_col), STRUCT(3 AS row_num, CAST('987654321.987654321' AS BIGNUMERIC) AS bignumeric_col) ]) AS `bignumerics` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('123456789.123456789' AS BIGNUMERIC) AS nullable_col), + STRUCT(2 AS row_num, NULL AS nullable_col), + STRUCT(3 AS row_num, CAST('987654321.987654321' AS BIGNUMERIC) AS nullable_col) ]) AS `nullables` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST(NULL AS BIGNUMERIC) AS null_col), + STRUCT(2 AS row_num, CAST(NULL AS BIGNUMERIC) AS null_col), + STRUCT(3 AS row_num, CAST(NULL AS BIGNUMERIC) AS null_col) ]) AS `nulls` +WHERE + `bignumerics`.row_num = `nullables`.row_num + AND `bignumerics`.row_num = `nulls`.row_num ORDER BY row_num ASC """, pandas.DataFrame( { "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + # TODO: Support a special (nullable) dtype for decimal data. 
+ # https://github.com/googleapis/python-db-dtypes-pandas/issues/49 "bignumeric_col": [ decimal.Decimal("123456789.123456789"), decimal.Decimal("-123456789.123456789"), decimal.Decimal("987654321.987654321"), ], + "nullable_col": [ + decimal.Decimal("123456789.123456789"), + None, + decimal.Decimal("987654321.987654321"), + ], + "null_col": [None, None, None], } ), - id="bignumeric-nonnull-normal-range", + id="bignumeric-normal-range", marks=pytest.mark.skipif( not FEATURES.bigquery_has_bignumeric, reason="BIGNUMERIC not supported in this version of google-cloud-bigquery", @@ -178,29 +403,6 @@ def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): pandas.testing.assert_frame_equal(result, expected) -# @pytest.mark.parametrize( -# "expression, is_expected_dtype", -# [ -# ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype), -# ], -# ) -# def test_return_correct_types(self, project_id, expression, is_expected_dtype): -# """ -# All type checks can be added to this function using additional -# parameters, rather than creating additional functions. -# We can consolidate the existing functions here in time -# -# TODO: time doesn't currently parse -# ("time(12,30,00)", " Date: Wed, 15 Dec 2021 16:56:45 -0600 Subject: [PATCH 23/35] add epoch timestamps to tests --- tests/system/test_read_gbq.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index 441b1556..4cc82ca6 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -169,12 +169,12 @@ STRUCT(3 AS row_num, CAST('' AS BYTES FORMAT 'HEX') AS bytes_co) ]) AS `bytes` INNER JOIN UNNEST([ - STRUCT(1 AS row_num, DATE(1998, 9, 4) AS date_col), + STRUCT(1 AS row_num, DATE(1970, 1, 1) AS date_col), STRUCT(2 AS row_num, NULL AS date_col), STRUCT(3 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates` INNER JOIN UNNEST([ - STRUCT(1 AS row_num, DATETIME('1998-09-04 12:34:56.789101') AS datetime_col), + STRUCT(1 AS row_num, DATETIME('1970-01-01 00:00:00.000000') AS datetime_col), STRUCT(2 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col), STRUCT(3 AS row_num, NULL AS datetime_col) ]) AS `datetimes` INNER JOIN @@ -204,7 +204,7 @@ STRUCT(3 AS row_num, CAST('23:59:59.999999' AS TIME) AS time_col) ]) AS `times` INNER JOIN UNNEST([ - STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col), + STRUCT(1 AS row_num, TIMESTAMP('1970-01-01 00:00:00.000000') AS timestamp_col), STRUCT(2 AS row_num, NULL AS timestamp_col), STRUCT(3 AS row_num, TIMESTAMP('2018-04-11 23:59:59.999999') AS timestamp_col) ]) AS `timestamps` WHERE @@ -228,14 +228,14 @@ if FEATURES.pandas_has_boolean_dtype else "object", ), - "bytes_col": [None, b"", bytes.fromhex("FFBADD11")], + "bytes_col": [None, bytes.fromhex("F1AC"), b""], "date_col": pandas.Series( - [datetime.date(1998, 9, 4), None, datetime.date(2018, 4, 11)], + [datetime.date(1970, 1, 1), None, datetime.date(2018, 4, 11)], dtype=db_dtypes.DateDtype(), ), "datetime_col": pandas.Series( [ - "1998-09-04 12:34:56.789101", + "1970-01-01 00:00:00.000000", "2011-10-01 00:01:02.345678", None, ], @@ -255,7 +255,7 @@ ), "timestamp_col": pandas.Series( [ - "1998-09-04 12:34:56.789101", + "1970-01-01 00:00:00.000000", None, "2018-04-11 23:59:59.999999", ], From b03443b21b5e4e983d7179de478f8f4effbb4441 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 16 Dec 2021 15:42:33 -0600 Subject: [PATCH 24/35] add post-download dtype conversions --- pandas_gbq/gbq.py 
| 63 ++++++++++++++++++++++++----------- tests/system/test_read_gbq.py | 1 + 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 7e0e1dde..3141bcaf 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -5,7 +5,9 @@ import logging import re import time +import typing import warnings +from typing import Any, Dict, Sequence from datetime import datetime import numpy as np @@ -29,6 +31,10 @@ import pandas_gbq.schema import pandas_gbq.timestamp +# Avoid circular imports by importing only during type checks. +if typing.TYPE_CHECKING: + import pandas + logger = logging.getLogger(__name__) @@ -543,13 +549,7 @@ def _download_results( except self.http_error as ex: self.process_http_error(ex) - if df.empty: - df = _cast_empty_df_dtypes(schema_fields, df) - - # TODO: DATETIME/DATE column casts - - # Ensure any TIMESTAMP columns are tz-aware. - df = pandas_gbq.timestamp.localize_df(df, schema_fields) + df = _finalize_dtypes(df, schema_fields) logger.debug("Got {} rows.\n".format(rows_iter.total_rows)) return df @@ -603,11 +603,18 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html #missing-data-casting-rules-and-indexing """ + import db_dtypes # TODO: add to _test_..._imports ? + # If you update this mapping, also update the table at # `docs/reading.rst`. dtype_map = { "FLOAT": np.dtype(float), "INTEGER": "Int64", + "TIME": db_dtypes.TimeDtype(), + # Note: Other types such as 'datetime64[ns]' and db_types.DateDtype() + # are not included because the pandas range does not align with the + # BigQuery range. We need to attempt a conversion to those types and + # fall back to 'object' when there are out-of-range values. } # Amend dtype_map with newer extension types if pandas version allows. @@ -630,28 +637,46 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): return dtypes -def _cast_empty_df_dtypes(schema_fields, df): - """Cast any columns in an empty dataframe to correct type. +def _finalize_dtypes( + df: "pandas.DataFrame", schema_fields: Sequence[Dict[str, Any]] +) -> "pandas.DataFrame": + """ + Attempt to change the dtypes of those columns that don't map exactly. - In an empty dataframe, pandas cannot choose a dtype unless one is - explicitly provided. The _bqschema_to_nullsafe_dtypes() function only - provides dtypes when the dtype safely handles null values. This means - that empty int64 and boolean columns are incorrectly classified as - ``object``. + For example db_dtypes.DateDtype() and datetime64[ns] cannot represent + 0001-01-01, but they can represent dates within a couple hundred years of + 1970. See: + https://github.com/googleapis/python-bigquery-pandas/issues/365 """ - if not df.empty: - raise ValueError("DataFrame must be empty in order to cast non-nullsafe dtypes") + import db_dtypes # TODO: add to _test_..._imports ? - dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64} + # If you update this mapping, also update the table at + # `docs/reading.rst`. + dtype_map = { + "DATE": db_dtypes.DateDtype(), + "DATETIME": "datetime64[ns]", + "TIMESTAMP": "datetime64[ns]", + } for field in schema_fields: - column = str(field["name"]) + # This method doesn't modify ARRAY/REPEATED columns. 
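+        # (REPEATED columns arrive with pandas' object dtype, each cell
+        # holding the repeated values as a list-like, so there is no
+        # scalar dtype to finalize for them.)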
        if field["mode"].upper() == "REPEATED":
            continue

+        name = str(field["name"])
        dtype = dtype_map.get(field["type"].upper())
-        if dtype:
-            df[column] = df[column].astype(dtype)
+        if dtype:
+            # TODO: don't cast TIMESTAMP if already the right type
+            # tests/system/test_read_gbq.py::test_default_dtypes[scalar-types-nullable-normal-range-False]
+            # /Users/swast/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:668:
+            # FutureWarning: Using .astype to convert from timezone-aware dtype
+            # to timezone-naive dtype is deprecated and will raise in a future
+            # version. Use obj.tz_localize(None) or
+            # obj.tz_convert('UTC').tz_localize(None) instead
+            df[name] = df[name].astype(dtype, errors="raise")  # TODO: errors="ignore"
+
+    # Ensure any TIMESTAMP columns are tz-aware.
+    df = pandas_gbq.timestamp.localize_df(df, schema_fields)

    return df

diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py
index 4cc82ca6..25a367fd 100644
--- a/tests/system/test_read_gbq.py
+++ b/tests/system/test_read_gbq.py
@@ -396,6 +396,7 @@
            reason="BIGNUMERIC not supported in this version of google-cloud-bigquery",
        ),
    ),
+    # TODO: test with extreme DATE/DATETIME/TIMESTAMPS
 ],
 )
 def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected):

From 11126a6e342422f530c57d4f59738231e4c5ad83 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 29 Dec 2021 15:21:57 -0600
Subject: [PATCH 25/35] add failing test for desired fix

---
 tests/system/test_read_gbq.py | 371 +++++++++------------------------
 1 file changed, 92 insertions(+), 279 deletions(-)

diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py
index 25a367fd..a68f5ceb 100644
--- a/tests/system/test_read_gbq.py
+++ b/tests/system/test_read_gbq.py
@@ -26,6 +26,7 @@
    date_col,
    datetime_col,
    float_col,
+    int64_col,
    numeric_col,
    string_col,
    time_col,
@@ -55,6 +56,13 @@
        STRUCT(1 AS row_num, 1.125 AS float_col),
        STRUCT(2 AS row_num, -2.375 AS float_col),
        STRUCT(3 AS row_num, 0.0 AS float_col) ]) AS `floats`
+INNER JOIN
+    UNNEST([
+        -- 2 ^ 63 - 1, but in hex to avoid intermediate overflow.
+        STRUCT(1 AS row_num, 0x7fffffffffffffff AS int64_col),
+        STRUCT(2 AS row_num, -1 AS int64_col),
+        -- -2 ^ 63, but in hex to avoid intermediate overflow.
+ STRUCT(3 AS row_num, -0x8000000000000000 AS int64_col) ]) AS `ints` INNER JOIN UNNEST([ STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col), @@ -80,6 +88,7 @@ AND `bools`.row_num = `bytes`.row_num AND `bools`.row_num = `datetimes`.row_num AND `bools`.row_num = `floats`.row_num + AND `bools`.row_num = `ints`.row_num AND `bools`.row_num = `numerics`.row_num AND `bools`.row_num = `strings`.row_num AND `bools`.row_num = `times`.row_num @@ -117,6 +126,9 @@ dtype="datetime64[ns]", ), "float_col": [1.125, -2.375, 0.0], + "int64_col": pandas.Series( + [(2 ** 63) - 1, -1, -(2 ** 63)], dtype="Int64" + ), "numeric_col": [ decimal.Decimal("123.456789"), decimal.Decimal("-123.456789"), @@ -396,287 +408,88 @@ reason="BIGNUMERIC not supported in this version of google-cloud-bigquery", ), ), - # TODO: test with extreme DATE/DATETIME/TIMESTAMPS + pytest.param( + """ +SELECT + dates.row_num AS row_num, + date_col, + datetime_col, + timestamp_col +FROM + UNNEST([ + STRUCT(1 AS row_num, DATE(1, 1, 1) AS date_col), + STRUCT(2 AS row_num, DATE(9999, 12, 31) AS date_col), + STRUCT(3 AS row_num, DATE(2262, 4, 12) AS date_col) ]) AS `dates` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, DATETIME('0001-01-01 00:00:00.000000') AS datetime_col), + STRUCT(2 AS row_num, DATETIME('9999-12-31 23:59:59.999999') AS datetime_col), + STRUCT(3 AS row_num, DATETIME('2262-04-11 23:47:16.854776') AS datetime_col) ]) AS `datetimes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, TIMESTAMP('0001-01-01 00:00:00.000000') AS timestamp_col), + STRUCT(2 AS row_num, TIMESTAMP('9999-12-31 23:59:59.999999') AS timestamp_col), + STRUCT(3 AS row_num, TIMESTAMP('2262-04-11 23:47:16.854776') AS timestamp_col) ]) AS `timestamps` +WHERE + `dates`.row_num = `datetimes`.row_num + AND `dates`.row_num = `timestamps`.row_num +ORDER BY row_num ASC + """, + pandas.DataFrame( + { + "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + "date_col": pandas.Series( + [ + datetime.date(1, 1, 1), + datetime.date(9999, 12, 31), + datetime.date(2262, 4, 12), + ], + dtype="object", + ), + "datetime_col": pandas.Series( + [ + datetime.datetime(1, 1, 1, 0, 0, 0, 0), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + # One microsecond more than pandas.Timestamp.max. + datetime.datetime(2262, 4, 11, 23, 47, 16, 854776), + ], + dtype="object", + ), + "timestamp_col": pandas.Series( + [ + datetime.datetime( + 1, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 9999, + 12, + 31, + 23, + 59, + 59, + 999999, + tzinfo=datetime.timezone.utc, + ), + # One microsecond more than pandas.Timestamp.max. 
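+                        # (pandas.Timestamp.max is 2262-04-11 23:47:16.854775807,
+                        # so this value cannot be represented as datetime64[ns].)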
+ datetime.datetime( + 2262, + 4, + 11, + 23, + 47, + 16, + 854776, + tzinfo=datetime.timezone.utc, + ), + ], + dtype="object", + ), + } + ), + id="issue365-extreme-datetimes", + ), ], ) def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): result = read_gbq(query, use_bqstorage_api=use_bqstorage_api) pandas.testing.assert_frame_equal(result, expected) - - -# -# def test_should_properly_handle_empty_strings(self, project_id): -# query = 'SELECT "" AS empty_string' -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal(df, DataFrame({"empty_string": [""]})) -# -# def test_should_properly_handle_null_strings(self, project_id): -# query = "SELECT STRING(NULL) AS null_string" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal(df, DataFrame({"null_string": [None]})) -# -# def test_should_properly_handle_nullable_integers(self, project_id): -# query = """SELECT * FROM -# UNNEST([1, NULL]) AS nullable_integer -# """ -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="standard", -# dtypes={"nullable_integer": "Int64"}, -# ) -# tm.assert_frame_equal( -# df, -# DataFrame({"nullable_integer": pandas.Series([1, None], dtype="Int64")}), -# ) -# -# def test_should_properly_handle_valid_longs(self, project_id): -# query = "SELECT 1 << 62 AS valid_long" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="standard", -# ) -# tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}, dtype="Int64")) -# -# def test_should_properly_handle_nullable_longs(self, project_id): -# query = """SELECT * FROM -# UNNEST([1 << 62, NULL]) AS nullable_long -# """ -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="standard", -# dtypes={"nullable_long": "Int64"}, -# ) -# tm.assert_frame_equal( -# df, -# DataFrame({"nullable_long": pandas.Series([1 << 62, None], dtype="Int64")}), -# ) -# -# def test_should_properly_handle_null_integers(self, project_id): -# query = "SELECT CAST(NULL AS INT64) AS null_integer" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="standard", -# dtypes={"null_integer": "Int64"}, -# ) -# tm.assert_frame_equal( -# df, DataFrame({"null_integer": pandas.Series([None], dtype="Int64")}), -# ) -# -# def test_should_properly_handle_valid_floats(self, project_id): -# from math import pi -# -# query = "SELECT PI() AS valid_float" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal(df, DataFrame({"valid_float": [pi]})) -# -# def test_should_properly_handle_nullable_floats(self, project_id): -# from math import pi -# -# query = """SELECT * FROM -# (SELECT PI() AS nullable_float), -# (SELECT NULL AS nullable_float)""" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal(df, DataFrame({"nullable_float": [pi, None]})) -# -# def test_should_properly_handle_valid_doubles(self, project_id): -# from math import pi -# -# query = "SELECT PI() * POW(10, 307) AS valid_double" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal(df, DataFrame({"valid_double": [pi 
* 10 ** 307]})) -# -# def test_should_properly_handle_nullable_doubles(self, project_id): -# from math import pi -# -# query = """SELECT * FROM -# (SELECT PI() * POW(10, 307) AS nullable_double), -# (SELECT NULL AS nullable_double)""" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal( -# df, DataFrame({"nullable_double": [pi * 10 ** 307, None]}) -# ) -# -# def test_should_properly_handle_null_floats(self, project_id): -# query = """SELECT null_float -# FROM UNNEST(ARRAY[NULL, 1.0]) AS null_float -# """ -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="standard", -# ) -# tm.assert_frame_equal(df, DataFrame({"null_float": [np.nan, 1.0]})) -# -# def test_should_properly_handle_date(self, project_id): -# query = "SELECT DATE(2003, 1, 4) AS date_col" -# df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,) -# expected = DataFrame( -# { -# "date_col": pandas.Series( -# [datetime.date(2003, 1, 4)], dtype="datetime64[ns]" -# ) -# }, -# ) -# tm.assert_frame_equal(df, expected) -# -# def test_should_properly_handle_time(self, project_id): -# query = ( -# "SELECT TIME_ADD(TIME(3, 14, 15), INTERVAL 926589 MICROSECOND) AS time_col" -# ) -# df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,) -# expected = DataFrame( -# { -# "time_col": pandas.Series( -# [datetime.time(3, 14, 15, 926589)], dtype="object" -# ) -# }, -# ) -# tm.assert_frame_equal(df, expected) -# -# def test_should_properly_handle_timestamp_unix_epoch(self, project_id): -# query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# expected = DataFrame( -# {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]}, dtype="datetime64[ns]", -# ) -# if expected["unix_epoch"].dt.tz is None: -# expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize("UTC") -# tm.assert_frame_equal(df, expected) -# -# def test_should_properly_handle_arbitrary_timestamp(self, project_id): -# query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# expected = DataFrame( -# {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]}, -# dtype="datetime64[ns]", -# ) -# if expected["valid_timestamp"].dt.tz is None: -# expected["valid_timestamp"] = expected["valid_timestamp"].dt.tz_localize( -# "UTC" -# ) -# tm.assert_frame_equal(df, expected) -# -# def test_should_properly_handle_datetime_unix_epoch(self, project_id): -# query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch' -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal( -# df, -# DataFrame({"unix_epoch": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]"), -# ) -# -# def test_should_properly_handle_arbitrary_datetime(self, project_id): -# query = 'SELECT DATETIME("2004-09-15 05:00:00") AS valid_timestamp' -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# tm.assert_frame_equal( -# df, DataFrame({"valid_timestamp": [np.datetime64("2004-09-15T05:00:00")]}), -# ) -# -# def test_should_properly_handle_null_timestamp(self, project_id): -# query = "SELECT TIMESTAMP(NULL) AS null_timestamp" -# df = gbq.read_gbq( -# query, 
-# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# expected = DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]") -# expected["null_timestamp"] = expected["null_timestamp"].dt.tz_localize("UTC") -# tm.assert_frame_equal(df, expected) -# -# def test_should_properly_handle_null_datetime(self, project_id): -# query = "SELECT CAST(NULL AS DATETIME) AS null_datetime" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="standard", -# ) -# tm.assert_frame_equal(df, DataFrame({"null_datetime": [NaT]})) -# -# def test_should_properly_handle_null_boolean(self, project_id): -# query = "SELECT BOOLEAN(NULL) AS null_boolean" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None -# tm.assert_frame_equal( -# df, DataFrame({"null_boolean": [None]}, dtype=expected_dtype) -# ) -# -# def test_should_properly_handle_nullable_booleans(self, project_id): -# query = """SELECT * FROM -# (SELECT BOOLEAN(TRUE) AS nullable_boolean), -# (SELECT NULL AS nullable_boolean)""" -# df = gbq.read_gbq( -# query, -# project_id=project_id, -# credentials=self.credentials, -# dialect="legacy", -# ) -# expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None -# tm.assert_frame_equal( -# df, DataFrame({"nullable_boolean": [True, None]}, dtype=expected_dtype) -# ) -# From 14e6070caeef1ddb091e278f67a21f2d45ba70ba Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 29 Dec 2021 16:33:50 -0600 Subject: [PATCH 26/35] fix the issue with extreme datetimes --- pandas_gbq/gbq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 0a4e7e47..948e3a85 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -684,7 +684,7 @@ def _finalize_dtypes( # to timezone-naive dtype is deprecated and will raise in a future # version. Use obj.tz_localize(None) or # obj.tz_convert('UTC').tz_localize(None) instead - df[name] = df[name].astype(dtype, errors="raise") # TODO: errors="ignore" + df[name] = df[name].astype(dtype, errors="ignore") # Ensure any TIMESTAMP columns are tz-aware. df = pandas_gbq.timestamp.localize_df(df, schema_fields) From 8f92d9bb6f72a3884684d3e3d5eb4520323e69b7 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 29 Dec 2021 17:09:07 -0600 Subject: [PATCH 27/35] fix constraints --- pandas_gbq/gbq.py | 13 +++++-------- setup.py | 2 +- testing/constraints-3.7.txt | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 948e3a85..8bb634d3 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -660,6 +660,7 @@ def _finalize_dtypes( https://github.com/googleapis/python-bigquery-pandas/issues/365 """ import db_dtypes # TODO: add to _test_..._imports ? + import pandas.api.types # If you update this mapping, also update the table at # `docs/reading.rst`. @@ -676,14 +677,10 @@ def _finalize_dtypes( name = str(field["name"]) dtype = dtype_map.get(field["type"].upper()) - if dtype: - # TODO: don't cast TIMESTAMP if already the right type - # tests/system/test_read_gbq.py::test_default_dtypes[scalar-types-nullable-normal-range-False] - # /Users/swast/src/github.com/googleapis/python-bigquery-pandas/pandas_gbq/gbq.py:668: - # FutureWarning: Using .astype to convert from timezone-aware dtype - # to timezone-naive dtype is deprecated and will raise in a future - # version. 
Use obj.tz_localize(None) or - # obj.tz_convert('UTC').tz_localize(None) instead + + # Avoid deprecated conversion to timezone-naive dtype deprecation by + # only casting object dtypes. + if dtype and pandas.api.types.is_object_dtype(df[name]): df[name] = df[name].astype(dtype, errors="ignore") # Ensure any TIMESTAMP columns are tz-aware. diff --git a/setup.py b/setup.py index 63df3a18..53a3624d 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ "pandas >=0.24.2", "pyarrow >=3.0.0, <7.0dev", "pydata-google-auth", - "google-api-core >=1.14.0", + "google-api-core >=1.21.0", "google-auth >=1.4.1", "google-auth-oauthlib >=0.0.1", # 2.4.* has a bug where waiting for the query can hang indefinitely. diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 7c5bc0f6..f0c9a4ac 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -6,7 +6,7 @@ # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", # Then this file should have foo==1.14.0 db-dtypes==0.3.1 -google-api-core==1.14.0 +google-api-core==1.21.0 google-auth==1.18.0 google-auth-oauthlib==0.0.1 google-cloud-bigquery==1.27.2 From 9985d150f2fa604ee184cc46eeab4710bfe21492 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 30 Dec 2021 11:13:20 -0600 Subject: [PATCH 28/35] fix tests for empty dataframe --- tests/system/test_gbq.py | 20 --------- tests/system/test_read_gbq.py | 81 +++++++++++++++++++++++++++++++++++ tests/system/test_to_gbq.py | 7 ++- 3 files changed, 87 insertions(+), 21 deletions(-) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index a7bdcffc..ec588a3e 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -305,26 +305,6 @@ def test_max_results(self, random_dataset, project_id): ) assert len(df) == 10 - def test_zero_rows(self, project_id): - # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 - df = gbq.read_gbq( - 'SELECT name, number, (mlc_class = "HU") is_hurricane, iso_time ' - "FROM `bigquery-public-data.noaa_hurricanes.hurricanes` " - 'WHERE iso_time = TIMESTAMP("1900-01-01 00:00:00") ', - project_id=project_id, - credentials=self.credentials, - ) - empty_columns = { - "name": pandas.Series([], dtype=object), - "number": pandas.Series([], dtype=np.dtype(int)), - "is_hurricane": pandas.Series([], dtype=np.dtype(bool)), - "iso_time": pandas.Series([], dtype="datetime64[ns]"), - } - expected_result = DataFrame( - empty_columns, columns=["name", "number", "is_hurricane", "iso_time"], - ) - tm.assert_frame_equal(df, expected_result, check_index_type=False) - def test_one_row_one_column(self, project_id): df = gbq.read_gbq( "SELECT 3 as v", diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index a68f5ceb..4d69d4ad 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -359,6 +359,87 @@ ), pytest.param( """ +SELECT + bools.row_num AS row_num, + bool_col, + bytes_col, + date_col, + datetime_col, + float_col, + int64_col, + numeric_col, + string_col, + time_col, + timestamp_col +FROM + UNNEST([ + STRUCT(1 AS row_num, TRUE AS bool_col) ]) AS `bools` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('F1AC' AS BYTES FORMAT 'HEX') AS bytes_col) ]) AS `bytes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col) ]) AS `datetimes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, -2.375 AS float_col) ]) AS `floats` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, 
1234 AS int64_col) ]) AS `int64s`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col) ]) AS `numerics`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, 'abcdefghijklmnopqrstuvwxyz' AS string_col) ]) AS `strings`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col) ]) AS `times`
+INNER JOIN
+    UNNEST([
+        STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col) ]) AS `timestamps`
+WHERE
+    `bools`.row_num = `dates`.row_num
+    AND `bools`.row_num = `bytes`.row_num
+    AND `bools`.row_num = `datetimes`.row_num
+    AND `bools`.row_num = `floats`.row_num
+    AND `bools`.row_num = `int64s`.row_num
+    AND `bools`.row_num = `numerics`.row_num
+    AND `bools`.row_num = `strings`.row_num
+    AND `bools`.row_num = `times`.row_num
+    AND `bools`.row_num = `timestamps`.row_num
+    AND `bools`.row_num = -1
+ORDER BY row_num ASC
+    """,
+            pandas.DataFrame(
+                {
+                    "row_num": pandas.Series([], dtype="Int64"),
+                    "bool_col": pandas.Series(
+                        [],
+                        dtype="boolean"
+                        if FEATURES.pandas_has_boolean_dtype
+                        else "object",
+                    ),
+                    "bytes_col": pandas.Series([], dtype="object"),
+                    "date_col": pandas.Series([], dtype=db_dtypes.DateDtype(),),
+                    "datetime_col": pandas.Series([], dtype="datetime64[ns]",),
+                    "float_col": pandas.Series([], dtype="float64"),
+                    "int64_col": pandas.Series([], dtype="Int64"),
+                    "numeric_col": pandas.Series([], dtype="object"),
+                    "string_col": pandas.Series([], dtype="object"),
+                    "time_col": pandas.Series([], dtype=db_dtypes.TimeDtype(),),
+                    "timestamp_col": pandas.Series(
+                        [], dtype="datetime64[ns]",
+                    ).dt.tz_localize(datetime.timezone.utc),
+                }
+            ),
+            id="scalar-types-empty-pandas-dev-issue10273",
+        ),
+        pytest.param(
+            """
 SELECT
    bignumerics.row_num AS row_num,
    bignumeric_col,
    nullable_col,
    null_col
 FROM
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index f92da9e7..ae3b8614 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -160,7 +160,12 @@ def test_series_round_trip(
            columns=["row_num", "date_col"],
        ),
        expected_df=pandas.DataFrame(
-            {"row_num": [123], "date_col": [datetime.date(2021, 12, 12)]},
+            {
+                "row_num": [123],
+                "date_col": pandas.Series(
+                    [datetime.date(2021, 12, 12)], dtype=db_dtypes.DateDtype()
+                ),
+            },
            columns=["row_num", "date_col"],
        ),
        table_schema=[

From 6fb73a20f5ac99ce78201c8ab92dfbdd982cfb66 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 30 Dec 2021 16:33:15 -0600
Subject: [PATCH 29/35] fix tests for older google-cloud-bigquery

---
 pandas_gbq/features.py        |   8 ++++++
 tests/system/test_read_gbq.py | 459 ++++++++++++++++++----------------
 tests/unit/test_features.py   |  18 ++
 3 files changed, 271 insertions(+), 214 deletions(-)

diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py
index 34e23785..77535041 100644
--- a/pandas_gbq/features.py
+++ b/pandas_gbq/features.py
@@ -8,6 +8,7 @@
 BIGQUERY_MINIMUM_VERSION = "1.11.1"
 BIGQUERY_CLIENT_INFO_VERSION = "1.12.0"
 BIGQUERY_BQSTORAGE_VERSION = "1.24.0"
+BIGQUERY_ACCURATE_TIMESTAMP_VERSION = "2.6.0"
 BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0"
 BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION = "2.10.0"
 BIGQUERY_NO_DATE_AS_OBJECT_VERSION = "3.0.0dev"
 PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
 PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
@@ -44,6 +45,13 @@ def bigquery_installed_version(self):

         return self._bigquery_installed_version

+    @property
+    def bigquery_has_accurate_timestamp(self):
+        import pkg_resources
+
+        min_version = pkg_resources.parse_version(BIGQUERY_ACCURATE_TIMESTAMP_VERSION)
+        return self.bigquery_installed_version >= min_version
+
     @property
     def bigquery_has_client_info(self):
         import pkg_resources

diff
--git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index 4d69d4ad..645ec4ed 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -2,6 +2,7 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. +import collections import datetime import decimal @@ -13,12 +14,20 @@ from pandas_gbq.features import FEATURES +QueryTestCase = collections.namedtuple( + "QueryTestCase", + ["query", "expected", "use_bqstorage_apis"], + defaults=[None, None, {True, False}], +) + + @pytest.mark.parametrize(["use_bqstorage_api"], [(True,), (False,)]) @pytest.mark.parametrize( - ["query", "expected"], + ["query", "expected", "use_bqstorage_apis"], [ pytest.param( - """ + *QueryTestCase( + query=""" SELECT bools.row_num AS row_num, bool_col, @@ -94,69 +103,71 @@ AND `bools`.row_num = `times`.row_num AND `bools`.row_num = `timestamps`.row_num ORDER BY row_num ASC - """, - pandas.DataFrame( - { - "row_num": pandas.Series([1, 2, 3], dtype="Int64"), - "bool_col": pandas.Series( - [True, False, True], - dtype="boolean" - if FEATURES.pandas_has_boolean_dtype - else "bool", - ), - "bytes_col": [ - bytes.fromhex("C00010FF"), - bytes.fromhex("F1AC"), - bytes.fromhex("FFBADD11"), - ], - "date_col": pandas.Series( - [ - datetime.date(1998, 9, 4), - datetime.date(2011, 10, 1), - datetime.date(2018, 4, 11), + """, + expected=pandas.DataFrame( + { + "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + "bool_col": pandas.Series( + [True, False, True], + dtype="boolean" + if FEATURES.pandas_has_boolean_dtype + else "bool", + ), + "bytes_col": [ + bytes.fromhex("C00010FF"), + bytes.fromhex("F1AC"), + bytes.fromhex("FFBADD11"), ], - dtype=db_dtypes.DateDtype(), - ), - "datetime_col": pandas.Series( - [ - "1998-09-04 12:34:56.789101", - "2011-10-01 00:01:02.345678", - "2018-04-11 23:59:59.999999", + "date_col": pandas.Series( + [ + datetime.date(1998, 9, 4), + datetime.date(2011, 10, 1), + datetime.date(2018, 4, 11), + ], + dtype=db_dtypes.DateDtype(), + ), + "datetime_col": pandas.Series( + [ + "1998-09-04 12:34:56.789101", + "2011-10-01 00:01:02.345678", + "2018-04-11 23:59:59.999999", + ], + dtype="datetime64[ns]", + ), + "float_col": [1.125, -2.375, 0.0], + "int64_col": pandas.Series( + [(2 ** 63) - 1, -1, -(2 ** 63)], dtype="Int64" + ), + "numeric_col": [ + decimal.Decimal("123.456789"), + decimal.Decimal("-123.456789"), + decimal.Decimal("999.999999"), ], - dtype="datetime64[ns]", - ), - "float_col": [1.125, -2.375, 0.0], - "int64_col": pandas.Series( - [(2 ** 63) - 1, -1, -(2 ** 63)], dtype="Int64" - ), - "numeric_col": [ - decimal.Decimal("123.456789"), - decimal.Decimal("-123.456789"), - decimal.Decimal("999.999999"), - ], - "string_col": [ - "abcdefghijklmnopqrstuvwxyz", - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "こんにちは", - ], - "time_col": pandas.Series( - ["00:00:00.000000", "09:08:07.654321", "23:59:59.999999"], - dtype=db_dtypes.TimeDtype(), - ), - "timestamp_col": pandas.Series( - [ - "1998-09-04 12:34:56.789101", - "2011-10-01 00:01:02.345678", - "2018-04-11 23:59:59.999999", + "string_col": [ + "abcdefghijklmnopqrstuvwxyz", + "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + "こんにちは", ], - dtype="datetime64[ns]", - ).dt.tz_localize(datetime.timezone.utc), - } + "time_col": pandas.Series( + ["00:00:00.000000", "09:08:07.654321", "23:59:59.999999"], + dtype=db_dtypes.TimeDtype(), + ), + "timestamp_col": pandas.Series( + [ + "1998-09-04 12:34:56.789101", + "2011-10-01 00:01:02.345678", + "2018-04-11 23:59:59.999999", + ], + dtype="datetime64[ns]", + 
).dt.tz_localize(datetime.timezone.utc), + } + ), ), id="scalar-types-nonnull-normal-range", ), pytest.param( - """ + *QueryTestCase( + query=""" SELECT bools.row_num AS row_num, bool_col, @@ -231,54 +242,60 @@ AND `bools`.row_num = `timestamps`.row_num ORDER BY row_num ASC """, - pandas.DataFrame( - { - "row_num": pandas.Series([1, 2, 3], dtype="Int64"), - "bool_col": pandas.Series( - [True, False, None], - dtype="boolean" - if FEATURES.pandas_has_boolean_dtype - else "object", - ), - "bytes_col": [None, bytes.fromhex("F1AC"), b""], - "date_col": pandas.Series( - [datetime.date(1970, 1, 1), None, datetime.date(2018, 4, 11)], - dtype=db_dtypes.DateDtype(), - ), - "datetime_col": pandas.Series( - [ - "1970-01-01 00:00:00.000000", - "2011-10-01 00:01:02.345678", - None, - ], - dtype="datetime64[ns]", - ), - "float_col": [None, -2.375, 0.0], - "int64_col": pandas.Series([-1, None, 0], dtype="Int64"), - "numeric_col": [ - decimal.Decimal("123.456789"), - None, - decimal.Decimal("999.999999"), - ], - "string_col": ["", "こんにちは", None], - "time_col": pandas.Series( - [None, "00:00:00", "23:59:59.999999"], - dtype=db_dtypes.TimeDtype(), - ), - "timestamp_col": pandas.Series( - [ - "1970-01-01 00:00:00.000000", + expected=pandas.DataFrame( + { + "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + "bool_col": pandas.Series( + [True, False, None], + dtype="boolean" + if FEATURES.pandas_has_boolean_dtype + else "object", + ), + "bytes_col": [None, bytes.fromhex("F1AC"), b""], + "date_col": pandas.Series( + [ + datetime.date(1970, 1, 1), + None, + datetime.date(2018, 4, 11), + ], + dtype=db_dtypes.DateDtype(), + ), + "datetime_col": pandas.Series( + [ + "1970-01-01 00:00:00.000000", + "2011-10-01 00:01:02.345678", + None, + ], + dtype="datetime64[ns]", + ), + "float_col": [None, -2.375, 0.0], + "int64_col": pandas.Series([-1, None, 0], dtype="Int64"), + "numeric_col": [ + decimal.Decimal("123.456789"), None, - "2018-04-11 23:59:59.999999", + decimal.Decimal("999.999999"), ], - dtype="datetime64[ns]", - ).dt.tz_localize(datetime.timezone.utc), - } + "string_col": ["", "こんにちは", None], + "time_col": pandas.Series( + [None, "00:00:00", "23:59:59.999999"], + dtype=db_dtypes.TimeDtype(), + ), + "timestamp_col": pandas.Series( + [ + "1970-01-01 00:00:00.000000", + None, + "2018-04-11 23:59:59.999999", + ], + dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + } + ), ), id="scalar-types-nullable-normal-range", ), pytest.param( - """ + *QueryTestCase( + query=""" SELECT bools.row_num AS row_num, bool_col, @@ -333,32 +350,34 @@ AND `bools`.row_num = `timestamps`.row_num ORDER BY row_num ASC """, - pandas.DataFrame( - { - "row_num": pandas.Series([1], dtype="Int64"), - "bool_col": pandas.Series( - [None], - dtype="boolean" - if FEATURES.pandas_has_boolean_dtype - else "object", - ), - "bytes_col": [None], - "date_col": pandas.Series([None], dtype=db_dtypes.DateDtype(),), - "datetime_col": pandas.Series([None], dtype="datetime64[ns]",), - "float_col": pandas.Series([None], dtype="float64"), - "int64_col": pandas.Series([None], dtype="Int64"), - "numeric_col": [None], - "string_col": [None], - "time_col": pandas.Series([None], dtype=db_dtypes.TimeDtype(),), - "timestamp_col": pandas.Series( - [None], dtype="datetime64[ns]", - ).dt.tz_localize(datetime.timezone.utc), - } + expected=pandas.DataFrame( + { + "row_num": pandas.Series([1], dtype="Int64"), + "bool_col": pandas.Series( + [None], + dtype="boolean" + if FEATURES.pandas_has_boolean_dtype + else "object", + ), + "bytes_col": [None], + 
"date_col": pandas.Series([None], dtype=db_dtypes.DateDtype(),), + "datetime_col": pandas.Series([None], dtype="datetime64[ns]",), + "float_col": pandas.Series([None], dtype="float64"), + "int64_col": pandas.Series([None], dtype="Int64"), + "numeric_col": [None], + "string_col": [None], + "time_col": pandas.Series([None], dtype=db_dtypes.TimeDtype(),), + "timestamp_col": pandas.Series( + [None], dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + } + ), ), id="scalar-types-null", ), pytest.param( - """ + *QueryTestCase( + query=""" SELECT bools.row_num AS row_num, bool_col, @@ -414,32 +433,34 @@ AND `bools`.row_num = -1 ORDER BY row_num ASC """, - pandas.DataFrame( - { - "row_num": pandas.Series([], dtype="Int64"), - "bool_col": pandas.Series( - [], - dtype="boolean" - if FEATURES.pandas_has_boolean_dtype - else "object", - ), - "bytes_col": pandas.Series([], dtype="object"), - "date_col": pandas.Series([], dtype=db_dtypes.DateDtype(),), - "datetime_col": pandas.Series([], dtype="datetime64[ns]",), - "float_col": pandas.Series([], dtype="float64"), - "int64_col": pandas.Series([], dtype="Int64"), - "numeric_col": pandas.Series([], dtype="object"), - "string_col": pandas.Series([], dtype="object"), - "time_col": pandas.Series([], dtype=db_dtypes.TimeDtype(),), - "timestamp_col": pandas.Series( - [], dtype="datetime64[ns]", - ).dt.tz_localize(datetime.timezone.utc), - } + expected=pandas.DataFrame( + { + "row_num": pandas.Series([], dtype="Int64"), + "bool_col": pandas.Series( + [], + dtype="boolean" + if FEATURES.pandas_has_boolean_dtype + else "bool", + ), + "bytes_col": pandas.Series([], dtype="object"), + "date_col": pandas.Series([], dtype=db_dtypes.DateDtype(),), + "datetime_col": pandas.Series([], dtype="datetime64[ns]",), + "float_col": pandas.Series([], dtype="float64"), + "int64_col": pandas.Series([], dtype="Int64"), + "numeric_col": pandas.Series([], dtype="object"), + "string_col": pandas.Series([], dtype="object"), + "time_col": pandas.Series([], dtype=db_dtypes.TimeDtype(),), + "timestamp_col": pandas.Series( + [], dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + } + ), ), - id="scalar-types-empyt-pandas-dev-issue10273", + id="scalar-types-empty-pandas-dev-issue10273", ), pytest.param( - """ + *QueryTestCase( + query=""" SELECT bignumerics.row_num AS row_num, bignumeric_col, @@ -465,23 +486,24 @@ AND `bignumerics`.row_num = `nulls`.row_num ORDER BY row_num ASC """, - pandas.DataFrame( - { - "row_num": pandas.Series([1, 2, 3], dtype="Int64"), - # TODO: Support a special (nullable) dtype for decimal data. - # https://github.com/googleapis/python-db-dtypes-pandas/issues/49 - "bignumeric_col": [ - decimal.Decimal("123456789.123456789"), - decimal.Decimal("-123456789.123456789"), - decimal.Decimal("987654321.987654321"), - ], - "nullable_col": [ - decimal.Decimal("123456789.123456789"), - None, - decimal.Decimal("987654321.987654321"), - ], - "null_col": [None, None, None], - } + expected=pandas.DataFrame( + { + "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + # TODO: Support a special (nullable) dtype for decimal data. 
+ # https://github.com/googleapis/python-db-dtypes-pandas/issues/49 + "bignumeric_col": [ + decimal.Decimal("123456789.123456789"), + decimal.Decimal("-123456789.123456789"), + decimal.Decimal("987654321.987654321"), + ], + "nullable_col": [ + decimal.Decimal("123456789.123456789"), + None, + decimal.Decimal("987654321.987654321"), + ], + "null_col": [None, None, None], + } + ), ), id="bignumeric-normal-range", marks=pytest.mark.skipif( @@ -490,7 +512,8 @@ ), ), pytest.param( - """ + *QueryTestCase( + query=""" SELECT dates.row_num AS row_num, date_col, @@ -516,61 +539,69 @@ AND `dates`.row_num = `timestamps`.row_num ORDER BY row_num ASC """, - pandas.DataFrame( - { - "row_num": pandas.Series([1, 2, 3], dtype="Int64"), - "date_col": pandas.Series( - [ - datetime.date(1, 1, 1), - datetime.date(9999, 12, 31), - datetime.date(2262, 4, 12), - ], - dtype="object", - ), - "datetime_col": pandas.Series( - [ - datetime.datetime(1, 1, 1, 0, 0, 0, 0), - datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), - # One microsecond more than pandas.Timestamp.max. - datetime.datetime(2262, 4, 11, 23, 47, 16, 854776), - ], - dtype="object", - ), - "timestamp_col": pandas.Series( - [ - datetime.datetime( - 1, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc - ), - datetime.datetime( - 9999, - 12, - 31, - 23, - 59, - 59, - 999999, - tzinfo=datetime.timezone.utc, - ), - # One microsecond more than pandas.Timestamp.max. - datetime.datetime( - 2262, - 4, - 11, - 23, - 47, - 16, - 854776, - tzinfo=datetime.timezone.utc, - ), - ], - dtype="object", - ), - } + expected=pandas.DataFrame( + { + "row_num": pandas.Series([1, 2, 3], dtype="Int64"), + "date_col": pandas.Series( + [ + datetime.date(1, 1, 1), + datetime.date(9999, 12, 31), + datetime.date(2262, 4, 12), + ], + dtype="object", + ), + "datetime_col": pandas.Series( + [ + datetime.datetime(1, 1, 1, 0, 0, 0, 0), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + # One microsecond more than pandas.Timestamp.max. + datetime.datetime(2262, 4, 11, 23, 47, 16, 854776), + ], + dtype="object", + ), + "timestamp_col": pandas.Series( + [ + datetime.datetime( + 1, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 9999, + 12, + 31, + 23, + 59, + 59, + 999999, + tzinfo=datetime.timezone.utc, + ), + # One microsecond more than pandas.Timestamp.max. 
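+                            # (pandas.Timestamp.max is
+                            # 2262-04-11 23:47:16.854775807, which is why this
+                            # value only fits in an object column, not
+                            # datetime64[ns].)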
+ datetime.datetime( + 2262, + 4, + 11, + 23, + 47, + 16, + 854776, + tzinfo=datetime.timezone.utc, + ), + ], + dtype="object", + ), + } + ), + use_bqstorage_apis={True, False} + if FEATURES.bigquery_has_accurate_timestamp + else {True}, ), id="issue365-extreme-datetimes", ), ], ) -def test_default_dtypes(read_gbq, query, use_bqstorage_api, expected): +def test_default_dtypes( + read_gbq, query, expected, use_bqstorage_apis, use_bqstorage_api +): + if use_bqstorage_api not in use_bqstorage_apis: + pytest.skip(f"use_bqstorage_api={use_bqstorage_api} not supported.") result = read_gbq(query, use_bqstorage_api=use_bqstorage_api) pandas.testing.assert_frame_equal(result, expected) diff --git a/tests/unit/test_features.py b/tests/unit/test_features.py index 3f7d3896..c810104f 100644 --- a/tests/unit/test_features.py +++ b/tests/unit/test_features.py @@ -13,6 +13,24 @@ def fresh_bigquery_version(monkeypatch): monkeypatch.setattr(FEATURES, "_pandas_installed_version", None) +@pytest.mark.parametrize( + ["bigquery_version", "expected"], + [ + ("1.11.1", False), + ("1.26.0", False), + ("2.5.4", False), + ("2.6.0", True), + ("2.6.1", True), + ("2.12.0", True), + ], +) +def test_bigquery_has_accurate_timestamp(monkeypatch, bigquery_version, expected): + import google.cloud.bigquery + + monkeypatch.setattr(google.cloud.bigquery, "__version__", bigquery_version) + assert FEATURES.bigquery_has_accurate_timestamp == expected + + @pytest.mark.parametrize( ["bigquery_version", "expected"], [ From 8cc45242929e96d7c4611011d4b0f188bc956d0f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 30 Dec 2021 16:41:03 -0600 Subject: [PATCH 30/35] ignore index on empty dataframe --- tests/system/test_read_gbq.py | 165 +++++++++++++++++----------------- 1 file changed, 82 insertions(+), 83 deletions(-) diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index 645ec4ed..3791f50d 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -378,89 +378,6 @@ pytest.param( *QueryTestCase( query=""" -SELECT - bools.row_num AS row_num, - bool_col, - bytes_col, - date_col, - datetime_col, - float_col, - int64_col, - numeric_col, - string_col, - time_col, - timestamp_col -FROM - UNNEST([ - STRUCT(1 AS row_num, TRUE AS bool_col) ]) AS `bools` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, CAST('F1AC' AS BYTES FORMAT 'HEX') AS bytes_col) ]) AS `bytes` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col) ]) AS `datetimes` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, -2.375 AS float_col) ]) AS `floats` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, 1234 AS int64_col) ]) AS `int64s` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col) ]) AS `numerics` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, 'abcdefghijklmnopqrstuvwxyz' AS string_col) ]) AS `strings` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col) ]) AS `times` -INNER JOIN - UNNEST([ - STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col) ]) AS `timestamps` -WHERE - `bools`.row_num = `dates`.row_num - AND `bools`.row_num = `bytes`.row_num - AND `bools`.row_num = `datetimes`.row_num - AND `bools`.row_num = `floats`.row_num - AND `bools`.row_num = `int64s`.row_num - AND `bools`.row_num = `numerics`.row_num - AND `bools`.row_num = `strings`.row_num - AND `bools`.row_num = `times`.row_num - AND 
`bools`.row_num = `timestamps`.row_num - AND `bools`.row_num = -1 -ORDER BY row_num ASC - """, - expected=pandas.DataFrame( - { - "row_num": pandas.Series([], dtype="Int64"), - "bool_col": pandas.Series( - [], - dtype="boolean" - if FEATURES.pandas_has_boolean_dtype - else "bool", - ), - "bytes_col": pandas.Series([], dtype="object"), - "date_col": pandas.Series([], dtype=db_dtypes.DateDtype(),), - "datetime_col": pandas.Series([], dtype="datetime64[ns]",), - "float_col": pandas.Series([], dtype="float64"), - "int64_col": pandas.Series([], dtype="Int64"), - "numeric_col": pandas.Series([], dtype="object"), - "string_col": pandas.Series([], dtype="object"), - "time_col": pandas.Series([], dtype=db_dtypes.TimeDtype(),), - "timestamp_col": pandas.Series( - [], dtype="datetime64[ns]", - ).dt.tz_localize(datetime.timezone.utc), - } - ), - ), - id="scalar-types-empty-pandas-dev-issue10273", - ), - pytest.param( - *QueryTestCase( - query=""" SELECT bignumerics.row_num AS row_num, bignumeric_col, @@ -605,3 +522,85 @@ def test_default_dtypes( pytest.skip(f"use_bqstorage_api={use_bqstorage_api} not supported.") result = read_gbq(query, use_bqstorage_api=use_bqstorage_api) pandas.testing.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize(["use_bqstorage_api"], [(True,), (False,)]) +def test_empty_dataframe(read_gbq, use_bqstorage_api): + # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 + query = """ +SELECT + bools.row_num AS row_num, + bool_col, + bytes_col, + date_col, + datetime_col, + float_col, + int64_col, + numeric_col, + string_col, + time_col, + timestamp_col +FROM + UNNEST([ + STRUCT(1 AS row_num, TRUE AS bool_col) ]) AS `bools` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('F1AC' AS BYTES FORMAT 'HEX') AS bytes_col) ]) AS `bytes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col) ]) AS `datetimes` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, -2.375 AS float_col) ]) AS `floats` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, 1234 AS int64_col) ]) AS `int64s` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col) ]) AS `numerics` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, 'abcdefghijklmnopqrstuvwxyz' AS string_col) ]) AS `strings` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col) ]) AS `times` +INNER JOIN + UNNEST([ + STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col) ]) AS `timestamps` +WHERE + `bools`.row_num = `dates`.row_num + AND `bools`.row_num = `bytes`.row_num + AND `bools`.row_num = `datetimes`.row_num + AND `bools`.row_num = `floats`.row_num + AND `bools`.row_num = `int64s`.row_num + AND `bools`.row_num = `numerics`.row_num + AND `bools`.row_num = `strings`.row_num + AND `bools`.row_num = `times`.row_num + AND `bools`.row_num = `timestamps`.row_num + AND `bools`.row_num = -1 +ORDER BY row_num ASC + """ + expected = pandas.DataFrame( + { + "row_num": pandas.Series([], dtype="Int64"), + "bool_col": pandas.Series( + [], dtype="boolean" if FEATURES.pandas_has_boolean_dtype else "bool", + ), + "bytes_col": pandas.Series([], dtype="object"), + "date_col": pandas.Series([], dtype=db_dtypes.DateDtype(),), + "datetime_col": pandas.Series([], dtype="datetime64[ns]",), + "float_col": pandas.Series([], dtype="float64"), + "int64_col": pandas.Series([], dtype="Int64"), + "numeric_col": pandas.Series([], 
dtype="object"), + "string_col": pandas.Series([], dtype="object"), + "time_col": pandas.Series([], dtype=db_dtypes.TimeDtype(),), + "timestamp_col": pandas.Series([], dtype="datetime64[ns]",).dt.tz_localize( + datetime.timezone.utc + ), + } + ) + result = read_gbq(query, use_bqstorage_api=use_bqstorage_api) + pandas.testing.assert_frame_equal(result, expected, check_index_type=False) From a0d6cad6b833ca4dff39f94b0c155b1341fa6f56 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 30 Dec 2021 16:52:41 -0600 Subject: [PATCH 31/35] add db-dtypes to runtime import checks --- pandas_gbq/gbq.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 8bb634d3..9974850e 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -51,6 +51,11 @@ def _test_google_api_imports(): except ImportError as ex: raise ImportError("pandas-gbq requires setuptools") from ex + try: + import db_dtypes # noqa + except ImportError as ex: + raise ImportError("pandas-gbq requires db-dtypes") from ex + try: import pydata_google_auth # noqa except ImportError as ex: @@ -614,7 +619,7 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html #missing-data-casting-rules-and-indexing """ - import db_dtypes # TODO: add to _test_..._imports ? + import db_dtypes # If you update this mapping, also update the table at # `docs/reading.rst`. From 82c5362d96f45ab3d414e5eecdd39dda40066358 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 4 Jan 2022 10:06:00 -0600 Subject: [PATCH 32/35] document dependencies --- setup.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 53a3624d..ccee7265 100644 --- a/setup.py +++ b/setup.py @@ -28,11 +28,17 @@ "pandas >=0.24.2", "pyarrow >=3.0.0, <7.0dev", "pydata-google-auth", + # Note: google-api-core and google-auth are also included via transitive + # dependency on google-cloud-bigquery, but this library also uses them + # directly. "google-api-core >=1.21.0", - "google-auth >=1.4.1", + "google-auth >=1.18.0", "google-auth-oauthlib >=0.0.1", - # 2.4.* has a bug where waiting for the query can hang indefinitely. - # https://github.com/pydata/pandas-gbq/issues/343 + # Require 1.27.* because it has a fix for out-of-bounds timestamps. See: + # https://github.com/googleapis/python-bigquery/pull/209 and + # https://github.com/googleapis/python-bigquery-pandas/issues/365 + # Exclude 2.4.* because it has a bug where waiting for the query can hang + # indefinitely. https://github.com/pydata/pandas-gbq/issues/343 "google-cloud-bigquery >=1.27.2,<4.0.0dev,!=2.4.*", "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev", ] From de4a06e9bc77830d5018312cfba626973d10388a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 4 Jan 2022 10:08:33 -0600 Subject: [PATCH 33/35] remove TODO, since done --- pandas_gbq/gbq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 9974850e..80b7bbd2 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -664,7 +664,7 @@ def _finalize_dtypes( 1970. See: https://github.com/googleapis/python-bigquery-pandas/issues/365 """ - import db_dtypes # TODO: add to _test_..._imports ? 
+ import db_dtypes import pandas.api.types # If you update this mapping, also update the table at From 9fc8c08445e604708fefa295ab422594332e7ef9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 4 Jan 2022 10:38:37 -0600 Subject: [PATCH 34/35] remove unnecessary special case for empty dataframe Fixes prerelease test run --- pandas_gbq/timestamp.py | 5 ----- tests/system/test_read_gbq.py | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas_gbq/timestamp.py b/pandas_gbq/timestamp.py index c6bb6d93..66374881 100644 --- a/pandas_gbq/timestamp.py +++ b/pandas_gbq/timestamp.py @@ -30,11 +30,6 @@ def localize_df(df, schema_fields): pandas.DataFrame DataFrame with localized TIMESTAMP columns. """ - if len(df.index) == 0: - # If there are no rows, there is nothing to do. - # Fix for https://github.com/pydata/pandas-gbq/issues/299 - return df - for field in schema_fields: column = str(field["name"]) if "mode" in field and field["mode"].upper() == "REPEATED": diff --git a/tests/system/test_read_gbq.py b/tests/system/test_read_gbq.py index 3791f50d..a13e830f 100644 --- a/tests/system/test_read_gbq.py +++ b/tests/system/test_read_gbq.py @@ -526,7 +526,8 @@ def test_default_dtypes( @pytest.mark.parametrize(["use_bqstorage_api"], [(True,), (False,)]) def test_empty_dataframe(read_gbq, use_bqstorage_api): - # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 + # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 and + # https://github.com/googleapis/python-bigquery-pandas/issues/299 query = """ SELECT bools.row_num AS row_num, From c5c0e85962da1ed438814699144010eac19264d8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 5 Jan 2022 15:03:42 -0600 Subject: [PATCH 35/35] remove redundant 'deprecated' from comment --- pandas_gbq/gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 80b7bbd2..feca5e2a 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -683,8 +683,8 @@ def _finalize_dtypes( name = str(field["name"]) dtype = dtype_map.get(field["type"].upper()) - # Avoid deprecated conversion to timezone-naive dtype deprecation by - # only casting object dtypes. + # Avoid deprecated conversion to timezone-naive dtype by only casting + # object dtypes. if dtype and pandas.api.types.is_object_dtype(df[name]): df[name] = df[name].astype(dtype, errors="ignore")
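
A note on the version gate these patches depend on: FEATURES.bigquery_has_accurate_timestamp
(added in PATCH 29) parses the installed google-cloud-bigquery version with
pkg_resources and compares it against the 2.6.0 threshold, and
test_default_dtypes consults each case's use_bqstorage_apis set to skip
client/API combinations that an older library cannot satisfy. A minimal
standalone sketch of that check follows; the free function and its `installed`
parameter are illustrative stand-ins for the FEATURES property, not pandas-gbq
API:

    import pkg_resources

    BIGQUERY_ACCURATE_TIMESTAMP_VERSION = "2.6.0"

    def bigquery_has_accurate_timestamp(installed: str) -> bool:
        # True once google-cloud-bigquery is new enough to return
        # out-of-bounds DATETIME/TIMESTAMP values accurately (>= 2.6.0).
        min_version = pkg_resources.parse_version(BIGQUERY_ACCURATE_TIMESTAMP_VERSION)
        return pkg_resources.parse_version(installed) >= min_version

    # Mirrors the expectations in tests/unit/test_features.py above.
    assert not bigquery_has_accurate_timestamp("2.5.4")
    assert bigquery_has_accurate_timestamp("2.6.0")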