From 601a1979f0e72a6f28913185c84f8625977dd5c0 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 16 May 2024 14:54:54 -0500
Subject: [PATCH 1/8] feat: `read_gbq` suggests using BigQuery DataFrames with large results

---
 pandas_gbq/constants.py  |  8 ++++++
 pandas_gbq/exceptions.py |  4 +++
 pandas_gbq/gbq.py        | 30 ++++++++++++++++++++++
 tests/unit/test_gbq.py   | 55 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 97 insertions(+)
 create mode 100644 pandas_gbq/constants.py

diff --git a/pandas_gbq/constants.py b/pandas_gbq/constants.py
new file mode 100644
index 00000000..cfe53864
--- /dev/null
+++ b/pandas_gbq/constants.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2024 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+BYTES_IN_KIB = 1024
+BYTES_IN_MIB = 1024 * BYTES_IN_KIB
+BYTES_IN_GIB = 1024 * BYTES_IN_MIB
+BYTES_TO_RECOMMEND_BIGFRAMES = BYTES_IN_GIB
diff --git a/pandas_gbq/exceptions.py b/pandas_gbq/exceptions.py
index 574b2dec..af58212e 100644
--- a/pandas_gbq/exceptions.py
+++ b/pandas_gbq/exceptions.py
@@ -28,6 +28,10 @@ class InvalidPrivateKeyFormat(ValueError):
     """
 
 
+class LargeResultsWarning(UserWarning):
+    """Raised when results are larger than recommended for a pandas DataFrame."""
+
+
 class PerformanceWarning(RuntimeWarning):
     """
     Raised when a performance-related feature is requested, but unsupported.
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 60bb8bda..55364f51 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -19,6 +19,8 @@
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     import pandas
 
+import pandas_gbq.constants
+import pandas_gbq.exceptions
 from pandas_gbq.exceptions import GenericGBQException, QueryTimeout
 from pandas_gbq.features import FEATURES
 import pandas_gbq.query
@@ -478,6 +480,34 @@ def _download_results(
         if max_results is not None:
             create_bqstorage_client = False
 
+        # If we're downloading a large table, BigQuery DataFrames might be a
+        # better fit. Not all code paths will populate rows_iter._table, but
+        # if it's not populated that means we are working with a small result
+        # set.
+        if (
+            (table := getattr(rows_iter, "_table", None)) is not None
+            and (num_bytes := table.num_bytes) is not None
+            and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
+        ):
+            num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
+            warnings.warn(
+                f"Recommendation: Your results are {num_gib:.1f} GiB. "
+                "Consider using BigQuery DataFrames "
+                "(https://cloud.google.com/bigquery/docs/dataframes-quickstart) "
+                "to process these results with pandas compatible APIs that "
+                "run in the BigQuery SQL query engine. This provides an "
+                "opportunity to save on costs and improve performance. "
+                "Please reach out to bigframes-feedback@google.com with any "
+                "questions or concerns. To disable this message, run "
+                "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
+                category=pandas_gbq.exceptions.LargeResultsWarning,
+                # user's code
+                # -> read_gbq
+                # -> run_query
+                # -> download_results
+                stacklevel=4,
+            )
+
         try:
             schema_fields = [field.to_api_repr() for field in rows_iter.schema]
             conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index 8ba81b6d..d93fed96 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -6,10 +6,13 @@
 
 import copy
 import datetime
+import re
 from unittest import mock
+import warnings
 
 import google.api_core.exceptions
 import google.cloud.bigquery
+import google.cloud.bigquery.table
 import numpy
 import packaging.version
 import pandas
@@ -17,6 +20,8 @@
 import pytest
 
 from pandas_gbq import gbq
+import pandas_gbq.constants
+import pandas_gbq.exceptions
 import pandas_gbq.features
 from pandas_gbq.features import FEATURES
 
@@ -147,6 +152,56 @@ def test__transform_read_gbq_configuration_makes_copy(original, expected):
     assert did_change == should_change
 
 
+def test_GbqConnector_download_results_warns_for_large_tables():
+    gbq._test_google_api_imports()
+    connector = _make_connector()
+    rows_iter = mock.create_autospec(
+        google.cloud.bigquery.table.RowIterator, instance=True
+    )
+    table = google.cloud.bigquery.Table.from_api_repr(
+        {
+            "tableReference": {
+                "projectId": "my-proj",
+                "datasetId": "my-dset",
+                "tableId": "my_tbl",
+            },
+            "numBytes": 2 * pandas_gbq.constants.BYTES_IN_GIB,
+        },
+    )
+    rows_iter._table = table
+
+    with pytest.warns(
+        pandas_gbq.exceptions.LargeResultsWarning,
+        match=re.escape("Your results are 2.0 GiB. Consider using BigQuery DataFrames"),
+    ):
+        connector._download_results(rows_iter)
+
+
+def test_GbqConnector_download_results_doesnt_warn_for_small_tables():
+    gbq._test_google_api_imports()
+    connector = _make_connector()
+    rows_iter = mock.create_autospec(
+        google.cloud.bigquery.table.RowIterator, instance=True
+    )
+    table = google.cloud.bigquery.Table.from_api_repr(
+        {
+            "tableReference": {
+                "projectId": "my-proj",
+                "datasetId": "my-dset",
+                "tableId": "my_tbl",
+            },
+            "numBytes": 999 * pandas_gbq.constants.BYTES_IN_MIB,
+        },
+    )
+    rows_iter._table = table
+
+    with warnings.catch_warnings():
+        warnings.simplefilter(
+            "error", category=pandas_gbq.exceptions.LargeResultsWarning
+        )
+        connector._download_results(rows_iter)
+
+
 def test_GbqConnector_get_client_w_new_bq(mock_bigquery_client):
     gbq._test_google_api_imports()
     pytest.importorskip("google.api_core.client_info")
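
The recommendation above is delivered through Python's standard warnings
machinery, so callers who prefer to stay on pandas-gbq for large results can
opt out exactly as the message says. A minimal sketch of the reader-side
behavior this patch introduces (the query, table, and project below are
placeholders, not part of the patch):

    import warnings

    import pandas_gbq
    import pandas_gbq.exceptions

    # Results over BYTES_TO_RECOMMEND_BIGFRAMES (1 GiB) now emit a
    # LargeResultsWarning; silence it with the filter the message suggests.
    warnings.simplefilter("ignore", category=pandas_gbq.exceptions.LargeResultsWarning)

    df = pandas_gbq.read_gbq(
        "SELECT * FROM `my-project.my_dataset.my_large_table`",  # placeholder table
        project_id="my-project",  # placeholder project
    )
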
From 0911d8814ce714d13d0400b59ba1da4df5ab8fbb Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 16 May 2024 15:07:40 -0500
Subject: [PATCH 2/8] update docs

---
 docs/index.rst    |  6 ++++++
 pandas_gbq/gbq.py | 21 ++++++++++++++------
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index 67496f26..046a0256 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -23,6 +23,12 @@ Note: The canonical version of this documentation can always be found on the
    `BigQuery sandbox <https://cloud.google.com/bigquery/docs/sandbox>`__ to
    try the service for free.
 
+   Also, consider using `BigQuery DataFrames
+   <https://cloud.google.com/bigquery/docs/dataframes-quickstart>`__ to
+   process large results with pandas compatible APIs that run in the BigQuery
+   SQL query engine. This provides an opportunity to save on costs and improve
+   performance.
+
 While BigQuery uses standard SQL syntax, it has some important differences
 from traditional databases both in functionality, API limitations (size and
 quantity of queries or uploads), and how Google charges for use of the
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 55364f51..14214493 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -693,18 +693,25 @@ def read_gbq(
     *,
     col_order=None,
 ):
-    r"""Load data from Google BigQuery using google-cloud-python
+    r"""Read data from Google BigQuery to a pandas DataFrame.
 
-    The main method a user calls to execute a Query in Google BigQuery
-    and read results into a pandas DataFrame.
-
-    This method uses the Google Cloud client library to make requests to
-    Google BigQuery, documented `here
-    `__.
+    Run a SQL query in BigQuery or read directly from a table, using the
+    `Python client library for BigQuery
+    <https://cloud.google.com/python/docs/reference/bigquery/latest>`__
+    and the `BigQuery Storage
+    <https://cloud.google.com/python/docs/reference/bigquerystorage/latest>`__
+    client to make API requests.
 
     See the :ref:`How to authenticate with Google BigQuery ` guide for
     authentication instructions.
 
+    .. note::
+        Consider using `BigQuery DataFrames
+        <https://cloud.google.com/bigquery/docs/dataframes-quickstart>`__ to
+        process large results with pandas compatible APIs that run in the
+        BigQuery SQL query engine. This provides an opportunity to save on
+        costs and improve performance.
+
     Parameters
     ----------
     query_or_table : str

From fbca5bb6ae18d12164038f9a5902d8d7234b938c Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 16 May 2024 15:13:31 -0500
Subject: [PATCH 3/8] guard against non-int bytes

---
 pandas_gbq/gbq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 14214493..aaecd2e1 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -486,7 +486,7 @@ def _download_results(
         # set.
         if (
             (table := getattr(rows_iter, "_table", None)) is not None
-            and (num_bytes := table.num_bytes) is not None
+            and isinstance((num_bytes := table.num_bytes), int)
             and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
         ):
             num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB

From a7e556cd2fda2716b6fe40417c5c5dcdd44a6dd4 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 16 May 2024 15:20:58 -0500
Subject: [PATCH 4/8] tweak message

---
 docs/index.rst    |  8 ++++----
 pandas_gbq/gbq.py | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index 046a0256..73673e0f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -24,10 +24,10 @@ Note: The canonical version of this documentation can always be found on the
    try the service for free.
 
    Also, consider using `BigQuery DataFrames
-   <https://cloud.google.com/bigquery/docs/dataframes-quickstart>`__ to
-   process large results with pandas compatible APIs that run in the BigQuery
-   SQL query engine. This provides an opportunity to save on costs and improve
-   performance.
+   <https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction>`__
+   to process large results with pandas compatible APIs with transparent SQL
+   pushdown to BigQuery engine. This provides an opportunity to save on costs
+   and improve performance.
 
 While BigQuery uses standard SQL syntax, it has some important differences
 from traditional databases both in functionality, API limitations (size and
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index aaecd2e1..d4ff31c9 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -492,11 +492,11 @@ def _download_results(
             num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
             warnings.warn(
                 f"Recommendation: Your results are {num_gib:.1f} GiB. "
-                "Consider using BigQuery DataFrames "
-                "(https://cloud.google.com/bigquery/docs/dataframes-quickstart) "
-                "to process these results with pandas compatible APIs that "
-                "run in the BigQuery SQL query engine. This provides an "
-                "opportunity to save on costs and improve performance. "
+                "Also, consider using BigQuery DataFrames "
+                "(https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction) "
+                "to process large results with pandas compatible APIs with transparent SQL "
+                "pushdown to BigQuery engine. This provides an opportunity to save on costs "
+                "and improve performance. "
                 "Please reach out to bigframes-feedback@google.com with any "
                 "questions or concerns. To disable this message, run "
                 "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",

From 8752fb7f5350e3de3c0519060419da3e5014edd1 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 16 May 2024 15:24:30 -0500
Subject: [PATCH 5/8] remove unnecessary also

---
 pandas_gbq/gbq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index d4ff31c9..c487e1ff 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -492,7 +492,7 @@ def _download_results(
             num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
             warnings.warn(
                 f"Recommendation: Your results are {num_gib:.1f} GiB. "
-                "Also, consider using BigQuery DataFrames "
+                "Consider using BigQuery DataFrames "
                 "(https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction) "
                 "to process large results with pandas compatible APIs with transparent SQL "
                 "pushdown to BigQuery engine. This provides an opportunity to save on costs "
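
After these wording tweaks the size check still hinges on the isinstance guard
from patch 3: Table.num_bytes is Optional[int], so a bare ">" comparison can
raise TypeError when the value is None, and test doubles may supply other
non-int placeholders. A small standalone illustration of the guarded pattern
(the values are invented for the example):

    BYTES_IN_GIB = 1024**3

    for num_bytes in (None, "unknown", 2 * BYTES_IN_GIB):
        # Only a real int reaches the comparison; None or a placeholder
        # string is skipped instead of raising TypeError.
        if isinstance(num_bytes, int) and num_bytes > BYTES_IN_GIB:
            print(f"would warn: {num_bytes / BYTES_IN_GIB:.1f} GiB")
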
From 6ba62d19df00375dfb21780bc902626787f118ef Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 16 May 2024 17:16:55 -0500
Subject: [PATCH 6/8] remove dead code

---
 noxfile.py                | 10 ++++++++
 pandas_gbq/features.py    | 10 --------
 pandas_gbq/gbq.py         | 54 ++++++++++++++++++----------------------
 tests/unit/test_gbq.py    | 48 ++++++++++++++++++---------------------
 tests/unit/test_to_gbq.py |  5 +----
 5 files changed, 58 insertions(+), 69 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index 2b973857..c7366f3d 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -375,6 +375,16 @@ def cover(session):
     session.install("coverage", "pytest-cov")
     session.run("coverage", "report", "--show-missing", "--fail-under=96")
 
+    # Make sure there is no dead code in our test directories.
+    session.run(
+        "coverage",
+        "report",
+        "--show-missing",
+        "--include=tests/unit/*",
+        "--include=tests/system/small/*",
+        "--fail-under=100",
+    )
+
     session.run("coverage", "erase")
diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py
index 45a43c55..b6ab25ac 100644
--- a/pandas_gbq/features.py
+++ b/pandas_gbq/features.py
@@ -9,7 +9,6 @@
 BIGQUERY_QUERY_AND_WAIT_VERSION = "3.14.0"
 PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
 PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
-PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"
 
 
 class Features:
@@ -82,14 +81,5 @@ def pandas_has_boolean_dtype(self):
         desired_version = packaging.version.parse(PANDAS_BOOLEAN_DTYPE_VERSION)
         return self.pandas_installed_version >= desired_version
 
-    @property
-    def pandas_has_parquet_with_lossless_timestamp(self):
-        import packaging.version
-
-        desired_version = packaging.version.parse(
-            PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION
-        )
-        return self.pandas_installed_version >= desired_version
-
 
 FEATURES = Features()
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index c487e1ff..a9dca3ce 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -484,29 +484,30 @@ def _download_results(
         # better fit. Not all code paths will populate rows_iter._table, but
         # if it's not populated that means we are working with a small result
         # set.
-        if (
-            (table := getattr(rows_iter, "_table", None)) is not None
-            and isinstance((num_bytes := table.num_bytes), int)
-            and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
-        ):
-            num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
-            warnings.warn(
-                f"Recommendation: Your results are {num_gib:.1f} GiB. "
-                "Consider using BigQuery DataFrames "
-                "(https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction) "
-                "to process large results with pandas compatible APIs with transparent SQL "
-                "pushdown to BigQuery engine. This provides an opportunity to save on costs "
-                "and improve performance. "
-                "Please reach out to bigframes-feedback@google.com with any "
-                "questions or concerns. To disable this message, run "
-                "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
-                category=pandas_gbq.exceptions.LargeResultsWarning,
-                # user's code
-                # -> read_gbq
-                # -> run_query
-                # -> download_results
-                stacklevel=4,
-            )
+        if (table_ref := getattr(rows_iter, "_table", None)) is not None:
+            table = self.client.get_table(table_ref)
+            if (
+                isinstance((num_bytes := table.num_bytes), int)
+                and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
+            ):
+                num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
+                warnings.warn(
+                    f"Recommendation: Your results are {num_gib:.1f} GiB. "
+                    "Consider using BigQuery DataFrames "
+                    "(https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction) "
+                    "to process large results with pandas compatible APIs with transparent SQL "
+                    "pushdown to BigQuery engine. This provides an opportunity to save on costs "
+                    "and improve performance. "
+                    "Please reach out to bigframes-feedback@google.com with any "
+                    "questions or concerns. To disable this message, run "
+                    "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
+                    category=pandas_gbq.exceptions.LargeResultsWarning,
+                    # user's code
+                    # -> read_gbq
+                    # -> run_query
+                    # -> download_results
+                    stacklevel=4,
+                )
 
         try:
             schema_fields = [field.to_api_repr() for field in rows_iter.schema]
             conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
@@ -1087,12 +1088,7 @@ def to_gbq(
     )
 
     if api_method == "default":
-        # Avoid using parquet if pandas doesn't support lossless conversions to
-        # parquet timestamp. See: https://stackoverflow.com/a/69758676/101923
-        if FEATURES.pandas_has_parquet_with_lossless_timestamp:
-            api_method = "load_parquet"
-        else:
-            api_method = "load_csv"
+        api_method = "load_parquet"
 
     if chunksize is not None:
         if api_method == "load_parquet":
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index d93fed96..cef916f2 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -152,7 +152,7 @@ def test__transform_read_gbq_configuration_makes_copy(original, expected):
     assert did_change == should_change
 
 
-def test_GbqConnector_download_results_warns_for_large_tables():
+def test_GbqConnector_download_results_warns_for_large_tables(default_bigquery_client):
     gbq._test_google_api_imports()
     connector = _make_connector()
     rows_iter = mock.create_autospec(
@@ -169,6 +169,8 @@ def test_GbqConnector_download_results_warns_for_large_tables():
         },
     )
     rows_iter._table = table
+    default_bigquery_client.get_table.reset_mock(side_effect=True)
+    default_bigquery_client.get_table.return_value = table
 
     with pytest.warns(
         pandas_gbq.exceptions.LargeResultsWarning,
@@ -177,7 +179,9 @@ def test_GbqConnector_download_results_warns_for_large_tables():
         connector._download_results(rows_iter)
 
 
-def test_GbqConnector_download_results_doesnt_warn_for_small_tables():
+def test_GbqConnector_download_results_doesnt_warn_for_small_tables(
+    default_bigquery_client,
+):
     gbq._test_google_api_imports()
     connector = _make_connector()
     rows_iter = mock.create_autospec(
@@ -194,6 +198,8 @@ def test_GbqConnector_download_results_doesnt_warn_for_small_tables():
         },
     )
     rows_iter._table = table
+    default_bigquery_client.get_table.reset_mock(side_effect=True)
+    default_bigquery_client.get_table.return_value = table
 
     with warnings.catch_warnings():
         warnings.simplefilter(
@@ -246,16 +252,13 @@ def test_to_gbq_with_chunksize_warns_deprecation(
     api_method, warning_message, warning_type
 ):
     with pytest.warns(warning_type, match=warning_message):
-        try:
-            gbq.to_gbq(
-                DataFrame([[1]]),
-                "dataset.tablename",
-                project_id="my-project",
-                api_method=api_method,
-                chunksize=100,
-            )
-        except gbq.TableCreationError:
-            pass
+        gbq.to_gbq(
+            DataFrame([[1]]),
+            "dataset.tablename",
+            project_id="my-project",
+            api_method=api_method,
+            chunksize=100,
+        )
 
 
 @pytest.mark.parametrize(["verbose"], [(True,), (False,)])
@@ -266,15 +269,12 @@ def test_to_gbq_with_verbose_new_pandas_warns_deprecation(monkeypatch, verbose):
         mock.PropertyMock(return_value=True),
     )
     with pytest.warns(FutureWarning, match="verbose is deprecated"):
-        try:
-            gbq.to_gbq(
-                DataFrame([[1]]),
-                "dataset.tablename",
-                project_id="my-project",
-                verbose=verbose,
-            )
-        except gbq.TableCreationError:
-            pass
+        gbq.to_gbq(
+            DataFrame([[1]]),
+            "dataset.tablename",
+            project_id="my-project",
+            verbose=verbose,
+        )
 
 
 def test_to_gbq_with_private_key_raises_notimplementederror():
@@ -288,11 +288,7 @@ def test_to_gbq_with_private_key_raises_notimplementederror():
 
 
 def test_to_gbq_doesnt_run_query(mock_bigquery_client):
-    try:
-        gbq.to_gbq(DataFrame([[1]]), "dataset.tablename", project_id="my-project")
-    except gbq.TableCreationError:
-        pass
-
+    gbq.to_gbq(DataFrame([[1]]), "dataset.tablename", project_id="my-project")
     mock_bigquery_client.query.assert_not_called()
diff --git a/tests/unit/test_to_gbq.py b/tests/unit/test_to_gbq.py
index 4456df0e..15176a1b 100644
--- a/tests/unit/test_to_gbq.py
+++ b/tests/unit/test_to_gbq.py
@@ -8,14 +8,11 @@
 import pytest
 
 from pandas_gbq import gbq
-from pandas_gbq.features import FEATURES
 
 
 @pytest.fixture
 def expected_load_method(mock_bigquery_client):
-    if FEATURES.pandas_has_parquet_with_lossless_timestamp:
-        return mock_bigquery_client.load_table_from_dataframe
-    return mock_bigquery_client.load_table_from_file
+    return mock_bigquery_client.load_table_from_dataframe
 
 
 def test_to_gbq_create_dataset_with_location(mock_bigquery_client):
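
The substantive change in patch 6 is that rows_iter._table may hold only a
reference to the destination table, without size metadata, so the connector
now fetches the full table before trusting num_bytes; that is also why the
unit tests above start stubbing get_table. A sketch of that round trip with
the google-cloud-bigquery client (project and table names are placeholders):

    from google.cloud import bigquery

    client = bigquery.Client(project="my-project")  # placeholder project
    # tables.get returns full metadata, including num_bytes; a bare
    # TableReference carries only project, dataset, and table IDs.
    table = client.get_table("my-project.my_dataset.my_large_table")  # placeholder
    if isinstance(table.num_bytes, int) and table.num_bytes > 1024**3:
        print(f"{table.num_bytes / 1024**3:.1f} GiB of results")
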
From 16d381e4bca22297835705f1dfce99df82d3b6be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Thu, 16 May 2024 17:28:04 -0500
Subject: [PATCH 7/8] remove directory that doesn't exist

---
 noxfile.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/noxfile.py b/noxfile.py
index c7366f3d..f7b290f4 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -381,7 +381,6 @@ def cover(session):
         "report",
         "--show-missing",
         "--include=tests/unit/*",
-        "--include=tests/system/small/*",
         "--fail-under=100",
     )
 

From 46da4ac4eaca17107bb32e8cfa123fb1b9b68c8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Mon, 20 May 2024 14:53:34 -0500
Subject: [PATCH 8/8] comment about GiB vs GB

---
 pandas_gbq/constants.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas_gbq/constants.py b/pandas_gbq/constants.py
index cfe53864..37266b3c 100644
--- a/pandas_gbq/constants.py
+++ b/pandas_gbq/constants.py
@@ -2,6 +2,10 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
+# BigQuery uses powers of 2 in calculating data sizes. See:
+# https://cloud.google.com/bigquery/pricing#data The documentation uses
+# GiB rather than GB to disambiguate from the alternative base 10 units.
+# https://en.wikipedia.org/wiki/Byte#Multiple-byte_units
 BYTES_IN_KIB = 1024
 BYTES_IN_MIB = 1024 * BYTES_IN_KIB
 BYTES_IN_GIB = 1024 * BYTES_IN_MIB
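
As a closing sanity check, the binary units documented in the final patch line
up with the thresholds the unit tests earlier in the series probe from both
sides (a worked example, not part of the patches):

    BYTES_IN_KIB = 1024
    BYTES_IN_MIB = 1024 * BYTES_IN_KIB  # 1_048_576
    BYTES_IN_GIB = 1024 * BYTES_IN_MIB  # 1_073_741_824, i.e. 2**30 (1 GB is 10**9)
    BYTES_TO_RECOMMEND_BIGFRAMES = BYTES_IN_GIB

    assert 999 * BYTES_IN_MIB < BYTES_TO_RECOMMEND_BIGFRAMES  # small table: no warning
    assert 2 * BYTES_IN_GIB > BYTES_TO_RECOMMEND_BIGFRAMES    # large table: warns "2.0 GiB"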