Skip to content

Commit bf0e863

Browse files
authored
feat: accepts a table ID, which downloads the table without a query (#443)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-pandas/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [x] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) Fixes #266 🦕
1 parent c0f9da0 commit bf0e863

File tree

9 files changed

+245
-73
lines changed

9 files changed

+245
-73
lines changed

.coveragerc

+1-1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ omit =
2222
google/cloud/__init__.py
2323

2424
[report]
25 - fail_under = 88
25 + fail_under = 89
2626
show_missing = True
2727
exclude_lines =
2828
# Re-enable the standard pragma

noxfile.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -259,7 +259,7 @@ def cover(session):
259259
test runs (not system test runs), and then erases coverage data.
260260
"""
261261
session.install("coverage", "pytest-cov")
262 - session.run("coverage", "report", "--show-missing", "--fail-under=88")
262 + session.run("coverage", "report", "--show-missing", "--fail-under=89")
263263

264264
session.run("coverage", "erase")
265265

owlbot.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
templated_files = common.py_library(
3434
unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
3535
system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
36 - cov_level=88,
36 + cov_level=89,
3737
unit_test_extras=extras,
3838
system_test_extras=extras,
3939
intersphinx_dependencies={

pandas_gbq/gbq.py

+96-52
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,21 @@
33
# license that can be found in the LICENSE file.
44

55
import logging
6+
import re
67
import time
78
import warnings
89
from datetime import datetime
10+
import typing
11+
from typing import Any, Dict, Optional, Union
912

1013
import numpy as np
1114

15+
# Only import at module-level at type checking time to avoid circular
16+
# dependencies in the pandas package, which has an optional dependency on
17+
# pandas-gbq.
18+
if typing.TYPE_CHECKING: # pragma: NO COVER
19+
import pandas
20+
1221
# Required dependencies, but treat as optional so that _test_google_api_imports
1322
# can provide a better error message.
1423
try:
@@ -64,6 +73,10 @@ def _test_google_api_imports():
6473
raise ImportError("pandas-gbq requires google-cloud-bigquery") from ex
6574

6675

76+
def _is_query(query_or_table: str) -> bool:
77+
return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None
78+
79+
6780
class DatasetCreationError(ValueError):
6881
"""
6982
Raised when the create dataset method fails
@@ -374,6 +387,30 @@ def process_http_error(ex):
374387

375388
raise GenericGBQException("Reason: {0}".format(ex))
376389

390+
def download_table(
391+
self,
392+
table_id: str,
393+
max_results: Optional[int] = None,
394+
progress_bar_type: Optional[str] = None,
395+
dtypes: Optional[Dict[str, Union[str, Any]]] = None,
396+
) -> "pandas.DataFrame":
397+
self._start_timer()
398+
399+
try:
400+
table_ref = bigquery.TableReference.from_string(
401+
table_id, default_project=self.project_id
402+
)
403+
rows_iter = self.client.list_rows(table_ref, max_results=max_results)
404+
except self.http_error as ex:
405+
self.process_http_error(ex)
406+
407+
return self._download_results(
408+
rows_iter,
409+
max_results=max_results,
410+
progress_bar_type=progress_bar_type,
411+
user_dtypes=dtypes,
412+
)
413+
377414
def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
378415
from concurrent.futures import TimeoutError
379416
from google.auth.exceptions import RefreshError
@@ -390,15 +427,6 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
390427
if config is not None:
391428
job_config.update(config)
392429

393-
if "query" in config and "query" in config["query"]:
394-
if query is not None:
395-
raise ValueError(
396-
"Query statement can't be specified "
397-
"inside config while it is specified "
398-
"as parameter"
399-
)
400-
query = config["query"].pop("query")
401-
402430
self._start_timer()
403431

404432
try:
@@ -464,15 +492,25 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
464492
)
465493

466494
dtypes = kwargs.get("dtypes")
495+
496+
# Ensure destination is populated.
497+
try:
498+
query_reply.result()
499+
except self.http_error as ex:
500+
self.process_http_error(ex)
501+
502+
rows_iter = self.client.list_rows(
503+
query_reply.destination, max_results=max_results
504+
)
467505
return self._download_results(
468-
query_reply,
506+
rows_iter,
469507
max_results=max_results,
470508
progress_bar_type=progress_bar_type,
471509
user_dtypes=dtypes,
472510
)
473511

474512
def _download_results(
475-
self, query_job, max_results=None, progress_bar_type=None, user_dtypes=None,
513+
self, rows_iter, max_results=None, progress_bar_type=None, user_dtypes=None,
476514
):
477515
# No results are desired, so don't bother downloading anything.
478516
if max_results == 0:
@@ -504,11 +542,6 @@ def _download_results(
504542
to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client
505543

506544
try:
507-
query_job.result()
508-
# Get the table schema, so that we can list rows.
509-
destination = self.client.get_table(query_job.destination)
510-
rows_iter = self.client.list_rows(destination, max_results=max_results)
511-
512545
schema_fields = [field.to_api_repr() for field in rows_iter.schema]
513546
conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
514547
conversion_dtypes.update(user_dtypes)
@@ -644,7 +677,7 @@ def _cast_empty_df_dtypes(schema_fields, df):
644677

645678

646679
def read_gbq(
647-
query,
680+
query_or_table,
648681
project_id=None,
649682
index_col=None,
650683
col_order=None,
@@ -668,17 +701,18 @@ def read_gbq(
668701
669702
This method uses the Google Cloud client library to make requests to
670703
Google BigQuery, documented `here
671-
<https://google-cloud-python.readthedocs.io/en/latest/bigquery/usage.html>`__.
704+
<https://googleapis.dev/python/bigquery/latest/index.html>`__.
672705
673706
See the :ref:`How to authenticate with Google BigQuery <authentication>`
674707
guide for authentication instructions.
675708
676709
Parameters
677710
----------
678-
query : str
679-
SQL-Like Query to return data values.
711+
query_or_table : str
712+
SQL query to return data values. If the string is a table ID, fetch the
713+
rows directly from the table without running a query.
680714
project_id : str, optional
681-
Google BigQuery Account project ID. Optional when available from
715+
Google Cloud Platform project ID. Optional when available from
682716
the environment.
683717
index_col : str, optional
684718
Name of result column to use for index in results DataFrame.
@@ -688,14 +722,14 @@ def read_gbq(
688722
reauth : boolean, default False
689723
Force Google BigQuery to re-authenticate the user. This is useful
690724
if multiple accounts are used.
691-
auth_local_webserver : boolean, default False
692-
Use the `local webserver flow`_ instead of the `console flow`_
693-
when getting user credentials.
694-
695-
.. _local webserver flow:
696-
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
697-
.. _console flow:
698-
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
725+
auth_local_webserver : bool, default False
726+
Use the `local webserver flow
727+
<https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server>`_
728+
instead of the `console flow
729+
<https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console>`_
730+
when getting user credentials. Your code must run on the same machine
731+
as your web browser and your web browser can access your application
732+
via ``localhost:808X``.
699733
700734
.. versionadded:: 0.2.0
701735
dialect : str, default 'standard'
@@ -745,13 +779,6 @@ def read_gbq(
745779
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
746780
permission on the project you are billing queries to.
747781
748-
**Note:** Due to a `known issue in the ``google-cloud-bigquery``
749-
package
750-
<https://github.com/googleapis/google-cloud-python/pull/7633>`__
751-
(fixed in version 1.11.0), you must write your query results to a
752-
destination table. To do this with ``read_gbq``, supply a
753-
``configuration`` dictionary.
754-
755782
This feature requires the ``google-cloud-bigquery-storage`` and
756783
``pyarrow`` packages.
757784
@@ -823,6 +850,15 @@ def read_gbq(
823850
if dialect not in ("legacy", "standard"):
824851
raise ValueError("'{0}' is not valid for dialect".format(dialect))
825852

853+
if configuration and "query" in configuration and "query" in configuration["query"]:
854+
if query_or_table is not None:
855+
raise ValueError(
856+
"Query statement can't be specified "
857+
"inside config while it is specified "
858+
"as parameter"
859+
)
860+
query_or_table = configuration["query"].pop("query")
861+
826862
connector = GbqConnector(
827863
project_id,
828864
reauth=reauth,
@@ -834,13 +870,21 @@ def read_gbq(
834870
use_bqstorage_api=use_bqstorage_api,
835871
)
836872

837-
final_df = connector.run_query(
838-
query,
839-
configuration=configuration,
840-
max_results=max_results,
841-
progress_bar_type=progress_bar_type,
842-
dtypes=dtypes,
843-
)
873+
if _is_query(query_or_table):
874+
final_df = connector.run_query(
875+
query_or_table,
876+
configuration=configuration,
877+
max_results=max_results,
878+
progress_bar_type=progress_bar_type,
879+
dtypes=dtypes,
880+
)
881+
else:
882+
final_df = connector.download_table(
883+
query_or_table,
884+
max_results=max_results,
885+
progress_bar_type=progress_bar_type,
886+
dtypes=dtypes,
887+
)
844888

845889
# Reindex the DataFrame on the provided column
846890
if index_col is not None:
@@ -889,7 +933,7 @@ def to_gbq(
889933
890934
This method uses the Google Cloud client library to make requests to
891935
Google BigQuery, documented `here
892-
<https://google-cloud-python.readthedocs.io/en/latest/bigquery/usage.html>`__.
936+
<https://googleapis.dev/python/bigquery/latest/index.html>`__.
893937
894938
See the :ref:`How to authenticate with Google BigQuery <authentication>`
895939
guide for authentication instructions.
@@ -902,7 +946,7 @@ def to_gbq(
902946
Name of table to be written, in the form ``dataset.tablename`` or
903947
``project.dataset.tablename``.
904948
project_id : str, optional
905-
Google BigQuery Account project ID. Optional when available from
949+
Google Cloud Platform project ID. Optional when available from
906950
the environment.
907951
chunksize : int, optional
908952
Number of rows to be inserted in each chunk from the dataframe.
@@ -920,13 +964,13 @@ def to_gbq(
920964
``'append'``
921965
If table exists, insert data. Create if does not exist.
922966
auth_local_webserver : bool, default False
923-
Use the `local webserver flow`_ instead of the `console flow`_
924-
when getting user credentials.
925-
926-
.. _local webserver flow:
927-
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
928-
.. _console flow:
929-
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
967+
Use the `local webserver flow
968+
<https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server>`_
969+
instead of the `console flow
970+
<https://googleapis.dev/python/google-auth-oauthlib/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console>`_
971+
when getting user credentials. Your code must run on the same machine
972+
as your web browser and your web browser can access your application
973+
via ``localhost:808X``.
930974
931975
.. versionadded:: 0.2.0
932976
table_schema : list of dicts, optional

pandas_gbq/timestamp.py

+7-1
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,8 @@
77
Private module.
88
"""
99

10+
import pandas.api.types
11+
1012

1113
def localize_df(df, schema_fields):
1214
"""Localize any TIMESTAMP columns to tz-aware type.
@@ -38,7 +40,11 @@ def localize_df(df, schema_fields):
3840
if "mode" in field and field["mode"].upper() == "REPEATED":
3941
continue
4042

41-
if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
43+
if (
44+
field["type"].upper() == "TIMESTAMP"
45+
and pandas.api.types.is_datetime64_ns_dtype(df.dtypes[column])
46+
and df[column].dt.tz is None
47+
):
4248
df[column] = df[column].dt.tz_localize("UTC")
4349

4450
return df

tests/system/conftest.py

+19
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
# license that can be found in the LICENSE file.
44

55
import os
6+
import functools
67
import pathlib
78

89
from google.cloud import bigquery
@@ -56,6 +57,24 @@ def project(project_id):
5657
return project_id
5758

5859

60+
@pytest.fixture
61+
def to_gbq(credentials, project_id):
62+
import pandas_gbq
63+
64+
return functools.partial(
65+
pandas_gbq.to_gbq, project_id=project_id, credentials=credentials
66+
)
67+
68+
69+
@pytest.fixture
70+
def read_gbq(credentials, project_id):
71+
import pandas_gbq
72+
73+
return functools.partial(
74+
pandas_gbq.read_gbq, project_id=project_id, credentials=credentials
75+
)
76+
77+
5978
@pytest.fixture()
6079
def random_dataset_id(bigquery_client: bigquery.Client, project_id: str):
6180
dataset_id = prefixer.create_prefix()

0 commit comments

Comments (0)