Skip to content

Commit 3d1af95

Browse files
Jim Fulton, parthea, tswast, and leahecole
authored
feat!: Use pandas custom data types for BigQuery DATE and TIME columns, remove date_as_object argument (#972)
* Use new pandas date and time dtypes * Get rid of date_as_object argument * added *unit* test for dealing with dates and timestamps that can't fit in datetime64[ns] * Implemented any, all, min, max and median * test (and fix) load from dataframe with date and time columns * Make sure insert_rows_from_dataframe works * Renamed date and time dtypes to bqdate and bqtime * make fallback date and time dtype names strings to make pytype happy * date and time arrays implement __arrow_array__ to facilitate arrow conversion * Make conversion of date columns from arrow pandas outout to pandas zero-copy when not date_as_object * Added date math support * Support date math with DateOffset scalars * always use types mapper for conversion from arrow to pandas * adjust unit tests to use arrow not avro * avoid "ValueError: need at least one array to concatenate" with empty RecordBatch * add missing db-dtypes requirement * avoid arrow_schema on older versions of bqstorage BREAKING CHANGE: remove `date_as_object` argument from `to_dataframe`. The `dbdate` dtype is used by default with an automatic fallback to `object` when dates are not within the range of a nanosecond-precision pandas timestamp Co-authored-by: Anthonios Partheniou <partheniou@google.com> Co-authored-by: Tim Swast <swast@google.com> Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
1 parent 42d3db6 commit 3d1af95

16 files changed

+396
-176
lines changed

docs/usage/pandas.rst

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,25 @@ The following data types are used when creating a pandas DataFrame.
5050
-
5151
* - DATETIME
5252
- datetime64[ns], object
53-
- object is used when there are values not representable in pandas
53+
- The object dtype is used when there are values not representable in a
54+
pandas nanosecond-precision timestamp.
55+
* - DATE
56+
- dbdate, object
57+
- The object dtype is used when there are values not representable in a
58+
pandas nanosecond-precision timestamp.
59+
60+
Requires the ``db-dtypes`` package. See the `db-dtypes usage guide
61+
<https://googleapis.dev/python/db-dtypes/latest/usage.html>`_
5462
* - FLOAT64
5563
- float64
5664
-
5765
* - INT64
5866
- Int64
5967
-
68+
* - TIME
69+
- dbtime
70+
- Requires the ``db-dtypes`` package. See the `db-dtypes usage guide
71+
<https://googleapis.dev/python/db-dtypes/latest/usage.html>`_
6072

6173
Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame
6274
------------------------------------------------------------

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 39 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,21 @@
1818
import functools
1919
import logging
2020
import queue
21-
from typing import Dict, Sequence
2221
import warnings
2322

2423
try:
2524
import pandas # type: ignore
2625
except ImportError: # pragma: NO COVER
2726
pandas = None
27+
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype
2828
else:
2929
import numpy
3030

31+
from db_dtypes import DateDtype, TimeDtype # type: ignore
32+
33+
date_dtype_name = DateDtype.name
34+
time_dtype_name = TimeDtype.name
35+
3136
import pyarrow # type: ignore
3237
import pyarrow.parquet # type: ignore
3338

@@ -77,15 +82,6 @@ def _to_wkb(v):
7782

7883
_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads
7984

80-
# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
81-
_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
82-
"BOOL": "boolean",
83-
"BOOLEAN": "boolean",
84-
"FLOAT": "float64",
85-
"FLOAT64": "float64",
86-
"INT64": "Int64",
87-
"INTEGER": "Int64",
88-
}
8985
_PANDAS_DTYPE_TO_BQ = {
9086
"bool": "BOOLEAN",
9187
"datetime64[ns, UTC]": "TIMESTAMP",
@@ -102,6 +98,8 @@ def _to_wkb(v):
10298
"uint16": "INTEGER",
10399
"uint32": "INTEGER",
104100
"geometry": "GEOGRAPHY",
101+
date_dtype_name: "DATE",
102+
time_dtype_name: "TIME",
105103
}
106104

107105

@@ -267,26 +265,40 @@ def bq_to_arrow_schema(bq_schema):
267265
return pyarrow.schema(arrow_fields)
268266

269267

270-
def bq_schema_to_nullsafe_pandas_dtypes(
271-
bq_schema: Sequence[schema.SchemaField],
272-
) -> Dict[str, str]:
273-
"""Return the default dtypes to use for columns in a BigQuery schema.
268+
def default_types_mapper(date_as_object: bool = False):
269+
"""Create a mapping from pyarrow types to pandas types.
274270
275-
Only returns default dtypes which are safe to have NULL values. This
276-
includes Int64, which has pandas.NA values and does not result in
277-
loss-of-precision.
271+
This overrides the pandas defaults to use null-safe extension types where
272+
available.
278273
279-
Returns:
280-
A mapping from column names to pandas dtypes.
274+
See: https://arrow.apache.org/docs/python/api/datatypes.html for a list of
275+
data types. See:
276+
tests/unit/test__pandas_helpers.py::test_bq_to_arrow_data_type for
277+
BigQuery to Arrow type mapping.
278+
279+
Note to google-cloud-bigquery developers: If you update the default dtypes,
280+
also update the docs at docs/usage/pandas.rst.
281281
"""
282-
dtypes = {}
283-
for bq_field in bq_schema:
284-
if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
285-
continue
286-
field_type = bq_field.field_type.upper()
287-
if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
288-
dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
289-
return dtypes
282+
283+
def types_mapper(arrow_data_type):
284+
if pyarrow.types.is_boolean(arrow_data_type):
285+
return pandas.BooleanDtype()
286+
287+
elif (
288+
# If date_as_object is True, we know some DATE columns are
289+
# out-of-bounds of what is supported by pandas.
290+
not date_as_object
291+
and pyarrow.types.is_date(arrow_data_type)
292+
):
293+
return DateDtype()
294+
295+
elif pyarrow.types.is_integer(arrow_data_type):
296+
return pandas.Int64Dtype()
297+
298+
elif pyarrow.types.is_time(arrow_data_type):
299+
return TimeDtype()
300+
301+
return types_mapper
290302

291303

292304
def bq_to_arrow_array(series, bq_field):

google/cloud/bigquery/job/query.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,7 +1556,6 @@ def to_dataframe(
15561556
dtypes: Dict[str, Any] = None,
15571557
progress_bar_type: str = None,
15581558
create_bqstorage_client: bool = True,
1559-
date_as_object: bool = True,
15601559
max_results: Optional[int] = None,
15611560
geography_as_object: bool = False,
15621561
) -> "pandas.DataFrame":
@@ -1599,12 +1598,6 @@ def to_dataframe(
15991598
16001599
.. versionadded:: 1.24.0
16011600
1602-
date_as_object (Optional[bool]):
1603-
If ``True`` (default), cast dates to objects. If ``False``, convert
1604-
to datetime64[ns] dtype.
1605-
1606-
.. versionadded:: 1.26.0
1607-
16081601
max_results (Optional[int]):
16091602
Maximum number of rows to include in the result. No limit by default.
16101603
@@ -1638,7 +1631,6 @@ def to_dataframe(
16381631
dtypes=dtypes,
16391632
progress_bar_type=progress_bar_type,
16401633
create_bqstorage_client=create_bqstorage_client,
1641-
date_as_object=date_as_object,
16421634
geography_as_object=geography_as_object,
16431635
)
16441636

@@ -1651,7 +1643,6 @@ def to_geodataframe(
16511643
dtypes: Dict[str, Any] = None,
16521644
progress_bar_type: str = None,
16531645
create_bqstorage_client: bool = True,
1654-
date_as_object: bool = True,
16551646
max_results: Optional[int] = None,
16561647
geography_column: Optional[str] = None,
16571648
) -> "geopandas.GeoDataFrame":
@@ -1694,12 +1685,6 @@ def to_geodataframe(
16941685
16951686
.. versionadded:: 1.24.0
16961687
1697-
date_as_object (Optional[bool]):
1698-
If ``True`` (default), cast dates to objects. If ``False``, convert
1699-
to datetime64[ns] dtype.
1700-
1701-
.. versionadded:: 1.26.0
1702-
17031688
max_results (Optional[int]):
17041689
Maximum number of rows to include in the result. No limit by default.
17051690
@@ -1732,7 +1717,6 @@ def to_geodataframe(
17321717
dtypes=dtypes,
17331718
progress_bar_type=progress_bar_type,
17341719
create_bqstorage_client=create_bqstorage_client,
1735-
date_as_object=date_as_object,
17361720
geography_column=geography_column,
17371721
)
17381722

google/cloud/bigquery/table.py

Lines changed: 43 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
import pandas # type: ignore
2929
except ImportError: # pragma: NO COVER
3030
pandas = None
31+
else:
32+
import db_dtypes # type: ignore # noqa
3133

3234
import pyarrow # type: ignore
3335

@@ -1815,7 +1817,6 @@ def to_dataframe(
18151817
dtypes: Dict[str, Any] = None,
18161818
progress_bar_type: str = None,
18171819
create_bqstorage_client: bool = True,
1818-
date_as_object: bool = True,
18191820
geography_as_object: bool = False,
18201821
) -> "pandas.DataFrame":
18211822
"""Create a pandas DataFrame by loading all pages of a query.
@@ -1865,12 +1866,6 @@ def to_dataframe(
18651866
18661867
.. versionadded:: 1.24.0
18671868
1868-
date_as_object (Optional[bool]):
1869-
If ``True`` (default), cast dates to objects. If ``False``, convert
1870-
to datetime64[ns] dtype.
1871-
1872-
.. versionadded:: 1.26.0
1873-
18741869
geography_as_object (Optional[bool]):
18751870
If ``True``, convert GEOGRAPHY data to :mod:`shapely`
18761871
geometry objects. If ``False`` (default), don't cast
@@ -1912,40 +1907,44 @@ def to_dataframe(
19121907
bqstorage_client=bqstorage_client,
19131908
create_bqstorage_client=create_bqstorage_client,
19141909
)
1915-
default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(
1916-
self.schema
1917-
)
19181910

1919-
# Let the user-defined dtypes override the default ones.
1920-
# https://stackoverflow.com/a/26853961/101923
1921-
dtypes = {**default_dtypes, **dtypes}
1922-
1923-
# When converting timestamp values to nanosecond precision, the result
1911+
# When converting date or timestamp values to nanosecond precision, the result
19241912
# can be out of pyarrow bounds. To avoid the error when converting to
1925-
# Pandas, we set the timestamp_as_object parameter to True, if necessary.
1926-
types_to_check = {
1927-
pyarrow.timestamp("us"),
1928-
pyarrow.timestamp("us", tz=datetime.timezone.utc),
1929-
}
1930-
1931-
for column in record_batch:
1932-
if column.type in types_to_check:
1933-
try:
1934-
column.cast("timestamp[ns]")
1935-
except pyarrow.lib.ArrowInvalid:
1936-
timestamp_as_object = True
1937-
break
1938-
else:
1939-
timestamp_as_object = False
1940-
1941-
extra_kwargs = {"timestamp_as_object": timestamp_as_object}
1913+
# Pandas, we set the date_as_object or timestamp_as_object parameter to True,
1914+
# if necessary.
1915+
date_as_object = not all(
1916+
self.__can_cast_timestamp_ns(col)
1917+
for col in record_batch
1918+
# Type can be date32 or date64 (plus units).
1919+
# See: https://arrow.apache.org/docs/python/api/datatypes.html
1920+
if str(col.type).startswith("date")
1921+
)
19421922

1943-
df = record_batch.to_pandas(
1944-
date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs
1923+
timestamp_as_object = not all(
1924+
self.__can_cast_timestamp_ns(col)
1925+
for col in record_batch
1926+
# Type can be timestamp (plus units and time zone).
1927+
# See: https://arrow.apache.org/docs/python/api/datatypes.html
1928+
if str(col.type).startswith("timestamp")
19451929
)
19461930

1931+
if len(record_batch) > 0:
1932+
df = record_batch.to_pandas(
1933+
date_as_object=date_as_object,
1934+
timestamp_as_object=timestamp_as_object,
1935+
integer_object_nulls=True,
1936+
types_mapper=_pandas_helpers.default_types_mapper(
1937+
date_as_object=date_as_object
1938+
),
1939+
)
1940+
else:
1941+
# Avoid "ValueError: need at least one array to concatenate" on
1942+
# older versions of pandas when converting empty RecordBatch to
1943+
# DataFrame. See: https://github.com/pandas-dev/pandas/issues/41241
1944+
df = pandas.DataFrame([], columns=record_batch.schema.names)
1945+
19471946
for column in dtypes:
1948-
df[column] = pandas.Series(df[column], dtype=dtypes[column])
1947+
df[column] = pandas.Series(df[column], dtype=dtypes[column], copy=False)
19491948

19501949
if geography_as_object:
19511950
for field in self.schema:
@@ -1954,6 +1953,15 @@ def to_dataframe(
19541953

19551954
return df
19561955

1956+
@staticmethod
1957+
def __can_cast_timestamp_ns(column):
1958+
try:
1959+
column.cast("timestamp[ns]")
1960+
except pyarrow.lib.ArrowInvalid:
1961+
return False
1962+
else:
1963+
return True
1964+
19571965
# If changing the signature of this method, make sure to apply the same
19581966
# changes to job.QueryJob.to_geodataframe()
19591967
def to_geodataframe(
@@ -1962,7 +1970,6 @@ def to_geodataframe(
19621970
dtypes: Dict[str, Any] = None,
19631971
progress_bar_type: str = None,
19641972
create_bqstorage_client: bool = True,
1965-
date_as_object: bool = True,
19661973
geography_column: Optional[str] = None,
19671974
) -> "geopandas.GeoDataFrame":
19681975
"""Create a GeoPandas GeoDataFrame by loading all pages of a query.
@@ -2010,10 +2017,6 @@ def to_geodataframe(
20102017
20112018
This argument does nothing if ``bqstorage_client`` is supplied.
20122019
2013-
date_as_object (Optional[bool]):
2014-
If ``True`` (default), cast dates to objects. If ``False``, convert
2015-
to datetime64[ns] dtype.
2016-
20172020
geography_column (Optional[str]):
20182021
If there are more than one GEOGRAPHY column,
20192022
identifies which one to use to construct a geopandas
@@ -2069,7 +2072,6 @@ def to_geodataframe(
20692072
dtypes,
20702073
progress_bar_type,
20712074
create_bqstorage_client,
2072-
date_as_object,
20732075
geography_as_object=True,
20742076
)
20752077

@@ -2126,7 +2128,6 @@ def to_dataframe(
21262128
dtypes=None,
21272129
progress_bar_type=None,
21282130
create_bqstorage_client=True,
2129-
date_as_object=True,
21302131
geography_as_object=False,
21312132
) -> "pandas.DataFrame":
21322133
"""Create an empty dataframe.
@@ -2136,7 +2137,6 @@ def to_dataframe(
21362137
dtypes (Any): Ignored. Added for compatibility with RowIterator.
21372138
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
21382139
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2139-
date_as_object (bool): Ignored. Added for compatibility with RowIterator.
21402140
21412141
Returns:
21422142
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
@@ -2151,7 +2151,6 @@ def to_geodataframe(
21512151
dtypes=None,
21522152
progress_bar_type=None,
21532153
create_bqstorage_client=True,
2154-
date_as_object=True,
21552154
geography_column: Optional[str] = None,
21562155
) -> "pandas.DataFrame":
21572156
"""Create an empty dataframe.
@@ -2161,7 +2160,6 @@ def to_geodataframe(
21612160
dtypes (Any): Ignored. Added for compatibility with RowIterator.
21622161
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
21632162
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2164-
date_as_object (bool): Ignored. Added for compatibility with RowIterator.
21652163
21662164
Returns:
21672165
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.

samples/geography/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ click==8.0.1
77
click-plugins==1.1.1
88
cligj==0.7.2
99
dataclasses==0.6; python_version < '3.7'
10+
db-dtypes==0.3.0
1011
Fiona==1.8.20
1112
geojson==2.5.0
1213
geopandas==0.9.0

samples/magics/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
db-dtypes==0.3.0
12
google-cloud-bigquery-storage==2.9.0
23
google-auth-oauthlib==0.4.6
34
grpcio==1.41.0

samples/snippets/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
db-dtypes==0.3.0
12
google-cloud-bigquery-storage==2.9.0
23
google-auth-oauthlib==0.4.6
34
grpcio==1.41.0

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
# Keep the no-op bqstorage extra for backward compatibility.
5151
# See: https://github.com/googleapis/python-bigquery/issues/757
5252
"bqstorage": [],
53-
"pandas": ["pandas>=1.0.0"],
53+
"pandas": ["pandas>=1.0.0", "db-dtypes>=0.3.0,<2.0.0dev"],
5454
"geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"],
5555
"tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
5656
"opentelemetry": [

0 commit comments

Comments (0)