Commit 2180836

fix: to_gbq allows strings for DATE and floats for NUMERIC with api_method="load_parquet" (#423)
deps: require pandas 0.24+ and db-dtypes for TIME/DATE extension dtypes (#423)
1 parent 3e70975 commit 2180836

12 files changed: +279 -80 lines
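In user terms, the fix means that with api_method="load_parquet", to_gbq no longer chokes on DATE columns supplied as strings or NUMERIC columns supplied as floats. A minimal sketch of the now-working call (dataset, table, and project IDs below are placeholders, not from this commit):

    # Sketch only: destination table and project are placeholders. With
    # api_method="load_parquet", string DATE values and float NUMERIC values
    # are now cast before the parquet upload instead of raising.
    import pandas
    import pandas_gbq

    df = pandas.DataFrame(
        {"release_date": ["2021-11-01", "2021-11-15"], "price": [1.25, 2.5]}
    )
    pandas_gbq.to_gbq(
        df,
        "my_dataset.my_table",  # placeholder destination
        project_id="my-project",  # placeholder project
        api_method="load_parquet",
        table_schema=[
            {"name": "release_date", "type": "DATE"},
            {"name": "price", "type": "NUMERIC"},
        ],
    )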

.circleci/config.yml (+1 -1)

@@ -10,7 +10,7 @@ jobs:
       - image: continuumio/miniconda3
         environment:
           PYTHON: "3.7"
-          PANDAS: "0.23.2"
+          PANDAS: "0.24.2"
     steps:
       - checkout
       - run: ci/config_auth.sh

.coveragerc (+1 -1)

@@ -22,7 +22,7 @@ omit =
     google/cloud/__init__.py

 [report]
-fail_under = 86
+fail_under = 88
 show_missing = True
 exclude_lines =
     # Re-enable the standard pragma

ci/requirements-3.7-0.23.2.conda renamed to ci/requirements-3.7-0.24.2.conda (+1 -0)

@@ -1,5 +1,6 @@
 codecov
 coverage
+db-dtypes==0.3.0
 fastavro
 flake8
 numpy==1.16.6

ci/requirements-3.9-NIGHTLY.conda (+1 -0)

@@ -1,3 +1,4 @@
+db-dtypes
 pydata-google-auth
 google-cloud-bigquery
 google-cloud-bigquery-storage

noxfile.py (+2 -6)

@@ -146,11 +146,7 @@ def system(session):
     # Install all test dependencies, then install this package into the
     # virtualenv's dist-packages.
     session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path)
-    if session.python == "3.9":
-        extras = "[tqdm,db-dtypes]"
-    else:
-        extras = "[tqdm]"
-    session.install("-e", f".{extras}", "-c", constraints_path)
+    session.install("-e", ".[tqdm]", "-c", constraints_path)

     # Run py.test against the system tests.
     if system_test_exists:
@@ -179,7 +175,7 @@ def cover(session):
     test runs (not system test runs), and then erases coverage data.
     """
     session.install("coverage", "pytest-cov")
-    session.run("coverage", "report", "--show-missing", "--fail-under=86")
+    session.run("coverage", "report", "--show-missing", "--fail-under=88")

     session.run("coverage", "erase")

owlbot.py (+1 -5)

@@ -29,16 +29,12 @@
 # ----------------------------------------------------------------------------

 extras = ["tqdm"]
-extras_by_python = {
-    "3.9": ["tqdm", "db-dtypes"],
-}
 templated_files = common.py_library(
     unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
     system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
-    cov_level=86,
+    cov_level=88,
     unit_test_extras=extras,
     system_test_extras=extras,
-    system_test_extras_by_python=extras_by_python,
     intersphinx_dependencies={
         "pandas": "https://pandas.pydata.org/pandas-docs/stable/",
         "pydata-google-auth": "https://pydata-google-auth.readthedocs.io/en/latest/",

pandas_gbq/load.py (+52 -0)

@@ -4,9 +4,11 @@

 """Helper methods for loading data into BigQuery"""

+import decimal
 import io
 from typing import Any, Callable, Dict, List, Optional

+import db_dtypes
 import pandas
 import pyarrow.lib
 from google.cloud import bigquery
@@ -56,6 +58,55 @@ def split_dataframe(dataframe, chunksize=None):
         yield remaining_rows, chunk


+def cast_dataframe_for_parquet(
+    dataframe: pandas.DataFrame, schema: Optional[Dict[str, Any]],
+) -> pandas.DataFrame:
+    """Cast columns to needed dtype when writing parquet files.
+
+    See: https://github.com/googleapis/python-bigquery-pandas/issues/421
+    """
+
+    columns = schema.get("fields", [])
+
+    # Protect against an explicit None in the dictionary.
+    columns = columns if columns is not None else []
+
+    for column in columns:
+        # Schema can be a superset of the columns in the dataframe, so ignore
+        # columns that aren't present.
+        column_name = column.get("name")
+        if column_name not in dataframe.columns:
+            continue
+
+        # Skip array columns for now. Potentially casting the elements of the
+        # array would be possible, but not worth the effort until there is
+        # demand for it.
+        if column.get("mode", "NULLABLE").upper() == "REPEATED":
+            continue
+
+        column_type = column.get("type", "").upper()
+        if (
+            column_type == "DATE"
+            # Use extension dtype first so that it uses the correct equality operator.
+            and db_dtypes.DateDtype() != dataframe[column_name].dtype
+        ):
+            # Construct converted column manually, because I can't use
+            # .astype() with DateDtype. With .astype(), I get the error:
+            #
+            # TypeError: Cannot interpret '<db_dtypes.DateDtype ...>' as a data type
+            cast_column = pandas.Series(
+                dataframe[column_name], dtype=db_dtypes.DateDtype()
+            )
+        elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}:
+            cast_column = dataframe[column_name].map(decimal.Decimal)
+        else:
+            cast_column = None
+
+        if cast_column is not None:
+            dataframe = dataframe.assign(**{column_name: cast_column})
+    return dataframe
+
+
 def load_parquet(
     client: bigquery.Client,
     dataframe: pandas.DataFrame,
@@ -70,6 +121,7 @@ def load_parquet(
     if schema is not None:
         schema = pandas_gbq.schema.remove_policy_tags(schema)
         job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
+        dataframe = cast_dataframe_for_parquet(dataframe, schema)

     try:
         client.load_table_from_dataframe(
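For reference, a small sketch of what the new helper does in isolation. The column names and schema dict below are invented for illustration, and cast_dataframe_for_parquet is an internal helper, imported here only to show the casting behavior:

    # Illustration only: exercises the internal helper added above with
    # made-up column names and a made-up schema dict.
    import pandas

    from pandas_gbq.load import cast_dataframe_for_parquet

    df = pandas.DataFrame({"my_date": ["2021-04-17"], "my_numeric": [1.25]})
    schema = {
        "fields": [
            {"name": "my_date", "type": "DATE"},
            {"name": "my_numeric", "type": "NUMERIC"},
        ]
    }
    cast = cast_dataframe_for_parquet(df, schema)
    print(cast["my_date"].dtype)  # dbdate extension dtype from db-dtypes
    print(type(cast["my_numeric"].iloc[0]))  # <class 'decimal.Decimal'>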

setup.py (+2 -2)

@@ -23,8 +23,9 @@
 release_status = "Development Status :: 4 - Beta"
 dependencies = [
     "setuptools",
+    "db-dtypes >=0.3.0,<2.0.0",
     "numpy>=1.16.6",
-    "pandas>=0.23.2",
+    "pandas>=0.24.2",
     "pyarrow >=3.0.0, <7.0dev",
     "pydata-google-auth",
     "google-auth",
@@ -35,7 +36,6 @@
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",
-    "db-dtypes": "db-dtypes >=0.3.0,<2.0.0",
 }

 # Setup boilerplate below this line.

testing/constraints-3.7.txt (+2 -1)

@@ -5,12 +5,13 @@
 #
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
+db-dtypes==0.3.0
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
 google-cloud-bigquery==1.11.1
 google-cloud-bigquery-storage==1.1.0
 numpy==1.16.6
-pandas==0.23.2
+pandas==0.24.2
 pyarrow==3.0.0
 pydata-google-auth==0.1.2
 tqdm==4.23.0

tests/system/test_gbq.py (+3 -18)

@@ -26,8 +26,6 @@

 TABLE_ID = "new_test"
 PANDAS_VERSION = pkg_resources.parse_version(pandas.__version__)
-NULLABLE_INT_PANDAS_VERSION = pkg_resources.parse_version("0.24.0")
-NULLABLE_INT_MESSAGE = "Require pandas 0.24+ in order to use nullable integer type."


 def test_imports():
@@ -173,9 +171,6 @@ def test_should_properly_handle_valid_integers(self, project_id):
         tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}))

     def test_should_properly_handle_nullable_integers(self, project_id):
-        if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION:
-            pytest.skip(msg=NULLABLE_INT_MESSAGE)
-
         query = """SELECT * FROM
                     UNNEST([1, NULL]) AS nullable_integer
                 """
@@ -188,9 +183,7 @@ def test_should_properly_handle_nullable_integers(self, project_id):
         )
         tm.assert_frame_equal(
             df,
-            DataFrame(
-                {"nullable_integer": pandas.Series([1, pandas.NA], dtype="Int64")}
-            ),
+            DataFrame({"nullable_integer": pandas.Series([1, None], dtype="Int64")}),
         )

     def test_should_properly_handle_valid_longs(self, project_id):
@@ -204,9 +197,6 @@ def test_should_properly_handle_valid_longs(self, project_id):
         tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}))

     def test_should_properly_handle_nullable_longs(self, project_id):
-        if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION:
-            pytest.skip(msg=NULLABLE_INT_MESSAGE)
-
         query = """SELECT * FROM
                     UNNEST([1 << 62, NULL]) AS nullable_long
                 """
@@ -219,15 +209,10 @@ def test_should_properly_handle_nullable_longs(self, project_id):
         )
         tm.assert_frame_equal(
             df,
-            DataFrame(
-                {"nullable_long": pandas.Series([1 << 62, pandas.NA], dtype="Int64")}
-            ),
+            DataFrame({"nullable_long": pandas.Series([1 << 62, None], dtype="Int64")}),
         )

     def test_should_properly_handle_null_integers(self, project_id):
-        if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION:
-            pytest.skip(msg=NULLABLE_INT_MESSAGE)
-
         query = "SELECT CAST(NULL AS INT64) AS null_integer"
         df = gbq.read_gbq(
             query,
@@ -237,7 +222,7 @@ def test_should_properly_handle_null_integers(self, project_id):
             dtypes={"null_integer": "Int64"},
         )
         tm.assert_frame_equal(
-            df, DataFrame({"null_integer": pandas.Series([pandas.NA], dtype="Int64")}),
+            df, DataFrame({"null_integer": pandas.Series([None], dtype="Int64")}),
        )

     def test_should_properly_handle_valid_floats(self, project_id):
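One note on the test changes above: pandas.NA only exists in pandas 1.0+, while this commit's new floor is 0.24.2. With the nullable "Int64" dtype, None is coerced to the same missing value, so the simplified expected frames compare equal on newer pandas while still importing cleanly on 0.24. A quick check:

    import pandas

    s = pandas.Series([1, None], dtype="Int64")
    print(s.isna().tolist())  # [False, True] -- None became the Int64 missing marker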
