Skip to content

Commit ee30a1e

Browse files
authored
feat: to_gbq can write non-string values to existing STRING columns in BigQuery (#876)
1 parent a8d3352 commit ee30a1e

File tree

3 files changed

+148
-35
lines changed

3 files changed

+148
-35
lines changed

pandas_gbq/load/__init__.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
from pandas_gbq.load.core import (
6+
cast_dataframe_for_parquet,
7+
encode_chunk,
8+
load_chunks,
9+
load_csv_from_dataframe,
10+
load_csv_from_file,
11+
load_parquet,
12+
split_dataframe,
13+
)
14+
15+
__all__ = [
16+
"cast_dataframe_for_parquet",
17+
"encode_chunk",
18+
"load_chunks",
19+
"load_csv_from_dataframe",
20+
"load_csv_from_file",
21+
"load_parquet",
22+
"split_dataframe",
23+
]

pandas_gbq/load.py renamed to pandas_gbq/load/core.py

+5
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ def convert(x):
111111
return decimal.Decimal(x)
112112

113113
cast_column = dataframe[column_name].map(convert)
114+
elif column_type == "STRING":
115+
# Allow non-string columns to be uploaded to STRING in BigQuery.
116+
# https://github.com/googleapis/python-bigquery-pandas/issues/875
117+
# TODO: Use pyarrow as the storage when the minimum pandas version allows for it.
118+
cast_column = dataframe[column_name].astype(pandas.StringDtype())
114119
else:
115120
cast_column = None
116121

tests/system/test_to_gbq.py

+120-35
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,13 @@ def method_under_test(to_gbq):
2828

2929
SeriesRoundTripTestCase = collections.namedtuple(
3030
"SeriesRoundTripTestCase",
31-
["input_series", "api_methods"],
32-
defaults=[None, {"load_csv", "load_parquet"}],
31+
["input_series", "api_methods", "expected_dtype"],
32+
defaults=[None, {"load_csv", "load_parquet"}, None],
3333
)
3434

3535

3636
@pytest.mark.parametrize(
37-
["input_series", "api_methods"],
37+
["input_series", "api_methods", "expected_dtype"],
3838
[
3939
# Ensure that 64-bit floating point numbers are unchanged.
4040
# See: https://github.com/pydata/pandas-gbq/issues/326
@@ -53,40 +53,46 @@ def method_under_test(to_gbq):
5353
name="test_col",
5454
),
5555
),
56-
SeriesRoundTripTestCase(
57-
input_series=pandas.Series(
58-
[
59-
"abc",
60-
"defg",
61-
# Ensure that unicode characters are encoded. See:
62-
# https://github.com/googleapis/python-bigquery-pandas/issues/106
63-
"信用卡",
64-
"Skywalker™",
65-
"hülle",
66-
],
67-
name="test_col",
56+
pytest.param(
57+
*SeriesRoundTripTestCase(
58+
input_series=pandas.Series(
59+
[
60+
"abc",
61+
"defg",
62+
# Ensure that unicode characters are encoded. See:
63+
# https://github.com/googleapis/python-bigquery-pandas/issues/106
64+
"信用卡",
65+
"Skywalker™",
66+
"hülle",
67+
],
68+
name="test_col",
69+
),
6870
),
71+
id="string-unicode",
6972
),
70-
SeriesRoundTripTestCase(
71-
input_series=pandas.Series(
72-
[
73-
"abc",
74-
"defg",
75-
# Ensure that empty strings are written as empty string,
76-
# not NULL. See:
77-
# https://github.com/googleapis/python-bigquery-pandas/issues/366
78-
"",
79-
None,
80-
],
81-
name="empty_strings",
73+
pytest.param(
74+
*SeriesRoundTripTestCase(
75+
input_series=pandas.Series(
76+
[
77+
"abc",
78+
"defg",
79+
# Ensure that empty strings are written as empty string,
80+
# not NULL. See:
81+
# https://github.com/googleapis/python-bigquery-pandas/issues/366
82+
"",
83+
None,
84+
],
85+
name="empty_strings",
86+
),
87+
# BigQuery CSV loader uses empty string as the "null marker" by
88+
# default. Potentially one could choose a rarely used character or
89+
# string as the null marker to disambiguate null from empty string,
90+
# but then that string couldn't be loaded.
91+
# TODO: Revisit when custom load job configuration is supported.
92+
# https://github.com/googleapis/python-bigquery-pandas/issues/425
93+
api_methods={"load_parquet"},
8294
),
83-
# BigQuery CSV loader uses empty string as the "null marker" by
84-
# default. Potentially one could choose a rarely used character or
85-
# string as the null marker to disambiguate null from empty string,
86-
# but then that string couldn't be loaded.
87-
# TODO: Revisit when custom load job configuration is supported.
88-
# https://github.com/googleapis/python-bigquery-pandas/issues/425
89-
api_methods={"load_parquet"},
95+
id="string-empty-and-null",
9096
),
9197
],
9298
)
@@ -97,6 +103,7 @@ def test_series_round_trip(
97103
input_series,
98104
api_method,
99105
api_methods,
106+
expected_dtype,
100107
):
101108
if api_method not in api_methods:
102109
pytest.skip(f"{api_method} not supported.")
@@ -111,9 +118,14 @@ def test_series_round_trip(
111118

112119
round_trip = read_gbq(table_id)
113120
round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
121+
122+
expected_series = input_series.copy()
123+
if expected_dtype is not None:
124+
expected_series = expected_series.astype(expected_dtype)
125+
114126
pandas.testing.assert_series_equal(
115127
round_trip_series,
116-
input_series,
128+
expected_series,
117129
check_exact=True,
118130
check_names=False,
119131
)
@@ -362,6 +374,79 @@ def test_series_round_trip(
362374
),
363375
id="issue365-extreme-datetimes",
364376
),
377+
# Loading a STRING column should work with all available string dtypes.
378+
pytest.param(
379+
*DataFrameRoundTripTestCase(
380+
input_df=pandas.DataFrame(
381+
{
382+
"row_num": [1, 2, 3],
383+
# If a cast to STRING is lossless, pandas-gbq should do that automatically.
384+
# See: https://github.com/googleapis/python-bigquery-pandas/issues/875
385+
"int_want_string": [94043, 10011, 98033],
386+
"object": pandas.Series(["a", "b", "c"], dtype="object"),
387+
"string_python": pandas.Series(
388+
["d", "e", "f"],
389+
dtype=(
390+
pandas.StringDtype(storage="python")
391+
if hasattr(pandas, "ArrowDtype")
392+
else pandas.StringDtype()
393+
),
394+
),
395+
"string_pyarrow": pandas.Series(
396+
["g", "h", "i"],
397+
dtype=(
398+
pandas.StringDtype(storage="pyarrow")
399+
if hasattr(pandas, "ArrowDtype")
400+
else pandas.StringDtype()
401+
),
402+
),
403+
"arrowdtype_string": pandas.Series(
404+
["j", "k", "l"],
405+
dtype=(
406+
pandas.ArrowDtype(pyarrow.string())
407+
if hasattr(pandas, "ArrowDtype")
408+
else pandas.StringDtype()
409+
),
410+
),
411+
"arrowdtype_large_string": pandas.Series(
412+
["m", "n", "o"],
413+
dtype=(
414+
pandas.ArrowDtype(pyarrow.large_string())
415+
if hasattr(pandas, "ArrowDtype")
416+
and hasattr(pyarrow, "large_string")
417+
else pandas.StringDtype()
418+
),
419+
),
420+
},
421+
),
422+
expected_df=pandas.DataFrame(
423+
{
424+
"row_num": [1, 2, 3],
425+
"int_want_string": pandas.Series(
426+
["94043", "10011", "98033"], dtype="object"
427+
),
428+
"object": pandas.Series(["a", "b", "c"], dtype="object"),
429+
"string_python": pandas.Series(["d", "e", "f"], dtype="object"),
430+
"string_pyarrow": pandas.Series(["g", "h", "i"], dtype="object"),
431+
"arrowdtype_string": pandas.Series(["j", "k", "l"], dtype="object"),
432+
"arrowdtype_large_string": pandas.Series(
433+
["m", "n", "o"], dtype="object"
434+
),
435+
},
436+
),
437+
table_schema=[
438+
{"name": "row_num", "type": "INTEGER"},
439+
{"name": "int_want_string", "type": "STRING"},
440+
{"name": "object", "type": "STRING"},
441+
{"name": "string_python", "type": "STRING"},
442+
{"name": "string_pyarrow", "type": "STRING"},
443+
{"name": "string_pyarrow_from_int", "type": "STRING"},
444+
{"name": "arrowdtype_string", "type": "STRING"},
445+
{"name": "arrowdtype_large_string", "type": "STRING"},
446+
],
447+
),
448+
id="issue875-strings",
449+
),
365450
pytest.param(
366451
# Load STRUCT and ARRAY using either object column or ArrowDtype.
367452
# See: https://github.com/googleapis/python-bigquery-pandas/issues/452

0 commit comments

Comments
 (0)