@@ -28,13 +28,13 @@ def method_under_test(to_gbq):
28
28
29
29
SeriesRoundTripTestCase = collections .namedtuple (
30
30
"SeriesRoundTripTestCase" ,
31
- ["input_series" , "api_methods" ],
32
- defaults = [None , {"load_csv" , "load_parquet" }],
31
+ ["input_series" , "api_methods" , "expected_dtype" ],
32
+ defaults = [None , {"load_csv" , "load_parquet" }, None ],
33
33
)
34
34
35
35
36
36
@pytest .mark .parametrize (
37
- ["input_series" , "api_methods" ],
37
+ ["input_series" , "api_methods" , "expected_dtype" ],
38
38
[
39
39
# Ensure that 64-bit floating point numbers are unchanged.
40
40
# See: https://github.com/pydata/pandas-gbq/issues/326
@@ -53,40 +53,46 @@ def method_under_test(to_gbq):
53
53
name = "test_col" ,
54
54
),
55
55
),
56
- SeriesRoundTripTestCase (
57
- input_series = pandas .Series (
58
- [
59
- "abc" ,
60
- "defg" ,
61
- # Ensure that unicode characters are encoded. See:
62
- # https://github.com/googleapis/python-bigquery-pandas/issues/106
63
- "信用卡" ,
64
- "Skywalker™" ,
65
- "hülle" ,
66
- ],
67
- name = "test_col" ,
56
+ pytest .param (
57
+ * SeriesRoundTripTestCase (
58
+ input_series = pandas .Series (
59
+ [
60
+ "abc" ,
61
+ "defg" ,
62
+ # Ensure that unicode characters are encoded. See:
63
+ # https://github.com/googleapis/python-bigquery-pandas/issues/106
64
+ "信用卡" ,
65
+ "Skywalker™" ,
66
+ "hülle" ,
67
+ ],
68
+ name = "test_col" ,
69
+ ),
68
70
),
71
+ id = "string-unicode" ,
69
72
),
70
- SeriesRoundTripTestCase (
71
- input_series = pandas .Series (
72
- [
73
- "abc" ,
74
- "defg" ,
75
- # Ensure that empty strings are written as empty string,
76
- # not NULL. See:
77
- # https://github.com/googleapis/python-bigquery-pandas/issues/366
78
- "" ,
79
- None ,
80
- ],
81
- name = "empty_strings" ,
73
+ pytest .param (
74
+ * SeriesRoundTripTestCase (
75
+ input_series = pandas .Series (
76
+ [
77
+ "abc" ,
78
+ "defg" ,
79
+ # Ensure that empty strings are written as empty string,
80
+ # not NULL. See:
81
+ # https://github.com/googleapis/python-bigquery-pandas/issues/366
82
+ "" ,
83
+ None ,
84
+ ],
85
+ name = "empty_strings" ,
86
+ ),
87
+ # BigQuery CSV loader uses empty string as the "null marker" by
88
+ # default. Potentially one could choose a rarely used character or
89
+ # string as the null marker to disambiguate null from empty string,
90
+ # but then that string couldn't be loaded.
91
+ # TODO: Revisit when custom load job configuration is supported.
92
+ # https://github.com/googleapis/python-bigquery-pandas/issues/425
93
+ api_methods = {"load_parquet" },
82
94
),
83
- # BigQuery CSV loader uses empty string as the "null marker" by
84
- # default. Potentially one could choose a rarely used character or
85
- # string as the null marker to disambiguate null from empty string,
86
- # but then that string couldn't be loaded.
87
- # TODO: Revist when custom load job configuration is supported.
88
- # https://github.com/googleapis/python-bigquery-pandas/issues/425
89
- api_methods = {"load_parquet" },
95
+ id = "string-empty-and-null" ,
90
96
),
91
97
],
92
98
)
@@ -97,6 +103,7 @@ def test_series_round_trip(
97
103
input_series ,
98
104
api_method ,
99
105
api_methods ,
106
+ expected_dtype ,
100
107
):
101
108
if api_method not in api_methods :
102
109
pytest .skip (f"{ api_method } not supported." )
@@ -111,9 +118,14 @@ def test_series_round_trip(
111
118
112
119
round_trip = read_gbq (table_id )
113
120
round_trip_series = round_trip ["test_col" ].sort_values ().reset_index (drop = True )
121
+
122
+ expected_series = input_series .copy ()
123
+ if expected_dtype is not None :
124
+ expected_series = expected_series .astype (expected_dtype )
125
+
114
126
pandas .testing .assert_series_equal (
115
127
round_trip_series ,
116
- input_series ,
128
+ expected_series ,
117
129
check_exact = True ,
118
130
check_names = False ,
119
131
)
@@ -362,6 +374,79 @@ def test_series_round_trip(
362
374
),
363
375
id = "issue365-extreme-datetimes" ,
364
376
),
377
+ # Loading a STRING column should work with all available string dtypes.
378
+ pytest .param (
379
+ * DataFrameRoundTripTestCase (
380
+ input_df = pandas .DataFrame (
381
+ {
382
+ "row_num" : [1 , 2 , 3 ],
383
+ # If a cast to STRING is lossless, pandas-gbq should do that automatically.
384
+ # See: https://github.com/googleapis/python-bigquery-pandas/issues/875
385
+ "int_want_string" : [94043 , 10011 , 98033 ],
386
+ "object" : pandas .Series (["a" , "b" , "c" ], dtype = "object" ),
387
+ "string_python" : pandas .Series (
388
+ ["d" , "e" , "f" ],
389
+ dtype = (
390
+ pandas .StringDtype (storage = "python" )
391
+ if hasattr (pandas , "ArrowDtype" )
392
+ else pandas .StringDtype ()
393
+ ),
394
+ ),
395
+ "string_pyarrow" : pandas .Series (
396
+ ["g" , "h" , "i" ],
397
+ dtype = (
398
+ pandas .StringDtype (storage = "pyarrow" )
399
+ if hasattr (pandas , "ArrowDtype" )
400
+ else pandas .StringDtype ()
401
+ ),
402
+ ),
403
+ "arrowdtype_string" : pandas .Series (
404
+ ["j" , "k" , "l" ],
405
+ dtype = (
406
+ pandas .ArrowDtype (pyarrow .string ())
407
+ if hasattr (pandas , "ArrowDtype" )
408
+ else pandas .StringDtype ()
409
+ ),
410
+ ),
411
+ "arrowdtype_large_string" : pandas .Series (
412
+ ["m" , "n" , "o" ],
413
+ dtype = (
414
+ pandas .ArrowDtype (pyarrow .large_string ())
415
+ if hasattr (pandas , "ArrowDtype" )
416
+ and hasattr (pyarrow , "large_string" )
417
+ else pandas .StringDtype ()
418
+ ),
419
+ ),
420
+ },
421
+ ),
422
+ expected_df = pandas .DataFrame (
423
+ {
424
+ "row_num" : [1 , 2 , 3 ],
425
+ "int_want_string" : pandas .Series (
426
+ ["94043" , "10011" , "98033" ], dtype = "object"
427
+ ),
428
+ "object" : pandas .Series (["a" , "b" , "c" ], dtype = "object" ),
429
+ "string_python" : pandas .Series (["d" , "e" , "f" ], dtype = "object" ),
430
+ "string_pyarrow" : pandas .Series (["g" , "h" , "i" ], dtype = "object" ),
431
+ "arrowdtype_string" : pandas .Series (["j" , "k" , "l" ], dtype = "object" ),
432
+ "arrowdtype_large_string" : pandas .Series (
433
+ ["m" , "n" , "o" ], dtype = "object"
434
+ ),
435
+ },
436
+ ),
437
+ table_schema = [
438
+ {"name" : "row_num" , "type" : "INTEGER" },
439
+ {"name" : "int_want_string" , "type" : "STRING" },
440
+ {"name" : "object" , "type" : "STRING" },
441
+ {"name" : "string_python" , "type" : "STRING" },
442
+ {"name" : "string_pyarrow" , "type" : "STRING" },
443
+ {"name" : "string_pyarrow_from_int" , "type" : "STRING" },
444
+ {"name" : "arrowdtype_string" , "type" : "STRING" },
445
+ {"name" : "arrowdtype_large_string" , "type" : "STRING" },
446
+ ],
447
+ ),
448
+ id = "issue875-strings" ,
449
+ ),
365
450
pytest .param (
366
451
# Load STRUCT and ARRAY using either object column or ArrowDtype.
367
452
# See: https://github.com/googleapis/python-bigquery-pandas/issues/452
0 commit comments