Skip to content

Commit 84e646e

Browse files
authored
fix: avoid policy tags 403 error in load_table_from_dataframe (#557)
* WIP: fix: don't set policy tags in load job from dataframe * copy fields parameter for struct support * update tests to allow missing description property * fix load from dataframe test on python 3.6 Also, check that sent schema matches DataFrame order, not table order
1 parent 7447f05 commit 84e646e

File tree

6 files changed

+150
-149
lines changed

6 files changed

+150
-149
lines changed

google/cloud/bigquery/client.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2291,9 +2291,18 @@ def load_table_from_dataframe(
22912291
name
22922292
for name, _ in _pandas_helpers.list_columns_and_indexes(dataframe)
22932293
)
2294-
# schema fields not present in the dataframe are not needed
22952294
job_config.schema = [
2296-
field for field in table.schema if field.name in columns_and_indexes
2295+
# Field description and policy tags are not needed to
2296+
# serialize a data frame.
2297+
SchemaField(
2298+
field.name,
2299+
field.field_type,
2300+
mode=field.mode,
2301+
fields=field.fields,
2302+
)
2303+
# schema fields not present in the dataframe are not needed
2304+
for field in table.schema
2305+
if field.name in columns_and_indexes
22972306
]
22982307

22992308
job_config.schema = _pandas_helpers.dataframe_to_bq_schema(

google/cloud/bigquery/schema.py

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from google.cloud.bigquery_v2 import types
2020

2121

22+
_DEFAULT_VALUE = object()
2223
_STRUCT_TYPES = ("RECORD", "STRUCT")
2324

2425
# SQL types reference:
@@ -73,14 +74,18 @@ def __init__(
7374
name,
7475
field_type,
7576
mode="NULLABLE",
76-
description=None,
77+
description=_DEFAULT_VALUE,
7778
fields=(),
7879
policy_tags=None,
7980
):
80-
self._name = name
81-
self._field_type = field_type
82-
self._mode = mode
83-
self._description = description
81+
self._properties = {
82+
"name": name,
83+
"type": field_type,
84+
}
85+
if mode is not None:
86+
self._properties["mode"] = mode.upper()
87+
if description is not _DEFAULT_VALUE:
88+
self._properties["description"] = description
8489
self._fields = tuple(fields)
8590
self._policy_tags = policy_tags
8691

@@ -98,7 +103,7 @@ def from_api_repr(cls, api_repr):
98103
"""
99104
# Handle optional properties with default values
100105
mode = api_repr.get("mode", "NULLABLE")
101-
description = api_repr.get("description")
106+
description = api_repr.get("description", _DEFAULT_VALUE)
102107
fields = api_repr.get("fields", ())
103108

104109
return cls(
@@ -113,7 +118,7 @@ def from_api_repr(cls, api_repr):
113118
@property
114119
def name(self):
115120
"""str: The name of the field."""
116-
return self._name
121+
return self._properties["name"]
117122

118123
@property
119124
def field_type(self):
@@ -122,7 +127,7 @@ def field_type(self):
122127
See:
123128
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
124129
"""
125-
return self._field_type
130+
return self._properties["type"]
126131

127132
@property
128133
def mode(self):
@@ -131,17 +136,17 @@ def mode(self):
131136
See:
132137
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.mode
133138
"""
134-
return self._mode
139+
return self._properties.get("mode")
135140

136141
@property
137142
def is_nullable(self):
138143
"""bool: whether 'mode' is 'nullable'."""
139-
return self._mode == "NULLABLE"
144+
return self.mode == "NULLABLE"
140145

141146
@property
142147
def description(self):
143148
"""Optional[str]: description for the field."""
144-
return self._description
149+
return self._properties.get("description")
145150

146151
@property
147152
def fields(self):
@@ -164,13 +169,7 @@ def to_api_repr(self):
164169
Returns:
165170
Dict: A dictionary representing the SchemaField in a serialized form.
166171
"""
167-
# Put together the basic representation. See http://bit.ly/2hOAT5u.
168-
answer = {
169-
"mode": self.mode.upper(),
170-
"name": self.name,
171-
"type": self.field_type.upper(),
172-
"description": self.description,
173-
}
172+
answer = self._properties.copy()
174173

175174
# If this is a RECORD type, then sub-fields are also included,
176175
# add this to the serialized representation.
@@ -193,10 +192,10 @@ def _key(self):
193192
Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`.
194193
"""
195194
return (
196-
self._name,
197-
self._field_type.upper(),
198-
self._mode.upper(),
199-
self._description,
195+
self.name,
196+
self.field_type.upper(),
197+
self.mode.upper(),
198+
self.description,
200199
self._fields,
201200
self._policy_tags,
202201
)

tests/unit/job/test_load_config.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -434,13 +434,11 @@ def test_schema_setter_fields(self):
434434
"name": "full_name",
435435
"type": "STRING",
436436
"mode": "REQUIRED",
437-
"description": None,
438437
}
439438
age_repr = {
440439
"name": "age",
441440
"type": "INTEGER",
442441
"mode": "REQUIRED",
443-
"description": None,
444442
}
445443
self.assertEqual(
446444
config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]}
@@ -449,24 +447,18 @@ def test_schema_setter_fields(self):
449447
def test_schema_setter_valid_mappings_list(self):
450448
config = self._get_target_class()()
451449

452-
schema = [
453-
{"name": "full_name", "type": "STRING", "mode": "REQUIRED"},
454-
{"name": "age", "type": "INTEGER", "mode": "REQUIRED"},
455-
]
456-
config.schema = schema
457-
458450
full_name_repr = {
459451
"name": "full_name",
460452
"type": "STRING",
461453
"mode": "REQUIRED",
462-
"description": None,
463454
}
464455
age_repr = {
465456
"name": "age",
466457
"type": "INTEGER",
467458
"mode": "REQUIRED",
468-
"description": None,
469459
}
460+
schema = [full_name_repr, age_repr]
461+
config.schema = schema
470462
self.assertEqual(
471463
config._properties["load"]["schema"], {"fields": [full_name_repr, age_repr]}
472464
)

tests/unit/test_client.py

Lines changed: 74 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1596,18 +1596,8 @@ def test_create_table_w_schema_and_query(self):
15961596
{
15971597
"schema": {
15981598
"fields": [
1599-
{
1600-
"name": "full_name",
1601-
"type": "STRING",
1602-
"mode": "REQUIRED",
1603-
"description": None,
1604-
},
1605-
{
1606-
"name": "age",
1607-
"type": "INTEGER",
1608-
"mode": "REQUIRED",
1609-
"description": None,
1610-
},
1599+
{"name": "full_name", "type": "STRING", "mode": "REQUIRED"},
1600+
{"name": "age", "type": "INTEGER", "mode": "REQUIRED"},
16111601
]
16121602
},
16131603
"view": {"query": query},
@@ -1641,18 +1631,8 @@ def test_create_table_w_schema_and_query(self):
16411631
},
16421632
"schema": {
16431633
"fields": [
1644-
{
1645-
"name": "full_name",
1646-
"type": "STRING",
1647-
"mode": "REQUIRED",
1648-
"description": None,
1649-
},
1650-
{
1651-
"name": "age",
1652-
"type": "INTEGER",
1653-
"mode": "REQUIRED",
1654-
"description": None,
1655-
},
1634+
{"name": "full_name", "type": "STRING", "mode": "REQUIRED"},
1635+
{"name": "age", "type": "INTEGER", "mode": "REQUIRED"},
16561636
]
16571637
},
16581638
"view": {"query": query, "useLegacySql": False},
@@ -2602,7 +2582,7 @@ def test_update_table(self):
26022582
"name": "age",
26032583
"type": "INTEGER",
26042584
"mode": "REQUIRED",
2605-
"description": None,
2585+
"description": "New field description",
26062586
},
26072587
]
26082588
},
@@ -2613,8 +2593,10 @@ def test_update_table(self):
26132593
}
26142594
)
26152595
schema = [
2616-
SchemaField("full_name", "STRING", mode="REQUIRED"),
2617-
SchemaField("age", "INTEGER", mode="REQUIRED"),
2596+
SchemaField("full_name", "STRING", mode="REQUIRED", description=None),
2597+
SchemaField(
2598+
"age", "INTEGER", mode="REQUIRED", description="New field description"
2599+
),
26182600
]
26192601
creds = _make_credentials()
26202602
client = self._make_one(project=self.PROJECT, credentials=creds)
@@ -2647,7 +2629,7 @@ def test_update_table(self):
26472629
"name": "age",
26482630
"type": "INTEGER",
26492631
"mode": "REQUIRED",
2650-
"description": None,
2632+
"description": "New field description",
26512633
},
26522634
]
26532635
},
@@ -2773,13 +2755,24 @@ def test_update_table_w_query(self):
27732755
"name": "age",
27742756
"type": "INTEGER",
27752757
"mode": "REQUIRED",
2776-
"description": None,
2758+
"description": "this is a column",
27772759
},
2760+
{"name": "country", "type": "STRING", "mode": "NULLABLE"},
27782761
]
27792762
}
27802763
schema = [
2781-
SchemaField("full_name", "STRING", mode="REQUIRED"),
2782-
SchemaField("age", "INTEGER", mode="REQUIRED"),
2764+
SchemaField(
2765+
"full_name",
2766+
"STRING",
2767+
mode="REQUIRED",
2768+
# Explicitly unset the description.
2769+
description=None,
2770+
),
2771+
SchemaField(
2772+
"age", "INTEGER", mode="REQUIRED", description="this is a column"
2773+
),
2774+
# Omit the description to not make updates to it.
2775+
SchemaField("country", "STRING"),
27832776
]
27842777
resource = self._make_table_resource()
27852778
resource.update(
@@ -7658,18 +7651,47 @@ def test_load_table_from_file_w_invalid_job_config(self):
76587651
def test_load_table_from_dataframe(self):
76597652
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
76607653
from google.cloud.bigquery import job
7661-
from google.cloud.bigquery.schema import SchemaField
7654+
from google.cloud.bigquery.schema import PolicyTagList, SchemaField
76627655

76637656
client = self._make_client()
7664-
records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
7665-
dataframe = pandas.DataFrame(records)
7657+
records = [
7658+
{"id": 1, "age": 100, "accounts": [2, 3]},
7659+
{"id": 2, "age": 60, "accounts": [5]},
7660+
{"id": 3, "age": 40, "accounts": []},
7661+
]
7662+
# Mixup column order so that we can verify sent schema matches the
7663+
# serialized order, not the table column order.
7664+
column_order = ["age", "accounts", "id"]
7665+
dataframe = pandas.DataFrame(records, columns=column_order)
7666+
table_fields = {
7667+
"id": SchemaField(
7668+
"id",
7669+
"INTEGER",
7670+
mode="REQUIRED",
7671+
description="integer column",
7672+
policy_tags=PolicyTagList(names=("foo", "bar")),
7673+
),
7674+
"age": SchemaField(
7675+
"age",
7676+
"INTEGER",
7677+
mode="NULLABLE",
7678+
description="age column",
7679+
policy_tags=PolicyTagList(names=("baz",)),
7680+
),
7681+
"accounts": SchemaField(
7682+
"accounts", "INTEGER", mode="REPEATED", description="array column",
7683+
),
7684+
}
7685+
get_table_schema = [
7686+
table_fields["id"],
7687+
table_fields["age"],
7688+
table_fields["accounts"],
7689+
]
76667690

76677691
get_table_patch = mock.patch(
76687692
"google.cloud.bigquery.client.Client.get_table",
76697693
autospec=True,
7670-
return_value=mock.Mock(
7671-
schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")]
7672-
),
7694+
return_value=mock.Mock(schema=get_table_schema),
76737695
)
76747696
load_patch = mock.patch(
76757697
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
@@ -7695,8 +7717,21 @@ def test_load_table_from_dataframe(self):
76957717
sent_file = load_table_from_file.mock_calls[0][1][1]
76967718
assert sent_file.closed
76977719

7698-
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
7699-
assert sent_config.source_format == job.SourceFormat.PARQUET
7720+
sent_config = load_table_from_file.mock_calls[0][2]["job_config"].to_api_repr()[
7721+
"load"
7722+
]
7723+
assert sent_config["sourceFormat"] == job.SourceFormat.PARQUET
7724+
for field_index, field in enumerate(sent_config["schema"]["fields"]):
7725+
assert field["name"] == column_order[field_index]
7726+
table_field = table_fields[field["name"]]
7727+
assert field["name"] == table_field.name
7728+
assert field["type"] == table_field.field_type
7729+
assert field["mode"] == table_field.mode
7730+
assert len(field.get("fields", [])) == len(table_field.fields)
7731+
# Omit unnecessary fields when they come from getting the table
7732+
# (not passed in via job_config)
7733+
assert "description" not in field
7734+
assert "policyTags" not in field
77007735

77017736
@unittest.skipIf(pandas is None, "Requires `pandas`")
77027737
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")

tests/unit/test_external_config.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,7 @@ def test_to_api_repr_base(self):
7777
ec.schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")]
7878

7979
exp_schema = {
80-
"fields": [
81-
{
82-
"name": "full_name",
83-
"type": "STRING",
84-
"mode": "REQUIRED",
85-
"description": None,
86-
}
87-
]
80+
"fields": [{"name": "full_name", "type": "STRING", "mode": "REQUIRED"}]
8881
}
8982
got_resource = ec.to_api_repr()
9083
exp_resource = {

0 commit comments

Comments
 (0)