
Commit 62ec85b

BUG: Fix uploading of dataframes containing int64 and float64 columns (#117)
* BUG: Fix uploading of dataframes containing int64 and float64 columns. Fixes #116 and #96 by loading data in CSV chunks.
* ENH: Allow chunksize=None to disable chunking in to_gbq(); also fixes lint errors.
* TST: Update the minimum google-cloud-bigquery version to 0.29.0 in CI.
* BUG: Pass schema to the load job for to_gbq.
* Generate a schema if needed for table creation.
* Restore _generate_bq_schema, as it is used in tests.
* Add fixes to the changelog.
1 parent f040c18 commit 62ec85b
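
For context, the headline fix means a dataframe that mixes integer and float columns now uploads cleanly, because rows are serialized with to_csv instead of per-row JSON. A minimal sketch of the new call (the dataset, table, and project names below are placeholders, not part of this commit):

import pandas
from pandas_gbq import to_gbq

# A dataframe mixing int64 and float64 columns -- the case from #116
# that previously failed to upload.
df = pandas.DataFrame({'ints': [1, 2, 3], 'floats': [0.5, 1.5, 2.5]},
                      columns=['ints', 'floats'])

# chunksize now defaults to None, which encodes and loads the whole
# frame as a single CSV chunk; pass an int to upload in pieces.
to_gbq(df, 'my_dataset.my_table', 'my-project-id', if_exists='append')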

File tree

8 files changed (+256 -65 lines)


ci/requirements-3.5-0.18.1.pip

+1-1
@@ -1,4 +1,4 @@
 google-auth==1.0.0
 google-auth-oauthlib==0.0.1
 mock
-google-cloud-bigquery==0.28.0
+google-cloud-bigquery==0.29.0

docs/source/changelog.rst

+2-1
@@ -7,7 +7,8 @@ Changelog
 
 - Fix an issue where Unicode couldn't be uploaded in Python 2 (:issue:`93`)
 - Add support for a passed schema in :func:`to_gbq` instead of inferring the schema from the passed ``DataFrame`` with ``DataFrame.dtypes`` (:issue:`46`)
-
+- Fix an issue where a dataframe containing both integer and floating point columns could not be uploaded with ``to_gbq`` (:issue:`116`)
+- ``to_gbq`` now uses ``to_csv`` to avoid manually looping over rows in a dataframe (should result in faster table uploads) (:issue:`96`)
 
 0.3.0 / 2018-01-03
 ------------------
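
The :issue:`46` entry above refers to the table_schema argument of to_gbq. A hedged sketch of passing an explicit schema instead of relying on dtype inference (all names below are placeholders):

import pandas
from pandas_gbq import to_gbq

df = pandas.DataFrame({'name': [u'a'], 'score': [1.5]},
                      columns=['name', 'score'])

# table_schema is a list of field dicts in BigQuery's schema format;
# it is used when creating the destination table.
to_gbq(df, 'my_dataset.my_table', 'my-project-id',
       table_schema=[{'name': 'name', 'type': 'STRING'},
                     {'name': 'score', 'type': 'FLOAT'}])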

pandas_gbq/_load.py

+74
@@ -0,0 +1,74 @@
"""Helper methods for loading data into BigQuery"""

from google.cloud import bigquery
import six

from pandas_gbq import _schema


def encode_chunk(dataframe):
    """Return a file-like object of CSV-encoded rows.

    Args:
        dataframe (pandas.DataFrame): A chunk of a dataframe to encode
    """
    csv_buffer = six.StringIO()
    dataframe.to_csv(
        csv_buffer, index=False, header=False, encoding='utf-8',
        date_format='%Y-%m-%d %H:%M')

    # Convert to a BytesIO buffer so that unicode text is properly handled.
    # See: https://github.com/pydata/pandas-gbq/issues/106
    body = csv_buffer.getvalue()
    if isinstance(body, bytes):
        body = body.decode('utf-8')
    body = body.encode('utf-8')
    return six.BytesIO(body)


def encode_chunks(dataframe, chunksize=None):
    dataframe = dataframe.reset_index(drop=True)
    if chunksize is None:
        yield 0, encode_chunk(dataframe)
        return

    remaining_rows = len(dataframe)
    total_rows = remaining_rows
    start_index = 0
    while start_index < total_rows:
        end_index = start_index + chunksize
        chunk_buffer = encode_chunk(dataframe[start_index:end_index])
        start_index += chunksize
        remaining_rows = max(0, remaining_rows - chunksize)
        yield remaining_rows, chunk_buffer


def load_chunks(
        client, dataframe, dataset_id, table_id, chunksize=None, schema=None):
    destination_table = client.dataset(dataset_id).table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_APPEND'
    job_config.source_format = 'CSV'

    if schema is None:
        schema = _schema.generate_bq_schema(dataframe)

    # Manually create the schema objects, adding NULLABLE mode
    # as a workaround for
    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
    for field in schema['fields']:
        if 'mode' not in field:
            field['mode'] = 'NULLABLE'

    job_config.schema = [
        bigquery.SchemaField.from_api_repr(field)
        for field in schema['fields']
    ]

    chunks = encode_chunks(dataframe, chunksize=chunksize)
    for remaining_rows, chunk_buffer in chunks:
        yield remaining_rows
        client.load_table_from_file(
            chunk_buffer,
            destination_table,
            job_config=job_config).result()
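
A quick local sketch of how the two encoding helpers above behave; this runs without BigQuery credentials, and the dataframe is made up for illustration:

import pandas
from pandas_gbq._load import encode_chunk, encode_chunks

df = pandas.DataFrame({'a': [1, 2, 3, 4], 'b': [0.1, 0.2, 0.3, 0.4]},
                      columns=['a', 'b'])

# encode_chunk returns a BytesIO of UTF-8 CSV rows (no header, no index).
print(encode_chunk(df).read().decode('utf-8'))

# encode_chunks yields (remaining_rows, chunk_buffer) pairs, which
# load_chunks uses to report upload progress.
for remaining, chunk in encode_chunks(df, chunksize=3):
    print(remaining, len(chunk.readlines()))
# prints "1 3" then "0 1": one full chunk of 3 rows, then the last row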

pandas_gbq/_schema.py

+29
@@ -0,0 +1,29 @@
"""Helper methods for BigQuery schemas"""


def generate_bq_schema(dataframe, default_type='STRING'):
    """Given a passed dataframe, generate the associated Google BigQuery
    schema.

    Arguments:
        dataframe (pandas.DataFrame): Dataframe for which to generate the
            schema.
        default_type : string
            The default BigQuery type in case the type of the column
            does not exist in the schema.
    """

    type_mapping = {
        'i': 'INTEGER',
        'b': 'BOOLEAN',
        'f': 'FLOAT',
        'O': 'STRING',
        'S': 'STRING',
        'U': 'STRING',
        'M': 'TIMESTAMP'
    }

    fields = []
    for column_name, dtype in dataframe.dtypes.iteritems():
        fields.append({'name': column_name,
                       'type': type_mapping.get(dtype.kind, default_type)})

    return {'fields': fields}
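
A quick illustration of the dtype-kind mapping above; this runs locally with no credentials, and the column names are arbitrary:

import pandas
from pandas_gbq import _schema

df = pandas.DataFrame(
    {'qty': [1, 2], 'price': [1.5, 2.5], 'label': [u'a', u'b']},
    columns=['qty', 'price', 'label'])

print(_schema.generate_bq_schema(df))
# {'fields': [{'name': 'qty', 'type': 'INTEGER'},
#             {'name': 'price', 'type': 'FLOAT'},
#             {'name': 'label', 'type': 'STRING'}]}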

pandas_gbq/gbq.py

+32-63
@@ -556,45 +556,22 @@ def run_query(self, query, **kwargs):
 
         return schema, result_rows
 
-    def load_data(self, dataframe, dataset_id, table_id, chunksize):
-        from google.cloud.bigquery import LoadJobConfig
-        from six import BytesIO
-
-        destination_table = self.client.dataset(dataset_id).table(table_id)
-        job_config = LoadJobConfig()
-        job_config.write_disposition = 'WRITE_APPEND'
-        job_config.source_format = 'NEWLINE_DELIMITED_JSON'
-        rows = []
-        remaining_rows = len(dataframe)
-
-        total_rows = remaining_rows
-        self._print("\n\n")
+    def load_data(
+            self, dataframe, dataset_id, table_id, chunksize=None,
+            schema=None):
+        from pandas_gbq import _load
 
-        for index, row in dataframe.reset_index(drop=True).iterrows():
-            row_json = row.to_json(
-                force_ascii=False, date_unit='s', date_format='iso')
-            rows.append(row_json)
-            remaining_rows -= 1
+        total_rows = len(dataframe)
+        self._print("\n\n")
 
-            if (len(rows) % chunksize == 0) or (remaining_rows == 0):
+        try:
+            for remaining_rows in _load.load_chunks(
+                    self.client, dataframe, dataset_id, table_id,
+                    chunksize=chunksize):
                 self._print("\rLoad is {0}% Complete".format(
                     ((total_rows - remaining_rows) * 100) / total_rows))
-
-                body = '{}\n'.format('\n'.join(rows))
-                if isinstance(body, bytes):
-                    body = body.decode('utf-8')
-                body = body.encode('utf-8')
-                body = BytesIO(body)
-
-                try:
-                    self.client.load_table_from_file(
-                        body,
-                        destination_table,
-                        job_config=job_config).result()
-                except self.http_error as ex:
-                    self.process_http_error(ex)
-
-                rows = []
+        except self.http_error as ex:
+            self.process_http_error(ex)
 
         self._print("\n")
 
@@ -888,7 +865,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
     return final_df
 
 
-def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
+def to_gbq(dataframe, destination_table, project_id, chunksize=None,
            verbose=True, reauth=False, if_exists='fail', private_key=None,
            auth_local_webserver=False, table_schema=None):
     """Write a DataFrame to a Google BigQuery table.
@@ -922,8 +899,9 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
        Name of table to be written, in the form 'dataset.tablename'
     project_id : str
         Google BigQuery Account project ID.
-    chunksize : int (default 10000)
-        Number of rows to be inserted in each chunk from the dataframe.
+    chunksize : int (default None)
+        Number of rows to be inserted in each chunk from the dataframe. Use
+        ``None`` to load the dataframe in a single chunk.
     verbose : boolean (default True)
         Show percentage complete
     reauth : boolean (default False)
@@ -985,7 +963,7 @@
             raise TableCreationError("Could not create the table because it "
                                      "already exists. "
                                      "Change the if_exists parameter to "
-                                     "append or replace data.")
+                                     "'append' or 'replace' data.")
         elif if_exists == 'replace':
             connector.delete_and_recreate_table(
                 dataset_id, table_id, table_schema)
@@ -999,19 +977,14 @@
     else:
         table.create(table_id, table_schema)
 
-    connector.load_data(dataframe, dataset_id, table_id, chunksize)
+    connector.load_data(
+        dataframe, dataset_id, table_id, chunksize=chunksize,
+        schema=table_schema)
 
 
 def generate_bq_schema(df, default_type='STRING'):
-    # deprecation TimeSeries, #11121
-    warnings.warn("generate_bq_schema is deprecated and will be removed in "
-                  "a future version", FutureWarning, stacklevel=2)
-
-    return _generate_bq_schema(df, default_type=default_type)
-
-
-def _generate_bq_schema(df, default_type='STRING'):
-    """ Given a passed df, generate the associated Google BigQuery schema.
+    """DEPRECATED: Given a passed df, generate the associated Google BigQuery
+    schema.
 
     Parameters
     ----------
@@ -1020,23 +993,16 @@ def _generate_bq_schema(df, default_type='STRING'):
         The default big query type in case the type of the column
         does not exist in the schema.
     """
+    # deprecation TimeSeries, #11121
+    warnings.warn("generate_bq_schema is deprecated and will be removed in "
+                  "a future version", FutureWarning, stacklevel=2)
 
-    type_mapping = {
-        'i': 'INTEGER',
-        'b': 'BOOLEAN',
-        'f': 'FLOAT',
-        'O': 'STRING',
-        'S': 'STRING',
-        'U': 'STRING',
-        'M': 'TIMESTAMP'
-    }
+    return _generate_bq_schema(df, default_type=default_type)
 
-    fields = []
-    for column_name, dtype in df.dtypes.iteritems():
-        fields.append({'name': column_name,
-                       'type': type_mapping.get(dtype.kind, default_type)})
 
-    return {'fields': fields}
+def _generate_bq_schema(df, default_type='STRING'):
+    from pandas_gbq import _schema
+    return _schema.generate_bq_schema(df, default_type=default_type)
 
 
 class _Table(GbqConnector):
@@ -1096,6 +1062,9 @@ def create(self, table_id, schema):
         table_ref = self.client.dataset(self.dataset_id).table(table_id)
         table = Table(table_ref)
 
+        # Manually create the schema objects, adding NULLABLE mode
+        # as a workaround for
+        # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
         for field in schema['fields']:
             if 'mode' not in field:
                 field['mode'] = 'NULLABLE'
pandas_gbq/tests/test__load.py

+40
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-

import numpy
import pandas


def test_encode_chunk_with_unicode():
    """Test that a dataframe containing unicode can be encoded as a file.

    See: https://github.com/pydata/pandas-gbq/issues/106
    """
    from pandas_gbq._load import encode_chunk

    df = pandas.DataFrame(
        numpy.random.randn(6, 4), index=range(6), columns=list('ABCD'))
    df['s'] = u'信用卡'
    csv_buffer = encode_chunk(df)
    csv_bytes = csv_buffer.read()
    csv_string = csv_bytes.decode('utf-8')
    assert u'信用卡' in csv_string


def test_encode_chunks_splits_dataframe():
    from pandas_gbq._load import encode_chunks
    df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
    chunks = list(encode_chunks(df, chunksize=2))
    assert len(chunks) == 3
    remaining, buffer = chunks[0]
    assert remaining == 4
    assert len(buffer.readlines()) == 2


def test_encode_chunks_with_chunksize_none():
    from pandas_gbq._load import encode_chunks
    df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
    chunks = list(encode_chunks(df))
    assert len(chunks) == 1
    remaining, buffer = chunks[0]
    assert remaining == 0
    assert len(buffer.readlines()) == 6
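
These tests run without BigQuery credentials. As a supplementary check in the same spirit (not part of the commit), a chunk can be round-tripped through pandas.read_csv; the column names here are arbitrary:

import pandas
from pandas_gbq._load import encode_chunk

df = pandas.DataFrame({'x': [1, 2], 'y': [0.5, 1.5]}, columns=['x', 'y'])
round_tripped = pandas.read_csv(
    encode_chunk(df), header=None, names=['x', 'y'])
assert round_tripped['x'].tolist() == [1, 2]
assert round_tripped['y'].tolist() == [0.5, 1.5]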

pandas_gbq/tests/test__schema.py

+55
@@ -0,0 +1,55 @@

import datetime

import pandas
import pytest

from pandas_gbq import _schema


@pytest.mark.parametrize(
    'dataframe,expected_schema',
    [
        (
            pandas.DataFrame(data={'col1': [1, 2, 3]}),
            {'fields': [{'name': 'col1', 'type': 'INTEGER'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [True, False]}),
            {'fields': [{'name': 'col1', 'type': 'BOOLEAN'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [1.0, 3.14]}),
            {'fields': [{'name': 'col1', 'type': 'FLOAT'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [u'hello', u'world']}),
            {'fields': [{'name': 'col1', 'type': 'STRING'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [datetime.datetime.now()]}),
            {'fields': [{'name': 'col1', 'type': 'TIMESTAMP'}]},
        ),
        (
            pandas.DataFrame(
                data={
                    'col1': [datetime.datetime.now()],
                    'col2': [u'hello'],
                    'col3': [3.14],
                    'col4': [True],
                    'col5': [4],
                }),
            {
                'fields': [
                    {'name': 'col1', 'type': 'TIMESTAMP'},
                    {'name': 'col2', 'type': 'STRING'},
                    {'name': 'col3', 'type': 'FLOAT'},
                    {'name': 'col4', 'type': 'BOOLEAN'},
                    {'name': 'col5', 'type': 'INTEGER'},
                ],
            },
        ),
    ])
def test_generate_bq_schema(dataframe, expected_schema):
    schema = _schema.generate_bq_schema(dataframe)
    assert schema == expected_schema
