From 12b59d0a0bd2ea20bec6236c091da7bfe00334be Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 11 Jan 2018 22:07:15 -0500 Subject: [PATCH 01/12] encode before uploading --- pandas_gbq/gbq.py | 4 ++-- pandas_gbq/tests/test_gbq.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 77efe100..ac7722d9 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -559,7 +559,7 @@ def run_query(self, query, **kwargs): def load_data(self, dataframe, dataset_id, table_id, chunksize): from google.cloud.bigquery import LoadJobConfig - from six import StringIO + from six import BytesIO destination_table = self.client.dataset(dataset_id).table(table_id) job_config = LoadJobConfig() @@ -581,7 +581,7 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize): self._print("\rLoad is {0}% Complete".format( ((total_rows - remaining_rows) * 100) / total_rows)) - body = StringIO('{}\n'.format('\n'.join(rows))) + body = BytesIO('{}\n'.format('\n'.join(rows)).encode()) try: self.client.load_table_from_file( diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 75274d97..72c2fca1 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1468,6 +1468,23 @@ def test_upload_data(self): assert result['num_rows'][0] == test_size + def test_upload_unicode_data(self): + test_id = "1" + test_size = 10 + df = DataFrame(np.random.randn(6, 4), index=range(6), columns=list('ABCD')) + df.A = '信用卡' + + gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), + chunksize=10000) + + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + self.destination_table + test_id), + project_id=_get_project_id()) + + assert result['num_rows'][0] == test_size + + + class TestToGBQIntegrationWithServiceAccountKeyContents(object): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. From 937596f0b58d3ef9d83ff63be7785ea66198b87a Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 11 Jan 2018 22:19:15 -0500 Subject: [PATCH 02/12] set py file coding for py2 --- pandas_gbq/tests/test_gbq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 72c2fca1..5c91576c 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + import pytest import re @@ -1472,7 +1474,7 @@ def test_upload_unicode_data(self): test_id = "1" test_size = 10 df = DataFrame(np.random.randn(6, 4), index=range(6), columns=list('ABCD')) - df.A = '信用卡' + df.A = u'信用卡' gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000) From a64f67afb215643e876cd2d38719886023570466 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 11 Jan 2018 22:36:58 -0500 Subject: [PATCH 03/12] lint --- pandas_gbq/tests/test_gbq.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 5c91576c..006b7fa0 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1469,11 +1469,11 @@ def test_upload_data(self): assert result['num_rows'][0] == test_size - def test_upload_unicode_data(self): test_id = "1" test_size = 10 - df = DataFrame(np.random.randn(6, 4), index=range(6), columns=list('ABCD')) + df = DataFrame(np.random.randn(6, 4), index=range(6), + columns=list('ABCD')) df.A = u'信用卡' gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), @@ -1486,7 +1486,6 @@ def test_upload_unicode_data(self): assert result['num_rows'][0] == test_size - class TestToGBQIntegrationWithServiceAccountKeyContents(object): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. From 3ed5a24001267c91fe8cedba99b7c927f45dbbed Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 11 Jan 2018 22:43:37 -0500 Subject: [PATCH 04/12] move test to travis test class --- pandas_gbq/tests/test_gbq.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 006b7fa0..c29adf30 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1156,6 +1156,22 @@ def test_google_upload_errors_should_raise_exception(self): gbq.to_gbq(bad_df, self.destination_table + test_id, _get_project_id(), private_key=_get_private_key_path()) + def test_upload_unicode_data(self): + test_id = "1" + test_size = 10 + df = DataFrame(np.random.randn(6, 4), index=range(6), + columns=list('ABCD')) + df.A = u'信用卡' + + gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), + chunksize=10000) + + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + self.destination_table + test_id), + project_id=_get_project_id()) + + assert result['num_rows'][0] == test_size + def test_generate_schema(self): df = tm.makeMixedDataFrame() schema = gbq._generate_bq_schema(df) @@ -1469,22 +1485,6 @@ def test_upload_data(self): assert result['num_rows'][0] == test_size - def test_upload_unicode_data(self): - test_id = "1" - test_size = 10 - df = DataFrame(np.random.randn(6, 4), index=range(6), - columns=list('ABCD')) - df.A = u'信用卡' - - gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), - chunksize=10000) - - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( - self.destination_table + test_id), - project_id=_get_project_id()) - - assert result['num_rows'][0] == test_size - class TestToGBQIntegrationWithServiceAccountKeyContents(object): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 From 0434e34d57d61296c9df09139f32220efbbdc76f Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 12 Jan 2018 10:03:40 -0500 Subject: [PATCH 05/12] try forcing utf-8 encoding --- pandas_gbq/gbq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index ac7722d9..5b9d753c 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -581,7 +581,7 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize): self._print("\rLoad is {0}% Complete".format( ((total_rows - remaining_rows) * 100) / total_rows)) - body = BytesIO('{}\n'.format('\n'.join(rows)).encode()) + body = BytesIO('{}\n'.format('\n'.join(rows)).encode('utf-8')) try: self.client.load_table_from_file( From 085018f4eb02ee42406bc1fea09169295494215d Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 12 Jan 2018 10:39:46 -0500 Subject: [PATCH 06/12] add test --- pandas_gbq/tests/test_gbq.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index c29adf30..bcb4442f 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1156,7 +1156,7 @@ def test_google_upload_errors_should_raise_exception(self): gbq.to_gbq(bad_df, self.destination_table + test_id, _get_project_id(), private_key=_get_private_key_path()) - def test_upload_unicode_data(self): + def test_upload_chinese_unicode_data(self): test_id = "1" test_size = 10 df = DataFrame(np.random.randn(6, 4), index=range(6), @@ -1172,6 +1172,27 @@ def test_upload_unicode_data(self): assert result['num_rows'][0] == test_size + def test_upload_other_unicode_data(self): + test_id = "1" + test_size = 10 + df = DataFrame({ + 'string': ['Skywalker™', 'lego', 'hülle'], + 'integer': [200, 300, 400], + 'Date': [ + '2017-12-13 17:40:39', '2017-12-13 17:40:39', + '2017-12-13 17:40:39' + ] + }) + + gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), + chunksize=10000) + + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + self.destination_table + test_id), + project_id=_get_project_id()) + + assert result['num_rows'][0] == test_size + def test_generate_schema(self): df = tm.makeMixedDataFrame() schema = gbq._generate_bq_schema(df) From 8cd699137789937725f10c5b831139067de13cb8 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 12 Jan 2018 13:55:00 -0500 Subject: [PATCH 07/12] correct expected sizes --- pandas_gbq/tests/test_gbq.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index bcb4442f..10ec6597 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1157,8 +1157,8 @@ def test_google_upload_errors_should_raise_exception(self): _get_project_id(), private_key=_get_private_key_path()) def test_upload_chinese_unicode_data(self): - test_id = "1" - test_size = 10 + test_id = "2" + test_size = 6 df = DataFrame(np.random.randn(6, 4), index=range(6), columns=list('ABCD')) df.A = u'信用卡' @@ -1173,8 +1173,8 @@ def test_upload_chinese_unicode_data(self): assert result['num_rows'][0] == test_size def test_upload_other_unicode_data(self): - test_id = "1" - test_size = 10 + test_id = "3" + test_size = 3 df = DataFrame({ 'string': ['Skywalker™', 'lego', 'hülle'], 'integer': [200, 300, 400], @@ -1191,7 +1191,7 @@ def test_upload_other_unicode_data(self): self.destination_table + test_id), project_id=_get_project_id()) - assert result['num_rows'][0] == test_size + assert result['num_rows'][0] == test_si def test_generate_schema(self): df = tm.makeMixedDataFrame() @@ -1505,7 +1505,7 @@ def test_upload_data(self): project_id=_get_project_id()) assert result['num_rows'][0] == test_size - + class TestToGBQIntegrationWithServiceAccountKeyContents(object): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 From 4d52f959513aec788fff50726f2d3d9a3e511707 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 13 Jan 2018 20:42:04 -0500 Subject: [PATCH 08/12] test data matches --- pandas_gbq/tests/test_gbq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 10ec6597..79108535 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1171,6 +1171,7 @@ def test_upload_chinese_unicode_data(self): project_id=_get_project_id()) assert result['num_rows'][0] == test_size + tm.assert_series_equal(result['A'], df['A']) def test_upload_other_unicode_data(self): test_id = "3" @@ -1191,7 +1192,8 @@ def test_upload_other_unicode_data(self): self.destination_table + test_id), project_id=_get_project_id()) - assert result['num_rows'][0] == test_si + assert result['num_rows'][0] == test_size + tm.assert_series_equal(result['string'], df['string']) def test_generate_schema(self): df = tm.makeMixedDataFrame() From b3bbff018181daf5fcbb83241d486239940ebc0d Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 13 Jan 2018 20:46:20 -0500 Subject: [PATCH 09/12] test unicode locally --- pandas_gbq/tests/test_gbq.py | 40 +++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 79108535..e6628fa5 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1507,7 +1507,45 @@ def test_upload_data(self): project_id=_get_project_id()) assert result['num_rows'][0] == test_size - + + def test_upload_chinese_unicode_data(self): + test_id = "2" + test_size = 6 + df = DataFrame(np.random.randn(6, 4), index=range(6), + columns=list('ABCD')) + df.A = u'信用卡' + + gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), + chunksize=10000) + + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + self.destination_table + test_id), + project_id=_get_project_id()) + + assert result['num_rows'][0] == test_size + tm.assert_series_equal(result['A'], df['A']) + + def test_upload_other_unicode_data(self): + test_id = "3" + test_size = 3 + df = DataFrame({ + 'string': ['Skywalker™', 'lego', 'hülle'], + 'integer': [200, 300, 400], + 'Date': [ + '2017-12-13 17:40:39', '2017-12-13 17:40:39', + '2017-12-13 17:40:39' + ] + }) + + gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), + chunksize=10000) + + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + self.destination_table + test_id), + project_id=_get_project_id()) + + assert result['num_rows'][0] == test_size + tm.assert_series_equal(result['string'], df['string']) class TestToGBQIntegrationWithServiceAccountKeyContents(object): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 From fca5d2965da30399e9d9d6332e12539ec5629f5b Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 13 Jan 2018 22:13:26 -0500 Subject: [PATCH 10/12] Py2/Py3 compat --- pandas_gbq/gbq.py | 6 ++- pandas_gbq/tests/test_gbq.py | 72 ++++++++++++++++++++++++++---------- 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 5b9d753c..67d5ea51 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -581,7 +581,11 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize): self._print("\rLoad is {0}% Complete".format( ((total_rows - remaining_rows) * 100) / total_rows)) - body = BytesIO('{}\n'.format('\n'.join(rows)).encode('utf-8')) + body = '{}\n'.format('\n'.join(rows)) + if isinstance(body, bytes): + body = body.decode('utf-8') + body = body.encode('utf-8') + body = BytesIO(body) try: self.client.load_table_from_file( diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index e6628fa5..20b6e244 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -9,6 +9,7 @@ import os from random import randint import logging +import sys import numpy as np @@ -1161,25 +1162,33 @@ def test_upload_chinese_unicode_data(self): test_size = 6 df = DataFrame(np.random.randn(6, 4), index=range(6), columns=list('ABCD')) - df.A = u'信用卡' + df['s'] = u'信用卡' gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000) - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + result = gbq.read_gbq("SELECT * FROM {0}".format( self.destination_table + test_id), project_id=_get_project_id()) - assert result['num_rows'][0] == test_size - tm.assert_series_equal(result['A'], df['A']) + assert len(result_df) == test_size + + pytest.skipif( + sys.version_info.major < 3, + reason='Unicode comparison in Py2 not working') + + result = result_df['s'].sort_values() + expected = df['s'].sort_values() + + tm.assert_numpy_array_equal(expected.values, result.values) def test_upload_other_unicode_data(self): test_id = "3" test_size = 3 df = DataFrame({ - 'string': ['Skywalker™', 'lego', 'hülle'], - 'integer': [200, 300, 400], - 'Date': [ + 's': ['Skywalker™', 'lego', 'hülle'], + 'i': [200, 300, 400], + 'd': [ '2017-12-13 17:40:39', '2017-12-13 17:40:39', '2017-12-13 17:40:39' ] @@ -1188,12 +1197,20 @@ def test_upload_other_unicode_data(self): gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000) - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + result_df = gbq.read_gbq("SELECT * FROM {0}".format( self.destination_table + test_id), project_id=_get_project_id()) - assert result['num_rows'][0] == test_size - tm.assert_series_equal(result['string'], df['string']) + assert len(result_df) == test_size + + pytest.skipif( + sys.version_info.major < 3, + reason='Unicode comparison in Py2 not working') + + result = result_df['s'].sort_values() + expected = df['s'].sort_values() + + tm.assert_numpy_array_equal(expected.values, result.values) def test_generate_schema(self): df = tm.makeMixedDataFrame() @@ -1513,25 +1530,32 @@ def test_upload_chinese_unicode_data(self): test_size = 6 df = DataFrame(np.random.randn(6, 4), index=range(6), columns=list('ABCD')) - df.A = u'信用卡' + df['s'] = u'信用卡' gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000) - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + result_df = gbq.read_gbq("SELECT * FROM {0}".format( self.destination_table + test_id), project_id=_get_project_id()) - assert result['num_rows'][0] == test_size - tm.assert_series_equal(result['A'], df['A']) + assert len(result_df) == test_size + + if sys.version_info.major < 3: + pytest.skip(msg='Unicode comparison in Py2 not working') + + result = result_df['s'].sort_values() + expected = df['s'].sort_values() + + tm.assert_numpy_array_equal(expected.values, result.values) def test_upload_other_unicode_data(self): test_id = "3" test_size = 3 df = DataFrame({ - 'string': ['Skywalker™', 'lego', 'hülle'], - 'integer': [200, 300, 400], - 'Date': [ + 's': ['Skywalker™', 'lego', 'hülle'], + 'i': [200, 300, 400], + 'd': [ '2017-12-13 17:40:39', '2017-12-13 17:40:39', '2017-12-13 17:40:39' ] @@ -1540,12 +1564,20 @@ def test_upload_other_unicode_data(self): gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000) - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( + result_df = gbq.read_gbq("SELECT * FROM {0}".format( self.destination_table + test_id), project_id=_get_project_id()) - assert result['num_rows'][0] == test_size - tm.assert_series_equal(result['string'], df['string']) + assert len(result_df) == test_size + + if sys.version_info.major < 3: + pytest.skip(msg='Unicode comparison in Py2 not working') + + result = result_df['s'].sort_values() + expected = df['s'].sort_values() + + tm.assert_numpy_array_equal(expected.values, result.values) + class TestToGBQIntegrationWithServiceAccountKeyContents(object): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 From e0b80eb614e727e010cb0141772f102f8b5d1589 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Wed, 17 Jan 2018 10:33:43 -0500 Subject: [PATCH 11/12] typo --- pandas_gbq/tests/test_gbq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 20b6e244..27f991d7 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1167,7 +1167,7 @@ def test_upload_chinese_unicode_data(self): gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000) - result = gbq.read_gbq("SELECT * FROM {0}".format( + result_df = gbq.read_gbq("SELECT * FROM {0}".format( self.destination_table + test_id), project_id=_get_project_id()) From 42882a22b29ce6349f7264a7d44f827232185bf5 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Wed, 17 Jan 2018 14:15:35 -0500 Subject: [PATCH 12/12] what's new --- docs/source/changelog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 1cd36a41..5d1bb98b 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,13 @@ Changelog ========= + +0.3.1 / [TBD] +------------------ + +- Fix an issue where Unicode couldn't be uploaded in Python 2 (:issue:`93`) + + 0.3.0 / 2018-01-03 ------------------