Skip to content

Encode before uploading #108

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changelog
=========


0.3.1 / [TBD]
------------------

- Fix an issue where Unicode couldn't be uploaded in Python 2 (:issue:`93`)


0.3.0 / 2018-01-03
------------------

Expand Down
8 changes: 6 additions & 2 deletions pandas_gbq/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,7 @@ def run_query(self, query, **kwargs):

def load_data(self, dataframe, dataset_id, table_id, chunksize):
from google.cloud.bigquery import LoadJobConfig
from six import StringIO
from six import BytesIO

destination_table = self.client.dataset(dataset_id).table(table_id)
job_config = LoadJobConfig()
Expand All @@ -581,7 +581,11 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize):
self._print("\rLoad is {0}% Complete".format(
((total_rows - remaining_rows) * 100) / total_rows))

body = StringIO('{}\n'.format('\n'.join(rows)))
body = '{}\n'.format('\n'.join(rows))
Copy link
Collaborator

@tswast tswast Jan 17, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you use u'{}\n'.format(u'\n'.join(rows)) is the if statement checking for bytes necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately not:


>               body = u'{}\n'.format(u'\n'.join(rows))
E               UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 77: ordinal not in range(128)

I think the nub of the problem is that row.to_json comes out as either bytes or str depending on the Python version - so we need some branching somewhere. Unless there's a function in Python that can deal with both (this all seems a bit inelegant)

(I also tried decoding the row first on line 576, which made Py2 pass, but then Python 3 failed, because it can't decode unicode.)

if isinstance(body, bytes):
body = body.decode('utf-8')
body = body.encode('utf-8')
body = BytesIO(body)

try:
self.client.load_table_from_file(
Expand Down
111 changes: 111 additions & 0 deletions pandas_gbq/tests/test_gbq.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

import pytest

import re
Expand All @@ -7,6 +9,7 @@
import os
from random import randint
import logging
import sys

import numpy as np

Expand Down Expand Up @@ -1154,6 +1157,61 @@ def test_google_upload_errors_should_raise_exception(self):
gbq.to_gbq(bad_df, self.destination_table + test_id,
_get_project_id(), private_key=_get_private_key_path())

def test_upload_chinese_unicode_data(self):
    """Round-trip a DataFrame containing Chinese (non-ASCII) strings.

    Regression test for the Python 2 Unicode upload failure (issue #93):
    the row count must survive the upload on both Python versions; the
    exact string comparison is only performed on Python 3.
    """
    test_id = "2"
    test_size = 6
    df = DataFrame(np.random.randn(6, 4), index=range(6),
                   columns=list('ABCD'))
    df['s'] = u'信用卡'

    gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
               chunksize=10000)

    result_df = gbq.read_gbq("SELECT * FROM {0}".format(
        self.destination_table + test_id),
        project_id=_get_project_id())

    assert len(result_df) == test_size

    # Bug fix: the original called ``pytest.skipif(...)``, which does not
    # exist at module level (only ``pytest.mark.skipif`` does) and raises
    # AttributeError at runtime. Skip imperatively instead, matching the
    # sibling tests in the other integration class.
    if sys.version_info.major < 3:
        pytest.skip('Unicode comparison in Py2 not working')

    result = result_df['s'].sort_values()
    expected = df['s'].sort_values()

    tm.assert_numpy_array_equal(expected.values, result.values)

def test_upload_other_unicode_data(self):
    """Round-trip non-ASCII Latin/symbol strings (e.g. '™', 'ü').

    Companion to the Chinese-Unicode regression test for issue #93;
    the string comparison is only performed on Python 3.
    """
    test_id = "3"
    test_size = 3
    df = DataFrame({
        's': ['Skywalker™', 'lego', 'hülle'],
        'i': [200, 300, 400],
        'd': [
            '2017-12-13 17:40:39', '2017-12-13 17:40:39',
            '2017-12-13 17:40:39'
        ]
    })

    gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
               chunksize=10000)

    result_df = gbq.read_gbq("SELECT * FROM {0}".format(
        self.destination_table + test_id),
        project_id=_get_project_id())

    assert len(result_df) == test_size

    # Bug fix: the original called ``pytest.skipif(...)``, which does not
    # exist at module level (only ``pytest.mark.skipif`` does) and raises
    # AttributeError at runtime. Skip imperatively instead, matching the
    # sibling tests in the other integration class.
    if sys.version_info.major < 3:
        pytest.skip('Unicode comparison in Py2 not working')

    result = result_df['s'].sort_values()
    expected = df['s'].sort_values()

    tm.assert_numpy_array_equal(expected.values, result.values)

def test_generate_schema(self):
df = tm.makeMixedDataFrame()
schema = gbq._generate_bq_schema(df)
Expand Down Expand Up @@ -1467,6 +1525,59 @@ def test_upload_data(self):

assert result['num_rows'][0] == test_size

def test_upload_chinese_unicode_data(self):
    """Round-trip a DataFrame containing Chinese (non-ASCII) strings.

    Regression test for the Python 2 Unicode upload failure (issue #93);
    the exact string comparison is only performed on Python 3.
    """
    test_id = "2"
    test_size = 6
    df = DataFrame(np.random.randn(6, 4), index=range(6),
                   columns=list('ABCD'))
    df['s'] = u'信用卡'

    gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
               chunksize=10000)

    result_df = gbq.read_gbq("SELECT * FROM {0}".format(
        self.destination_table + test_id),
        project_id=_get_project_id())

    assert len(result_df) == test_size

    if sys.version_info.major < 3:
        # Pass the reason positionally: the ``msg=`` keyword is deprecated
        # and later removed in pytest (replaced by ``reason=``), whereas
        # the positional form works on every pytest version.
        pytest.skip('Unicode comparison in Py2 not working')

    result = result_df['s'].sort_values()
    expected = df['s'].sort_values()

    tm.assert_numpy_array_equal(expected.values, result.values)

def test_upload_other_unicode_data(self):
    """Round-trip non-ASCII Latin/symbol strings (e.g. '™', 'ü').

    Companion to the Chinese-Unicode regression test for issue #93;
    the string comparison is only performed on Python 3.
    """
    test_id = "3"
    test_size = 3
    df = DataFrame({
        's': ['Skywalker™', 'lego', 'hülle'],
        'i': [200, 300, 400],
        'd': [
            '2017-12-13 17:40:39', '2017-12-13 17:40:39',
            '2017-12-13 17:40:39'
        ]
    })

    gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
               chunksize=10000)

    result_df = gbq.read_gbq("SELECT * FROM {0}".format(
        self.destination_table + test_id),
        project_id=_get_project_id())

    assert len(result_df) == test_size

    if sys.version_info.major < 3:
        # Pass the reason positionally: the ``msg=`` keyword is deprecated
        # and later removed in pytest (replaced by ``reason=``), whereas
        # the positional form works on every pytest version.
        pytest.skip('Unicode comparison in Py2 not working')

    result = result_df['s'].sort_values()
    expected = df['s'].sort_values()

    tm.assert_numpy_array_equal(expected.values, result.values)


class TestToGBQIntegrationWithServiceAccountKeyContents(object):
# Changes to BigQuery table schema may take up to 2 minutes as of May 2015
Expand Down