Skip to content

Parse all date/time types #224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 10, 2018
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
.pytest_cache
.testmon*
.vscode/
.env

# Docs #
########
Expand Down
6 changes: 6 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ Changelog
0.7.0 / [unreleased]
--------------------

- `int` columns which contain `NULL` are now cast to `float`, rather than
`object` type. (:issue:`174`)
- `DATE`, `DATETIME` and `TIMESTAMP` columns are now parsed as pandas' `Timestamp`
  objects (:issue:`224`)
- Add :class:`pandas_gbq.Context` to cache credentials in-memory, across
calls to ``read_gbq`` and ``to_gbq``. (:issue:`198`, :issue:`208`)
- Fast queries now do not log above ``DEBUG`` level. (:issue:`204`)
Expand All @@ -20,6 +24,8 @@ Internal changes
~~~~~~~~~~~~~~~~

- Avoid listing datasets and tables in system tests. (:issue:`215`)
- Improved performance by eliminating some duplicative parsing steps
  (:issue:`224`)

.. _changelog-0.6.1:

Expand Down
8 changes: 1 addition & 7 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,7 @@ def test_latest_deps(session, python=latest_python):
@nox.session
def lint(session, python=latest_python):
session.install("black")
session.run(
"black",
"--check",
"--exclude",
"(\.git|\.hg|\.mypy_cache|\.tox|\.nox|\.venv|_build|buck-out|build|dist)",
".",
)
session.run("black", "--check", ".")


@nox.session
Expand Down
40 changes: 22 additions & 18 deletions pandas_gbq/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def __init__(

# BQ Queries costs $5 per TB. First 1 TB per month is free
# see here for more: https://cloud.google.com/bigquery/pricing
self.query_price_for_TB = 5. / 2 ** 40 # USD/TB
self.query_price_for_TB = 5.0 / 2 ** 40 # USD/TB

def _start_timer(self):
self.start = time.time()
Expand Down Expand Up @@ -577,24 +577,41 @@ def _parse_schema(schema_fields):
# see:
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
# #missing-data-casting-rules-and-indexing
dtype_map = {"FLOAT": np.dtype(float), "TIMESTAMP": "M8[ns]"}
dtype_map = {
"FLOAT": np.dtype(float),
"TIMESTAMP": "datetime64[ns]",
"TIME": "datetime64[ns]",
"DATE": "datetime64[ns]",
"DATETIME": "datetime64[ns]",
"BOOLEAN": bool,
"INTEGER": np.int64,
}

for field in schema_fields:
name = str(field["name"])
if field["mode"].upper() == "REPEATED":
yield name, object
else:
dtype = dtype_map.get(field["type"].upper(), object)
dtype = dtype_map.get(field["type"].upper())
yield name, dtype


def _parse_data(schema, rows):

column_dtypes = OrderedDict(_parse_schema(schema["fields"]))

df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())

for column in df:
df[column] = df[column].astype(column_dtypes[column])
dtype = column_dtypes[column]
null_safe = (
df[column].notnull().all()
or dtype == float
or dtype == "datetime64[ns]"
)
if dtype and null_safe:
df[column] = df[column].astype(
column_dtypes[column], errors="ignore"
)
return df


Expand Down Expand Up @@ -747,19 +764,6 @@ def read_gbq(
"Column order does not match this DataFrame."
)

# cast BOOLEAN and INTEGER columns from object to bool/int
# if they dont have any nulls AND field mode is not repeated (i.e., array)
type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
for field in schema["fields"]:
if (
field["type"].upper() in type_map
and final_df[field["name"]].notnull().all()
and field["mode"].lower() != "repeated"
):
final_df[field["name"]] = final_df[field["name"]].astype(
type_map[field["type"].upper()]
)

connector.log_elapsed_seconds(
"Total time taken",
datetime.now().strftime("s.\nFinished at %Y-%m-%d %H:%M:%S."),
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ exclude = '''
versioneer.py
| _version.py
| docs
| .nox
'''
60 changes: 33 additions & 27 deletions tests/system/test_gbq.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-

import sys
from datetime import datetime
import uuid
from datetime import datetime

import numpy as np
import pandas.util.testing as tm
Expand Down Expand Up @@ -200,9 +200,7 @@ def test_should_properly_handle_nullable_integers(self, project_id):
private_key=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(
df, DataFrame({"nullable_integer": [1, None]}).astype(object)
)
tm.assert_frame_equal(df, DataFrame({"nullable_integer": [1, None]}))

def test_should_properly_handle_valid_longs(self, project_id):
query = "SELECT 1 << 62 AS valid_long"
Expand All @@ -225,7 +223,7 @@ def test_should_properly_handle_nullable_longs(self, project_id):
dialect="legacy",
)
tm.assert_frame_equal(
df, DataFrame({"nullable_long": [1 << 62, None]}).astype(object)
df, DataFrame({"nullable_long": [1 << 62, None]})
)

def test_should_properly_handle_null_integers(self, project_id):
Expand Down Expand Up @@ -338,35 +336,43 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
),
)

def test_should_properly_handle_null_timestamp(self, project_id):
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
df = gbq.read_gbq(
query,
project_id=project_id,
private_key=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
@pytest.mark.parametrize(
"expression, type_",
[
("current_date()", "<M8[ns]"),
("current_timestamp()", "<M8[ns]"),
("current_datetime()", "<M8[ns]"),
("TRUE", bool),
("FALSE", bool),
],
)
def test_return_correct_types(self, project_id, expression, type_):
"""
All type checks can be added to this function using additional
parameters, rather than creating additional functions.
We can consolidate the existing functions here in time

def test_should_properly_handle_true_boolean(self, project_id):
query = "SELECT BOOLEAN(TRUE) AS true_boolean"
TODO: time doesn't currently parse
("time(12,30,00)", "<M8[ns]"),
"""
query = "SELECT {} AS _".format(expression)
df = gbq.read_gbq(
query,
project_id=project_id,
private_key=self.credentials,
dialect="legacy",
dialect="standard",
)
tm.assert_frame_equal(df, DataFrame({"true_boolean": [True]}))
assert df["_"].dtype == type_

def test_should_properly_handle_false_boolean(self, project_id):
query = "SELECT BOOLEAN(FALSE) AS false_boolean"
def test_should_properly_handle_null_timestamp(self, project_id):
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
df = gbq.read_gbq(
query,
project_id=project_id,
private_key=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(df, DataFrame({"false_boolean": [False]}))
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))

def test_should_properly_handle_null_boolean(self, project_id):
query = "SELECT BOOLEAN(NULL) AS null_boolean"
Expand Down Expand Up @@ -741,12 +747,12 @@ def test_query_response_bytes(self):
assert self.gbq_connector.sizeof_fmt(1048576) == "1.0 MB"
assert self.gbq_connector.sizeof_fmt(1048576000) == "1000.0 MB"
assert self.gbq_connector.sizeof_fmt(1073741824) == "1.0 GB"
assert self.gbq_connector.sizeof_fmt(1.099512E12) == "1.0 TB"
assert self.gbq_connector.sizeof_fmt(1.125900E15) == "1.0 PB"
assert self.gbq_connector.sizeof_fmt(1.152922E18) == "1.0 EB"
assert self.gbq_connector.sizeof_fmt(1.180592E21) == "1.0 ZB"
assert self.gbq_connector.sizeof_fmt(1.208926E24) == "1.0 YB"
assert self.gbq_connector.sizeof_fmt(1.208926E28) == "10000.0 YB"
assert self.gbq_connector.sizeof_fmt(1.099512e12) == "1.0 TB"
assert self.gbq_connector.sizeof_fmt(1.125900e15) == "1.0 PB"
assert self.gbq_connector.sizeof_fmt(1.152922e18) == "1.0 EB"
assert self.gbq_connector.sizeof_fmt(1.180592e21) == "1.0 ZB"
assert self.gbq_connector.sizeof_fmt(1.208926e24) == "1.0 YB"
assert self.gbq_connector.sizeof_fmt(1.208926e28) == "10000.0 YB"

def test_struct(self, project_id):
query = """SELECT 1 int_field,
Expand Down
1 change: 0 additions & 1 deletion tests/unit/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

import datetime

import pandas
Expand Down