Skip to content

Commit b204206

Browse files
authored
Parse all date/time types (#224)
* parse all datetime types * typo * new black version * I think we're doing similar things twice * better type tests * check nulls before assigning type * add env to gitignore * remove old code * handle float and int columns re nulls * nullable int columns as floats (separate issue) * Chesterton's Fence * try falling back to standard black check * exclude nox * changelog
1 parent 942c879 commit b204206

File tree

7 files changed

+57
-46
lines changed

7 files changed

+57
-46
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
.pytest_cache
2323
.testmon*
2424
.vscode/
25+
.env
2526

2627
# Docs #
2728
########

docs/source/changelog.rst

+6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Changelog
66
0.7.0 / [unreleased]
77
--------------------
88

9+
- `int` columns which contain `NULL` are now cast to `float`, rather than
10+
`object` type. (:issue:`174`)
11+
- `DATE`, `DATETIME` and `TIMESTAMP` columns are now parsed as pandas' `timestamp`
12+
objects (:issue:`224`)
913
- Add :class:`pandas_gbq.Context` to cache credentials in-memory, across
1014
calls to ``read_gbq`` and ``to_gbq``. (:issue:`198`, :issue:`208`)
1115
- Fast queries now do not log above ``DEBUG`` level. (:issue:`204`)
@@ -20,6 +24,8 @@ Internal changes
2024
~~~~~~~~~~~~~~~~
2125

2226
- Avoid listing datasets and tables in system tests. (:issue:`215`)
27+
- Improved performance from eliminating some duplicative parsing steps
28+
(:issue:`224`)
2329

2430
.. _changelog-0.6.1:
2531

noxfile.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,7 @@ def test_latest_deps(session, python=latest_python):
7777
@nox.session
7878
def lint(session, python=latest_python):
7979
session.install("black")
80-
session.run(
81-
"black",
82-
"--check",
83-
"--exclude",
84-
"(\.git|\.hg|\.mypy_cache|\.tox|\.nox|\.venv|_build|buck-out|build|dist)",
85-
".",
86-
)
80+
session.run("black", "--check", ".")
8781

8882

8983
@nox.session

pandas_gbq/gbq.py

+21-17
Original file line numberDiff line numberDiff line change
@@ -577,24 +577,41 @@ def _parse_schema(schema_fields):
577577
# see:
578578
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
579579
# #missing-data-casting-rules-and-indexing
580-
dtype_map = {"FLOAT": np.dtype(float), "TIMESTAMP": "M8[ns]"}
580+
dtype_map = {
581+
"FLOAT": np.dtype(float),
582+
"TIMESTAMP": "datetime64[ns]",
583+
"TIME": "datetime64[ns]",
584+
"DATE": "datetime64[ns]",
585+
"DATETIME": "datetime64[ns]",
586+
"BOOLEAN": bool,
587+
"INTEGER": np.int64,
588+
}
581589

582590
for field in schema_fields:
583591
name = str(field["name"])
584592
if field["mode"].upper() == "REPEATED":
585593
yield name, object
586594
else:
587-
dtype = dtype_map.get(field["type"].upper(), object)
595+
dtype = dtype_map.get(field["type"].upper())
588596
yield name, dtype
589597

590598

591599
def _parse_data(schema, rows):
592600

593601
column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
594-
595602
df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
603+
596604
for column in df:
597-
df[column] = df[column].astype(column_dtypes[column])
605+
dtype = column_dtypes[column]
606+
null_safe = (
607+
df[column].notnull().all()
608+
or dtype == float
609+
or dtype == "datetime64[ns]"
610+
)
611+
if dtype and null_safe:
612+
df[column] = df[column].astype(
613+
column_dtypes[column], errors="ignore"
614+
)
598615
return df
599616

600617

@@ -747,19 +764,6 @@ def read_gbq(
747764
"Column order does not match this DataFrame."
748765
)
749766

750-
# cast BOOLEAN and INTEGER columns from object to bool/int
751-
# if they dont have any nulls AND field mode is not repeated (i.e., array)
752-
type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
753-
for field in schema["fields"]:
754-
if (
755-
field["type"].upper() in type_map
756-
and final_df[field["name"]].notnull().all()
757-
and field["mode"].lower() != "repeated"
758-
):
759-
final_df[field["name"]] = final_df[field["name"]].astype(
760-
type_map[field["type"].upper()]
761-
)
762-
763767
connector.log_elapsed_seconds(
764768
"Total time taken",
765769
datetime.now().strftime("s.\nFinished at %Y-%m-%d %H:%M:%S."),

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ exclude = '''
44
versioneer.py
55
| _version.py
66
| docs
7+
| .nox
78
'''

tests/system/test_gbq.py

+27-21
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# -*- coding: utf-8 -*-
22

33
import sys
4-
from datetime import datetime
54
import uuid
5+
from datetime import datetime
66

77
import numpy as np
88
import pandas.util.testing as tm
@@ -200,9 +200,7 @@ def test_should_properly_handle_nullable_integers(self, project_id):
200200
private_key=self.credentials,
201201
dialect="legacy",
202202
)
203-
tm.assert_frame_equal(
204-
df, DataFrame({"nullable_integer": [1, None]}).astype(object)
205-
)
203+
tm.assert_frame_equal(df, DataFrame({"nullable_integer": [1, None]}))
206204

207205
def test_should_properly_handle_valid_longs(self, project_id):
208206
query = "SELECT 1 << 62 AS valid_long"
@@ -225,7 +223,7 @@ def test_should_properly_handle_nullable_longs(self, project_id):
225223
dialect="legacy",
226224
)
227225
tm.assert_frame_equal(
228-
df, DataFrame({"nullable_long": [1 << 62, None]}).astype(object)
226+
df, DataFrame({"nullable_long": [1 << 62, None]})
229227
)
230228

231229
def test_should_properly_handle_null_integers(self, project_id):
@@ -338,35 +336,43 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
338336
),
339337
)
340338

341-
def test_should_properly_handle_null_timestamp(self, project_id):
342-
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
343-
df = gbq.read_gbq(
344-
query,
345-
project_id=project_id,
346-
private_key=self.credentials,
347-
dialect="legacy",
348-
)
349-
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
339+
@pytest.mark.parametrize(
340+
"expression, type_",
341+
[
342+
("current_date()", "<M8[ns]"),
343+
("current_timestamp()", "<M8[ns]"),
344+
("current_datetime()", "<M8[ns]"),
345+
("TRUE", bool),
346+
("FALSE", bool),
347+
],
348+
)
349+
def test_return_correct_types(self, project_id, expression, type_):
350+
"""
351+
All type checks can be added to this function using additional
352+
parameters, rather than creating additional functions.
353+
We can consolidate the existing functions here in time
350354
351-
def test_should_properly_handle_true_boolean(self, project_id):
352-
query = "SELECT BOOLEAN(TRUE) AS true_boolean"
355+
TODO: time doesn't currently parse
356+
("time(12,30,00)", "<M8[ns]"),
357+
"""
358+
query = "SELECT {} AS _".format(expression)
353359
df = gbq.read_gbq(
354360
query,
355361
project_id=project_id,
356362
private_key=self.credentials,
357-
dialect="legacy",
363+
dialect="standard",
358364
)
359-
tm.assert_frame_equal(df, DataFrame({"true_boolean": [True]}))
365+
assert df["_"].dtype == type_
360366

361-
def test_should_properly_handle_false_boolean(self, project_id):
362-
query = "SELECT BOOLEAN(FALSE) AS false_boolean"
367+
def test_should_properly_handle_null_timestamp(self, project_id):
368+
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
363369
df = gbq.read_gbq(
364370
query,
365371
project_id=project_id,
366372
private_key=self.credentials,
367373
dialect="legacy",
368374
)
369-
tm.assert_frame_equal(df, DataFrame({"false_boolean": [False]}))
375+
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
370376

371377
def test_should_properly_handle_null_boolean(self, project_id):
372378
query = "SELECT BOOLEAN(NULL) AS null_boolean"

tests/unit/test_schema.py

-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
import datetime
32

43
import pandas

0 commit comments

Comments (0)