Skip to content

Commit b204206

Browse files
authored
Parse all date/time types (#224)
* parse all datetime types * typo * new black version * I think we're doing similar things twice * better type tests * check nulls before assigning type * add env to gitignore * remove old code * handle float and int columns re nulls * nullable int columns as floats (separate issue) * Chesterton's Fence * try falling back to standard black check * exclude nox * changelog
1 parent 942c879 commit b204206

File tree

7 files changed

+57
-46
lines changed

7 files changed

+57
-46
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
.pytest_cache
2323
.testmon*
2424
.vscode/
25+
.env
2526

2627
# Docs #
2728
########

docs/source/changelog.rst

+6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Changelog
66
0.7.0 / [unreleased]
77
--------------------
88

9+
- `int` columns which contain `NULL` are now cast to `float`, rather than
10+
`object` type. (:issue:`174`)
11+
- `DATE`, `DATETIME` and `TIMESTAMP` columns are now parsed as pandas' `timestamp`
12+
objects (:issue:`224`)
913
- Add :class:`pandas_gbq.Context` to cache credentials in-memory, across
1014
calls to ``read_gbq`` and ``to_gbq``. (:issue:`198`, :issue:`208`)
1115
- Fast queries now do not log above ``DEBUG`` level. (:issue:`204`)
@@ -20,6 +24,8 @@ Internal changes
2024
~~~~~~~~~~~~~~~~
2125

2226
- Avoid listing datasets and tables in system tests. (:issue:`215`)
27+
- Improved performance from eliminating some duplicative parsing steps
28+
(:issue:`224`)
2329

2430
.. _changelog-0.6.1:
2531

noxfile.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,7 @@ def test_latest_deps(session, python=latest_python):
7777
@nox.session
7878
def lint(session, python=latest_python):
7979
session.install("black")
80-
session.run(
81-
"black",
82-
"--check",
83-
"--exclude",
84-
"(\.git|\.hg|\.mypy_cache|\.tox|\.nox|\.venv|_build|buck-out|build|dist)",
85-
".",
86-
)
80+
session.run("black", "--check", ".")
8781

8882

8983
@nox.session

pandas_gbq/gbq.py

+21-17
Original file line numberDiff line numberDiff line change
@@ -577,24 +577,41 @@ def _parse_schema(schema_fields):
577577
# see:
578578
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
579579
# #missing-data-casting-rules-and-indexing
580-
dtype_map = {"FLOAT": np.dtype(float), "TIMESTAMP": "M8[ns]"}
580+
dtype_map = {
581+
"FLOAT": np.dtype(float),
582+
"TIMESTAMP": "datetime64[ns]",
583+
"TIME": "datetime64[ns]",
584+
"DATE": "datetime64[ns]",
585+
"DATETIME": "datetime64[ns]",
586+
"BOOLEAN": bool,
587+
"INTEGER": np.int64,
588+
}
581589

582590
for field in schema_fields:
583591
name = str(field["name"])
584592
if field["mode"].upper() == "REPEATED":
585593
yield name, object
586594
else:
587-
dtype = dtype_map.get(field["type"].upper(), object)
595+
dtype = dtype_map.get(field["type"].upper())
588596
yield name, dtype
589597

590598

591599
def _parse_data(schema, rows):
592600

593601
column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
594-
595602
df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
603+
596604
for column in df:
597-
df[column] = df[column].astype(column_dtypes[column])
605+
dtype = column_dtypes[column]
606+
null_safe = (
607+
df[column].notnull().all()
608+
or dtype == float
609+
or dtype == "datetime64[ns]"
610+
)
611+
if dtype and null_safe:
612+
df[column] = df[column].astype(
613+
column_dtypes[column], errors="ignore"
614+
)
598615
return df
599616

600617

@@ -747,19 +764,6 @@ def read_gbq(
747764
"Column order does not match this DataFrame."
748765
)
749766

750-
# cast BOOLEAN and INTEGER columns from object to bool/int
751-
# if they dont have any nulls AND field mode is not repeated (i.e., array)
752-
type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
753-
for field in schema["fields"]:
754-
if (
755-
field["type"].upper() in type_map
756-
and final_df[field["name"]].notnull().all()
757-
and field["mode"].lower() != "repeated"
758-
):
759-
final_df[field["name"]] = final_df[field["name"]].astype(
760-
type_map[field["type"].upper()]
761-
)
762-
763767
connector.log_elapsed_seconds(
764768
"Total time taken",
765769
datetime.now().strftime("s.\nFinished at %Y-%m-%d %H:%M:%S."),

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ exclude = '''
44
versioneer.py
55
| _version.py
66
| docs
7+
| .nox
78
'''

tests/system/test_gbq.py

+27-21
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# -*- coding: utf-8 -*-
22

33
import sys
4-
from datetime import datetime
54
import uuid
5+
from datetime import datetime
66

77
import numpy as np
88
import pandas.util.testing as tm
@@ -200,9 +200,7 @@ def test_should_properly_handle_nullable_integers(self, project_id):
200200
private_key=self.credentials,
201201
dialect="legacy",
202202
)
203-
tm.assert_frame_equal(
204-
df, DataFrame({"nullable_integer": [1, None]}).astype(object)
205-
)
203+
tm.assert_frame_equal(df, DataFrame({"nullable_integer": [1, None]}))
206204

207205
def test_should_properly_handle_valid_longs(self, project_id):
208206
query = "SELECT 1 << 62 AS valid_long"
@@ -225,7 +223,7 @@ def test_should_properly_handle_nullable_longs(self, project_id):
225223
dialect="legacy",
226224
)
227225
tm.assert_frame_equal(
228-
df, DataFrame({"nullable_long": [1 << 62, None]}).astype(object)
226+
df, DataFrame({"nullable_long": [1 << 62, None]})
229227
)
230228

231229
def test_should_properly_handle_null_integers(self, project_id):
@@ -338,35 +336,43 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
338336
),
339337
)
340338

341-
def test_should_properly_handle_null_timestamp(self, project_id):
342-
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
343-
df = gbq.read_gbq(
344-
query,
345-
project_id=project_id,
346-
private_key=self.credentials,
347-
dialect="legacy",
348-
)
349-
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
339+
@pytest.mark.parametrize(
340+
"expression, type_",
341+
[
342+
("current_date()", "<M8[ns]"),
343+
("current_timestamp()", "<M8[ns]"),
344+
("current_datetime()", "<M8[ns]"),
345+
("TRUE", bool),
346+
("FALSE", bool),
347+
],
348+
)
349+
def test_return_correct_types(self, project_id, expression, type_):
350+
"""
351+
All type checks can be added to this function using additional
352+
parameters, rather than creating additional functions.
353+
We can consolidate the existing functions here in time
350354
351-
def test_should_properly_handle_true_boolean(self, project_id):
352-
query = "SELECT BOOLEAN(TRUE) AS true_boolean"
355+
TODO: time doesn't currently parse
356+
("time(12,30,00)", "<M8[ns]"),
357+
"""
358+
query = "SELECT {} AS _".format(expression)
353359
df = gbq.read_gbq(
354360
query,
355361
project_id=project_id,
356362
private_key=self.credentials,
357-
dialect="legacy",
363+
dialect="standard",
358364
)
359-
tm.assert_frame_equal(df, DataFrame({"true_boolean": [True]}))
365+
assert df["_"].dtype == type_
360366

361-
def test_should_properly_handle_false_boolean(self, project_id):
362-
query = "SELECT BOOLEAN(FALSE) AS false_boolean"
367+
def test_should_properly_handle_null_timestamp(self, project_id):
368+
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
363369
df = gbq.read_gbq(
364370
query,
365371
project_id=project_id,
366372
private_key=self.credentials,
367373
dialect="legacy",
368374
)
369-
tm.assert_frame_equal(df, DataFrame({"false_boolean": [False]}))
375+
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
370376

371377
def test_should_properly_handle_null_boolean(self, project_id):
372378
query = "SELECT BOOLEAN(NULL) AS null_boolean"

tests/unit/test_schema.py

-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
import datetime
32

43
import pandas

0 commit comments

Comments (0)