Commit 008791d

Supply expected dtypes to to_dataframe()
1 parent 78ffa8e commit 008791d

5 files changed: +36 −51 lines changed
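For context on the diffs below: with dtypes supplied to to_dataframe(), query results should come back from read_gbq() with the expected column dtypes rather than relying on post-hoc pandas inference. A minimal caller-side sketch, assuming this commit's pandas-gbq and an already-configured default project (the query itself is hypothetical):

import pandas_gbq

# Hypothetical query; project and credentials setup omitted.
df = pandas_gbq.read_gbq(
    "SELECT CURRENT_TIMESTAMP() AS created_at, 1.5 AS score",
    dialect="standard",
)

# Per the dtype_map in pandas_gbq/gbq.py below, TIMESTAMP columns arrive as
# timezone-aware datetime64[ns, UTC] and FLOAT columns as float64.
print(df.dtypes)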

benchmark/read_gbq_large_results.py

Lines changed: 2 additions & 1 deletion
@@ -5,4 +5,5 @@
 # result sets.
 df = pandas_gbq.read_gbq(
     "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
-    dialect="standard")
+    dialect="standard",
+)

benchmark/read_gbq_small_results.py

Lines changed: 2 additions & 1 deletion
@@ -4,4 +4,5 @@
 # Select a few KB worth of data, to time downloading small result sets.
 df = pandas_gbq.read_gbq(
     "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
-    dialect="standard")
+    dialect="standard",
+)

pandas_gbq/gbq.py

Lines changed: 17 additions & 9 deletions
@@ -480,7 +480,10 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        df = rows_iter.to_dataframe()
+
+        schema_fields = [field.to_api_repr() for field in rows_iter.schema]
+        dtypes = _bqschema_to_dtypes(schema_fields)
+        df = rows_iter.to_dataframe(dtypes=dtypes)
         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
         return df
 
@@ -630,27 +633,32 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)
 
 
-def _parse_schema(schema_fields):
+def _bqschema_to_dtypes(schema_fields):
+    # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
+    # default dtype choice.
+    #
     # see:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
         "FLOAT": np.dtype(float),
-        "TIMESTAMP": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns, UTC]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
-        "BOOLEAN": bool,
-        "INTEGER": np.int64,
     }
 
+    dtypes = {}
     for field in schema_fields:
         name = str(field["name"])
         if field["mode"].upper() == "REPEATED":
-            yield name, object
-        else:
-            dtype = dtype_map.get(field["type"].upper())
-            yield name, dtype
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            dtypes[name] = dtype
+
+    return dtypes
 
 
 def read_gbq(
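A minimal sketch of how the renamed helper behaves, using a hypothetical schema in the field.to_api_repr() dict form that run_query() now feeds it; only types with a null-safe dtype make it into the mapping, and REPEATED fields are skipped:

from pandas_gbq import gbq

# Hypothetical schema fields, shaped like field.to_api_repr() output.
schema_fields = [
    {"name": "name", "type": "STRING", "mode": "NULLABLE"},
    {"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
    {"name": "created", "type": "TIMESTAMP", "mode": "NULLABLE"},
    {"name": "tags", "type": "STRING", "mode": "REPEATED"},
]

dtypes = gbq._bqschema_to_dtypes(schema_fields)

# STRING has no dtype_map entry, INTEGER was dropped because int64 cannot
# represent NULL, and the REPEATED field is skipped, so only TIMESTAMP remains.
assert dtypes == {"created": "datetime64[ns, UTC]"}

# run_query() then forwards this mapping to the row iterator:
# df = rows_iter.to_dataframe(dtypes=dtypes)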

tests/system/test_gbq.py

Lines changed: 0 additions & 8 deletions
@@ -138,14 +138,6 @@ def test_should_be_able_to_get_a_bigquery_client(self, gbq_connector):
         bigquery_client = gbq_connector.get_client()
         assert bigquery_client is not None
 
-    def test_should_be_able_to_get_schema_from_query(self, gbq_connector):
-        schema, pages = gbq_connector.run_query("SELECT 1")
-        assert schema is not None
-
-    def test_should_be_able_to_get_results_from_query(self, gbq_connector):
-        schema, pages = gbq_connector.run_query("SELECT 1")
-        assert pages is not None
-
 
 def test_should_read(project, credentials):
     query = 'SELECT "PI" AS valid_string'

tests/unit/test_gbq.py

Lines changed: 15 additions & 32 deletions
@@ -2,6 +2,7 @@
 
 import pandas.util.testing as tm
 import pytest
+import numpy
 from pandas import DataFrame
 from pandas.compat.numpy import np_datetime64_compat
 
@@ -65,26 +66,23 @@ def no_auth(monkeypatch):
 
 
 @pytest.mark.parametrize(
-    ("input", "type_", "expected"),
+    ("type_", "expected"),
     [
-        (1, "INTEGER", int(1)),
-        (1, "FLOAT", float(1)),
-        pytest.param("false", "BOOLEAN", False, marks=pytest.mark.xfail),
-        pytest.param(
-            "0e9",
-            "TIMESTAMP",
-            np_datetime64_compat("1970-01-01T00:00:00Z"),
-            marks=pytest.mark.xfail,
-        ),
-        ("STRING", "STRING", "STRING"),
+        ("INTEGER", None),  # Can't handle NULL
+        ("BOOLEAN", None),  # Can't handle NULL
+        ("FLOAT", numpy.dtype(float)),
+        ("TIMESTAMP", "datetime64[ns, UTC]"),
+        ("DATETIME", "datetime64[ns]"),
     ],
 )
-def test_should_return_bigquery_correctly_typed(input, type_, expected):
-    result = gbq._parse_data(
-        dict(fields=[dict(name="x", type=type_, mode="NULLABLE")]),
-        rows=[[input]],
-    ).iloc[0, 0]
-    assert result == expected
+def test_should_return_bigquery_correctly_typed(type_, expected):
+    result = gbq._bqschema_to_dtypes(
+        [dict(name="x", type=type_, mode="NULLABLE")]
+    )
+    if not expected:
+        assert result == {}
+    else:
+        assert result == {"x": expected}
 
 
 def test_to_gbq_should_fail_if_invalid_table_name_passed():
@@ -264,21 +262,6 @@ def test_read_gbq_with_inferred_project_id(monkeypatch):
     assert df is not None
 
 
-def test_that_parse_data_works_properly():
-    from google.cloud.bigquery.table import Row
-
-    test_schema = {
-        "fields": [{"mode": "NULLABLE", "name": "column_x", "type": "STRING"}]
-    }
-    field_to_index = {"column_x": 0}
-    values = ("row_value",)
-    test_page = [Row(values, field_to_index)]
-
-    test_output = gbq._parse_data(test_schema, test_page)
-    correct_output = DataFrame({"column_x": ["row_value"]})
-    tm.assert_frame_equal(test_output, correct_output)
-
-
 def test_read_gbq_with_invalid_private_key_json_should_fail():
     with pytest.raises(pandas_gbq.exceptions.InvalidPrivateKeyFormat):
         gbq.read_gbq(
