Commit f26e390

CLN: Use to_dataframe to download query results.
This allows us to remove the logic for parsing the schema and aligns pandas-gbq with google-cloud-bigquery.
1 parent f729a44 commit f26e390
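For context, `RowIterator.to_dataframe()` is the download path that google-cloud-bigquery itself exposes on query results. A minimal sketch of that pattern, separate from this commit (the query is illustrative, and default application credentials are assumed):

```python
from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query(
    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`"
)
rows_iter = query_job.result()  # RowIterator over the finished query

# to_dataframe() builds the pandas DataFrame straight from the iterator,
# column names and dtypes included, so callers no longer need to parse
# the schema themselves.
df = rows_iter.to_dataframe()
print("Got {} rows.".format(rows_iter.total_rows))
```

Delegating the conversion to the client library keeps schema parsing and dtype coercion in one place.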

4 files changed: +36 -32 lines changed

benchmark/README.md

Lines changed: 16 additions & 0 deletions
```diff
@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
```
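The commands above profile each script end to end. A purely illustrative alternative, not part of this commit, is to drive the profiler from inside Python with the standard-library `cProfile` and `pstats` modules:

```python
import cProfile
import pstats

import pandas_gbq

profiler = cProfile.Profile()
profiler.enable()
# Same small query as read_gbq_small_results.py; run it once beforehand so
# BigQuery serves cached results and query execution time drops out.
df = pandas_gbq.read_gbq(
    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
    dialect="standard",
)
profiler.disable()

# Show the ten most expensive calls by cumulative time, mirroring
# --sort=cumtime on the command line.
pstats.Stats(profiler).sort_stats("cumtime").print_stats(10)
```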

benchmark/read_gbq_large_results.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -0,0 +1,8 @@
+
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard")
```

benchmark/read_gbq_small_results.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -0,0 +1,7 @@
+
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard")
```

pandas_gbq/gbq.py

Lines changed: 5 additions & 32 deletions
```diff
@@ -1,11 +1,9 @@
 import logging
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -482,15 +480,9 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
-
-        logger.debug("Got {} rows.\n".format(total_rows))
-
-        return schema, result_rows
+        df = rows_iter.to_dataframe()
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -661,25 +653,6 @@ def _parse_schema(schema_fields):
             yield name, dtype
 
 
-def _parse_data(schema, rows):
-
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
-
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
-        )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
-    return df
-
-
 def read_gbq(
     query,
     project_id=None,
@@ -825,8 +798,8 @@ def read_gbq(
         credentials=credentials,
        private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:
```
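With `_parse_data` removed, dtype mapping is now delegated to `to_dataframe()`. A quick sanity check, illustrative only and not part of the commit (exact dtypes depend on the installed google-cloud-bigquery version):

```python
import pandas_gbq

df = pandas_gbq.read_gbq(
    "SELECT 1 AS int_col, 2.5 AS float_col, 'a' AS str_col",
    dialect="standard",
)
# Roughly expected: int64, float64, object. Integer columns containing
# NULLs may come back as float64, since that is how to_dataframe()
# represents missing values.
print(df.dtypes)
```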
