Commit f26e390

CLN: Use to_dataframe to download query results.
This allows us to remove the logic for parsing the schema and aligns pandas-gbq with google-cloud-bigquery.
1 parent f729a44 commit f26e390
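For context, `RowIterator.to_dataframe()` is the download path that google-cloud-bigquery itself exposes on query results. A minimal sketch of that pattern, separate from this commit (the query is illustrative, and default application credentials are assumed):

```python
from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query(
    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`"
)
rows_iter = query_job.result()  # RowIterator over the finished query

# to_dataframe() builds the pandas DataFrame straight from the iterator,
# column names and dtypes included, so callers no longer need to parse
# the schema themselves.
df = rows_iter.to_dataframe()
print("Got {} rows.".format(rows_iter.total_rows))
```

Delegating the conversion to the client library keeps schema parsing and dtype coercion in one place.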

4 files changed: +36 -32 lines changed

benchmark/README.md

Lines changed: 16 additions & 0 deletions
```diff
@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
```
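The commands above profile each script end to end. A purely illustrative alternative, not part of this commit, is to drive the profiler from inside Python with the standard-library `cProfile` and `pstats` modules:

```python
import cProfile
import pstats

import pandas_gbq

profiler = cProfile.Profile()
profiler.enable()
# Same small query as read_gbq_small_results.py; run it once beforehand so
# BigQuery serves cached results and query execution time drops out.
df = pandas_gbq.read_gbq(
    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
    dialect="standard",
)
profiler.disable()

# Show the ten most expensive calls by cumulative time, mirroring
# --sort=cumtime on the command line.
pstats.Stats(profiler).sort_stats("cumtime").print_stats(10)
```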

benchmark/read_gbq_large_results.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -0,0 +1,8 @@
+
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard")
```

benchmark/read_gbq_small_results.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -0,0 +1,7 @@
+
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard")
```

pandas_gbq/gbq.py

Lines changed: 5 additions & 32 deletions
```diff
@@ -1,11 +1,9 @@
 import logging
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -482,15 +480,9 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
-
-        logger.debug("Got {} rows.\n".format(total_rows))
-
-        return schema, result_rows
+        df = rows_iter.to_dataframe()
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -661,25 +653,6 @@ def _parse_schema(schema_fields):
             yield name, dtype
 
 
-def _parse_data(schema, rows):
-
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
-
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
-        )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
-    return df
-
-
 def read_gbq(
     query,
     project_id=None,
@@ -825,8 +798,8 @@ def read_gbq(
         credentials=credentials,
        private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:
```
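With `_parse_data` removed, dtype mapping is now delegated to `to_dataframe()`. A quick sanity check, illustrative only and not part of the commit (exact dtypes depend on the installed google-cloud-bigquery version):

```python
import pandas_gbq

df = pandas_gbq.read_gbq(
    "SELECT 1 AS int_col, 2.5 AS float_col, 'a' AS str_col",
    dialect="standard",
)
# Roughly expected: int64, float64, object. Integer columns containing
# NULLs may come back as float64, since that is how to_dataframe()
# represents missing values.
print(df.dtypes)
```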
