
Commit 6294f7a

BUG: Adapt to breaking change in google-cloud-bigquery 0.32.0.dev1 (#152)
* BUG: Update pandas-gbq to work with the latest version of google-cloud-bigquery. There was a breaking change in 0.32.0.dev1 that changed the way configuration for the query job gets loaded. That release also added the 'description' field to the schema resource, so this change updates the schema comparison logic to account for it. The installed google-cloud-bigquery version is detected for backwards compatibility.
* DOC: Add the verbose deprecation to the changelog.
* TST: The MASTER build in CI also builds with google-cloud-bigquery at MASTER.
1 parent 9666965 commit 6294f7a
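
For reference, the breaking change is in how the job configuration resource is handed to bigquery.QueryJobConfig.from_api_repr: the old code path passed only the 'query' sub-resource, while 0.32.0.dev1 and later expect the full configuration (including top-level keys such as 'labels'). A minimal sketch of the two call shapes; the resource dict here is illustrative, not taken from this commit:

from google.cloud import bigquery

# Illustrative job configuration resource.
resource = {'query': {'useLegacySql': False}, 'labels': {'key': 'value'}}

# google-cloud-bigquery < 0.32.0.dev1: only the 'query' sub-resource is loaded.
config = bigquery.QueryJobConfig.from_api_repr(resource['query'])

# google-cloud-bigquery >= 0.32.0.dev1: the full job configuration is expected.
config = bigquery.QueryJobConfig.from_api_repr(resource)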

8 files changed: 163 additions and 60 deletions


.travis.yml

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@ install:
       conda install -q numpy pytz python-dateutil;
       PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com";
       pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS pandas;
+      pip install -e 'git+https://github.com/GoogleCloudPlatform/google-cloud-python.git#egg=version_subpkg&subdirectory=api_core';
+      pip install -e 'git+https://github.com/GoogleCloudPlatform/google-cloud-python.git#egg=version_subpkg&subdirectory=core';
+      pip install -e 'git+https://github.com/GoogleCloudPlatform/google-cloud-python.git#egg=version_subpkg&subdirectory=bigquery';
     else
       conda install -q pandas=$PANDAS;
     fi

ci/requirements-3.6-MASTER.pip

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 google-auth
 google-auth-oauthlib
 mock
-google-cloud-bigquery

docs/source/changelog.rst

Lines changed: 4 additions & 1 deletion
@@ -1,9 +1,12 @@
 Changelog
 =========
 
-0.3.2 / [TBD]
+0.4.0 / [TBD]
 ------------------
 - Fix bug with querying for an array of floats (:issue:`123`)
+- Fix bug with integer columns on Windows. Explicitly use 64-bit integers when converting from BigQuery types. (:issue:`119`)
+- Fix bug caused by a breaking change in the way ``google-cloud-python`` version 0.32.0+ handles the additional configuration argument to ``read_gbq``. (:issue:`152`)
+- **Deprecates** the ``verbose`` parameter. Messages use the logging module instead of printing progress directly to standard output. (:issue:`12`)
 
 0.3.1 / 2018-02-13
 ------------------
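
Because the verbose parameter is deprecated in favor of the logging module, progress messages can be surfaced by configuring the pandas_gbq logger instead. A minimal sketch; the handler setup here is illustrative, not part of this commit:

import logging
import sys

# pandas_gbq modules log via logging.getLogger(__name__), so a handler on the
# 'pandas_gbq' parent logger captures their progress messages.
logger = logging.getLogger('pandas_gbq')
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))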

pandas_gbq/_query.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+
+import pkg_resources
+from google.cloud import bigquery
+
+
+# Version with query config breaking change.
+BIGQUERY_CONFIG_VERSION = pkg_resources.parse_version('0.32.0.dev1')
+
+
+def query_config_old_version(resource):
+    # Verify that we got a query resource. In newer versions of
+    # google-cloud-bigquery enough of the configuration is passed on to the
+    # backend that we can expect a backend validation error instead.
+    if len(resource) != 1:
+        raise ValueError("Only one job type must be specified, but "
+                         "given {}".format(','.join(resource.keys())))
+    if 'query' not in resource:
+        raise ValueError("Only 'query' job type is supported")
+    return bigquery.QueryJobConfig.from_api_repr(resource['query'])
+
+
+def query_config(resource, installed_version):
+    if installed_version < BIGQUERY_CONFIG_VERSION:
+        return query_config_old_version(resource)
+    return bigquery.QueryJobConfig.from_api_repr(resource)
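
A rough usage sketch of the new helper; the resource dict is illustrative, and pandas_gbq.gbq passes its module-level BIGQUERY_INSTALLED_VERSION (a parsed pkg_resources version) as the second argument:

import pkg_resources

from pandas_gbq import _query

resource = {'query': {'useLegacySql': False}}

# Installed versions older than 0.32.0.dev1 take the legacy, query-only path.
_query.query_config(resource, pkg_resources.parse_version('0.29.0'))

# 0.32.0.dev1 and later load the full job configuration resource directly.
_query.query_config(resource, pkg_resources.parse_version('0.32.0'))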

pandas_gbq/gbq.py

Lines changed: 41 additions & 52 deletions
@@ -4,7 +4,6 @@
 import time
 import warnings
 from datetime import datetime
-from distutils.version import StrictVersion
 from time import sleep
 
 import numpy as np
@@ -14,7 +13,11 @@
 logger = logging.getLogger(__name__)
 
 
+BIGQUERY_INSTALLED_VERSION = None
+
+
 def _check_google_client_version():
+    global BIGQUERY_INSTALLED_VERSION
 
     try:
         import pkg_resources
@@ -23,17 +26,15 @@ def _check_google_client_version():
         raise ImportError('Could not import pkg_resources (setuptools).')
 
     # https://github.com/GoogleCloudPlatform/google-cloud-python/blob/master/bigquery/CHANGELOG.md
-    bigquery_client_minimum_version = '0.29.0'
-
-    _BIGQUERY_CLIENT_VERSION = pkg_resources.get_distribution(
-        'google-cloud-bigquery').version
+    bigquery_minimum_version = pkg_resources.parse_version('0.29.0')
+    BIGQUERY_INSTALLED_VERSION = pkg_resources.get_distribution(
+        'google-cloud-bigquery').parsed_version
 
-    if (StrictVersion(_BIGQUERY_CLIENT_VERSION) <
-            StrictVersion(bigquery_client_minimum_version)):
-        raise ImportError('pandas-gbq requires google-cloud-bigquery >= {0}, '
-                          'current version {1}'
-                          .format(bigquery_client_minimum_version,
-                                  _BIGQUERY_CLIENT_VERSION))
+    if BIGQUERY_INSTALLED_VERSION < bigquery_minimum_version:
+        raise ImportError(
+            'pandas-gbq requires google-cloud-bigquery >= {0}, '
+            'current version {1}'.format(
+                bigquery_minimum_version, BIGQUERY_INSTALLED_VERSION))
 
 
 def _test_google_api_imports():
@@ -447,8 +448,8 @@ def process_http_error(ex):
 
     def run_query(self, query, **kwargs):
         from google.auth.exceptions import RefreshError
-        from google.cloud.bigquery import QueryJobConfig
         from concurrent.futures import TimeoutError
+        from pandas_gbq import _query
 
         job_config = {
             'query': {
@@ -459,29 +460,23 @@ def run_query(self, query, **kwargs):
         }
         config = kwargs.get('configuration')
         if config is not None:
-            if len(config) != 1:
-                raise ValueError("Only one job type must be specified, but "
-                                 "given {}".format(','.join(config.keys())))
-            if 'query' in config:
-                if 'query' in config['query']:
-                    if query is not None:
-                        raise ValueError("Query statement can't be specified "
-                                         "inside config while it is specified "
-                                         "as parameter")
-                    query = config['query']['query']
-                    del config['query']['query']
-
-                job_config['query'].update(config['query'])
-            else:
-                raise ValueError("Only 'query' job type is supported")
+            job_config.update(config)
+
+            if 'query' in config and 'query' in config['query']:
+                if query is not None:
+                    raise ValueError("Query statement can't be specified "
+                                     "inside config while it is specified "
+                                     "as parameter")
+                query = config['query'].pop('query')
 
         self._start_timer()
-        try:
 
+        try:
             logger.info('Requesting query... ')
             query_reply = self.client.query(
                 query,
-                job_config=QueryJobConfig.from_api_repr(job_config['query']))
+                job_config=_query.query_config(
+                    job_config, BIGQUERY_INSTALLED_VERSION))
             logger.info('ok.\nQuery running...')
         except (RefreshError, ValueError):
             if self.private_key:
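
For context, the configuration argument that feeds this code path is expected to describe a single 'query' job; the SQL may be given either as the query parameter or as configuration['query']['query'], but not both. An illustrative call, with placeholder project id and SQL:

import pandas_gbq

df = pandas_gbq.read_gbq(
    'SELECT 1',                       # placeholder SQL
    project_id='my-project',          # placeholder project id
    configuration={
        'query': {
            'useQueryCache': False,   # example query-job option
        },
    },
)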
@@ -598,6 +593,15 @@ def schema(self, dataset_id, table_id):
         except self.http_error as ex:
             self.process_http_error(ex)
 
+    def _clean_schema_fields(self, fields):
+        """Return a sanitized version of the schema for comparisons."""
+        fields_sorted = sorted(fields, key=lambda field: field['name'])
+        # Ignore mode and description when comparing schemas.
+        return [
+            {'name': field['name'], 'type': field['type']}
+            for field in fields_sorted
+        ]
+
     def verify_schema(self, dataset_id, table_id, schema):
         """Indicate whether schemas match exactly
 
@@ -621,17 +625,9 @@ def verify_schema(self, dataset_id, table_id, schema):
             Whether the schemas match
         """
 
-        fields_remote = sorted(self.schema(dataset_id, table_id),
-                               key=lambda x: x['name'])
-        fields_local = sorted(schema['fields'], key=lambda x: x['name'])
-
-        # Ignore mode when comparing schemas.
-        for field in fields_local:
-            if 'mode' in field:
-                del field['mode']
-        for field in fields_remote:
-            if 'mode' in field:
-                del field['mode']
+        fields_remote = self._clean_schema_fields(
+            self.schema(dataset_id, table_id))
+        fields_local = self._clean_schema_fields(schema['fields'])
 
         return fields_remote == fields_local

@@ -658,16 +654,9 @@ def schema_is_subset(self, dataset_id, table_id, schema):
             Whether the passed schema is a subset
         """
 
-        fields_remote = self.schema(dataset_id, table_id)
-        fields_local = schema['fields']
-
-        # Ignore mode when comparing schemas.
-        for field in fields_local:
-            if 'mode' in field:
-                del field['mode']
-        for field in fields_remote:
-            if 'mode' in field:
-                del field['mode']
+        fields_remote = self._clean_schema_fields(
+            self.schema(dataset_id, table_id))
+        fields_local = self._clean_schema_fields(schema['fields'])
 
         return all(field in fields_remote for field in fields_local)
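
Both comparisons above now rely on _clean_schema_fields, which sorts fields by name and keeps only 'name' and 'type', so 'mode' and the newly returned 'description' field no longer affect the result. A standalone sketch of the same normalization, with made-up field dicts:

def clean_schema_fields(fields):
    # Keep only 'name' and 'type', sorted by name; 'mode' and 'description'
    # are ignored for comparison purposes.
    return [
        {'name': field['name'], 'type': field['type']}
        for field in sorted(fields, key=lambda field: field['name'])
    ]

remote = [
    {'name': 'B', 'type': 'FLOAT', 'mode': 'NULLABLE', 'description': None},
    {'name': 'A', 'type': 'STRING', 'mode': 'NULLABLE', 'description': None},
]
local = [{'name': 'A', 'type': 'STRING'}, {'name': 'B', 'type': 'FLOAT'}]

assert clean_schema_fields(remote) == clean_schema_fields(local)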

@@ -709,7 +698,7 @@ def _parse_data(schema, rows):
     col_names = [str(field['name']) for field in fields]
     col_dtypes = [
         dtype_map.get(field['type'].upper(), object)
-        if field['mode'] != 'repeated'
+        if field['mode'].lower() != 'repeated'
         else object
         for field in fields
     ]
@@ -847,7 +836,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
     for field in schema['fields']:
         if field['type'].upper() in type_map and \
                 final_df[field['name']].notnull().all() and \
-                field['mode'] != 'repeated':
+                field['mode'].lower() != 'repeated':
             final_df[field['name']] = \
                 final_df[field['name']].astype(type_map[field['type'].upper()])

pandas_gbq/tests/test__query.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+
+import pkg_resources
+
+import mock
+
+
+@mock.patch('google.cloud.bigquery.QueryJobConfig')
+def test_query_config_w_old_bq_version(mock_config):
+    from pandas_gbq._query import query_config
+
+    old_version = pkg_resources.parse_version('0.29.0')
+    query_config({'query': {'useLegacySql': False}}, old_version)
+    mock_config.from_api_repr.assert_called_once_with({'useLegacySql': False})
+
+
+@mock.patch('google.cloud.bigquery.QueryJobConfig')
+def test_query_config_w_dev_bq_version(mock_config):
+    from pandas_gbq._query import query_config
+
+    dev_version = pkg_resources.parse_version('0.32.0.dev1')
+    query_config(
+        {
+            'query': {
+                'useLegacySql': False,
+            },
+            'labels': {'key': 'value'},
+        },
+        dev_version)
+    mock_config.from_api_repr.assert_called_once_with(
+        {
+            'query': {
+                'useLegacySql': False,
+            },
+            'labels': {'key': 'value'},
+        })
+
+
+@mock.patch('google.cloud.bigquery.QueryJobConfig')
+def test_query_config_w_new_bq_version(mock_config):
+    from pandas_gbq._query import query_config
+
+    dev_version = pkg_resources.parse_version('1.0.0')
+    query_config(
+        {
+            'query': {
+                'useLegacySql': False,
+            },
+            'labels': {'key': 'value'},
+        },
+        dev_version)
+    mock_config.from_api_repr.assert_called_once_with(
+        {
+            'query': {
+                'useLegacySql': False,
+            },
+            'labels': {'key': 'value'},
+        })
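
These tests mock google.cloud.bigquery.QueryJobConfig, so they exercise the version dispatch without network access or BigQuery credentials. Assuming a pytest-based run, as is typical for this test suite, they can be executed on their own:

import pytest

# Run just the new unit tests for the query config helper.
pytest.main(['-v', 'pandas_gbq/tests/test__query.py'])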

pandas_gbq/tests/test_gbq.py

Lines changed: 32 additions & 6 deletions
@@ -1266,16 +1266,42 @@ def test_retrieve_schema(self):
         test_id = "15"
         test_schema = {
             'fields': [
-                {'name': 'A', 'type': 'FLOAT', 'mode': 'NULLABLE'},
-                {'name': 'B', 'type': 'FLOAT', 'mode': 'NULLABLE'},
-                {'name': 'C', 'type': 'STRING', 'mode': 'NULLABLE'},
-                {'name': 'D', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}
+                {
+                    'name': 'A',
+                    'type': 'FLOAT',
+                    'mode': 'NULLABLE',
+                    'description': None,
+                },
+                {
+                    'name': 'B',
+                    'type': 'FLOAT',
+                    'mode': 'NULLABLE',
+                    'description': None,
+                },
+                {
+                    'name': 'C',
+                    'type': 'STRING',
+                    'mode': 'NULLABLE',
+                    'description': None,
+                },
+                {
+                    'name': 'D',
+                    'type': 'TIMESTAMP',
+                    'mode': 'NULLABLE',
+                    'description': None,
+                },
             ]
         }
 
         self.table.create(TABLE_ID + test_id, test_schema)
-        actual = self.sut.schema(self.dataset_prefix + "1", TABLE_ID + test_id)
-        expected = test_schema['fields']
+        actual = self.sut._clean_schema_fields(
+            self.sut.schema(self.dataset_prefix + "1", TABLE_ID + test_id))
+        expected = [
+            {'name': 'A', 'type': 'FLOAT'},
+            {'name': 'B', 'type': 'FLOAT'},
+            {'name': 'C', 'type': 'STRING'},
+            {'name': 'D', 'type': 'TIMESTAMP'},
+        ]
         assert expected == actual, 'Expected schema used to create table'
 
     def test_schema_is_subset_passes_if_subset(self):

setup.py

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ def readme():
 
 
 INSTALL_REQUIRES = [
+    'setuptools',
     'pandas',
     'google-auth',
     'google-auth-oauthlib',
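
The new 'setuptools' entry makes the existing pkg_resources dependency explicit: pkg_resources ships with setuptools and now also drives the version gate added in this commit. A minimal sketch of that check, assuming google-cloud-bigquery is installed:

import pkg_resources  # provided by the setuptools distribution

installed = pkg_resources.get_distribution('google-cloud-bigquery').parsed_version
if installed < pkg_resources.parse_version('0.32.0.dev1'):
    print('using the pre-0.32.0 query config handling')
else:
    print('passing the full job configuration resource')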
