
Commit 8f19fdc

ENH: log rather than print (#18)
* rebase
* Changes from feedback
* remove verbose from docstrings and internal classes
* WIP
* update for newer code
* remove verbose for private class
* updates if cache hit
* couple refinements
* remove commented out code
* show logs in pytest
* docs
1 parent 09e4237 commit 8f19fdc

File tree

5 files changed: +71 -76 lines

.travis.yml
docs/source/intro.rst
docs/source/reading.rst
docs/source/writing.rst
pandas_gbq/gbq.py

.travis.yml

Lines changed: 1 addition & 1 deletion
@@ -42,6 +42,6 @@ install:
   - python setup.py install

 script:
-  - pytest -s -v --cov=pandas_gbq --cov-report xml:/tmp/pytest-cov.xml pandas_gbq
+  - pytest -v --cov=pandas_gbq --cov-report xml:/tmp/pytest-cov.xml pandas_gbq
   - if [[ $COVERAGE == 'true' ]]; then codecov ; fi
   - if [[ $LINT == 'true' ]]; then flake8 pandas_gbq -v ; fi
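A side note on the dropped `-s`: once output goes through the logging module instead of print(), tests can assert on it with pytest's built-in caplog fixture rather than relying on stdout capture being disabled. A hypothetical test sketch (not part of this commit; the logger name is assumed):

import logging

def test_row_count_is_logged(caplog):
    # caplog captures records emitted through the logging module, so the
    # suite no longer needs `pytest -s` to observe this output.
    caplog.set_level(logging.INFO, logger='pandas_gbq.gbq')
    logging.getLogger('pandas_gbq.gbq').info('Got 3 rows.')
    assert 'Got 3 rows.' in caplog.text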

docs/source/intro.rst

Lines changed: 19 additions & 2 deletions
@@ -26,6 +26,23 @@ While this trade-off works well for most cases, it breaks down for storing
 values greater than 2**53. Such values in BigQuery can represent identifiers
 and unnoticed precision lost for identifier is what we want to avoid.

+Logging
++++++++
+
+Because some requests take some time, this library will log its progress of
+longer queries. IPython & Jupyter by default attach a handler to the logger.
+If you're running in another process and want to see logs, or you want to see
+more verbose logs, you can do something like:
+
+.. code-block:: ipython
+
+    import logging
+    import sys
+    logger = logging.getLogger('pandas_gbq')
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(logging.StreamHandler(stream=sys.stdout))
+
+
 .. _authentication:

 Authentication
@@ -49,8 +66,8 @@ Additional information on service accounts can be found
 `here <https://developers.google.com/identity/protocols/OAuth2#serviceaccount>`__.

 Authentication via ``application default credentials`` is also possible, but only valid
-if the parameter ``private_key`` is not provided. This method requires that the
-credentials can be fetched from the development environment. Otherwise, the OAuth2
+if the parameter ``private_key`` is not provided. This method requires that the
+credentials can be fetched from the development environment. Otherwise, the OAuth2
 client-side authentication is used. Additional information can be found on
 `application default credentials <https://developers.google.com/identity/protocols/application-default-credentials>`__.
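The snippet added to intro.rst above is the minimal setup. A slightly fuller sketch of the same idea, assuming the 'pandas_gbq' logger name used in the docs (the formatter is an illustrative extra, not something this commit adds):

import logging
import sys

# Send pandas_gbq records to stdout; DEBUG also surfaces details such as
# cache hits and bytes processed/billed, which the library logs at DEBUG level.
logger = logging.getLogger('pandas_gbq')
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(logging.Formatter('%(levelname)s %(name)s: %(message)s'))
logger.addHandler(handler)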

docs/source/reading.rst

Lines changed: 0 additions & 4 deletions
@@ -45,10 +45,6 @@ For more information about query configuration parameters see
 You can find your project id in the `Google developers console <https://console.developers.google.com>`__.


-.. note::
-
-    You can toggle the verbose output via the ``verbose`` flag which defaults to ``True``.
-
 .. note::

     The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
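With the verbose note gone, the logging level is the knob that replaces it. A sketch of the rough equivalent of the old ``verbose=False`` when reading (the query and project id are placeholders):

import logging
import pandas_gbq

# Roughly the old verbose=False: hide the INFO-level progress messages.
logging.getLogger('pandas_gbq').setLevel(logging.WARNING)

df = pandas_gbq.read_gbq('SELECT 1 AS x', project_id='my-project')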

docs/source/writing.rst

Lines changed: 0 additions & 18 deletions
@@ -54,24 +54,6 @@ For example, the following writes ``df`` to a BigQuery table in batches of 10000

    to_gbq(df, 'my_dataset.my_table', projectid, chunksize=10000)

-You can also see the progress of your post via the ``verbose`` flag which defaults to ``True``.
-For example:
-
-.. code-block:: python
-
-   In [8]: to_gbq(df, 'my_dataset.my_table', projectid, chunksize=10000, verbose=True)
-
-   Streaming Insert is 10% Complete
-   Streaming Insert is 20% Complete
-   Streaming Insert is 30% Complete
-   Streaming Insert is 40% Complete
-   Streaming Insert is 50% Complete
-   Streaming Insert is 60% Complete
-   Streaming Insert is 70% Complete
-   Streaming Insert is 80% Complete
-   Streaming Insert is 90% Complete
-   Streaming Insert is 100% Complete
-
 .. note::

     If an error occurs while streaming data to BigQuery, see
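The removed progress printout has a logging counterpart: per the gbq.py diff below, load_data now emits "Load is N% Complete" at INFO level, so enabling INFO logging before the call shows comparable progress. A sketch, with the table name, project id and data as placeholders:

import logging

import pandas as pd
from pandas_gbq import to_gbq

# INFO level surfaces the per-chunk "Load is N% Complete" messages.
logging.basicConfig(level=logging.INFO)

df = pd.DataFrame({'num': range(30000)})
to_gbq(df, 'my_dataset.my_table', 'my-project', chunksize=10000)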

pandas_gbq/gbq.py

Lines changed: 51 additions & 51 deletions
@@ -1,6 +1,6 @@
 import json
+import logging
 import os
-import sys
 import time
 import warnings
 from datetime import datetime
@@ -11,6 +11,8 @@
 from pandas import DataFrame, compat
 from pandas.compat import lzip

+logger = logging.getLogger(__name__)
+

 def _check_google_client_version():

@@ -162,15 +164,14 @@ class TableCreationError(ValueError):
 class GbqConnector(object):
     scope = 'https://www.googleapis.com/auth/bigquery'

-    def __init__(self, project_id, reauth=False, verbose=False,
+    def __init__(self, project_id, reauth=False,
                  private_key=None, auth_local_webserver=False,
                  dialect='legacy'):
         from google.api_core.exceptions import GoogleAPIError
         from google.api_core.exceptions import ClientError
         self.http_error = (ClientError, GoogleAPIError)
         self.project_id = project_id
         self.reauth = reauth
-        self.verbose = verbose
         self.private_key = private_key
         self.auth_local_webserver = auth_local_webserver
         self.dialect = dialect
@@ -324,7 +325,7 @@ def save_user_account_credentials(self, credentials):
                 }
                 json.dump(credentials_json, credentials_file)
         except IOError:
-            self._print('Unable to save credentials.')
+            logger.warning('Unable to save credentials.')

     def get_user_account_credentials(self):
         """Gets user account credentials.
@@ -410,22 +411,17 @@ def get_service_account_credentials(self):
                 "Can be obtained from: https://console.developers.google."
                 "com/permissions/serviceaccounts")

-    def _print(self, msg, end='\n'):
-        if self.verbose:
-            sys.stdout.write(msg + end)
-            sys.stdout.flush()
-
     def _start_timer(self):
         self.start = time.time()

     def get_elapsed_seconds(self):
         return round(time.time() - self.start, 2)

-    def print_elapsed_seconds(self, prefix='Elapsed', postfix='s.',
-                              overlong=7):
+    def log_elapsed_seconds(self, prefix='Elapsed', postfix='s.',
+                            overlong=7):
         sec = self.get_elapsed_seconds()
         if sec > overlong:
-            self._print('{} {} {}'.format(prefix, sec, postfix))
+            logger.info('{} {} {}'.format(prefix, sec, postfix))

     # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
     @staticmethod
@@ -481,11 +477,12 @@ def run_query(self, query, **kwargs):

         self._start_timer()
         try:
-            self._print('Requesting query... ', end="")
+
+            logger.info('Requesting query... ')
             query_reply = self.client.query(
                 query,
                 job_config=QueryJobConfig.from_api_repr(job_config['query']))
-            self._print('ok.')
+            logger.info('ok.\nQuery running...')
         except (RefreshError, ValueError):
             if self.private_key:
                 raise AccessDenied(
@@ -498,10 +495,10 @@ def run_query(self, query, **kwargs):
             self.process_http_error(ex)

         job_id = query_reply.job_id
-        self._print('Job ID: %s\nQuery running...' % job_id)
+        logger.info('Job ID: %s\nQuery running...' % job_id)

         while query_reply.state != 'DONE':
-            self.print_elapsed_seconds(' Elapsed', 's. Waiting...')
+            self.log_elapsed_seconds(' Elapsed', 's. Waiting...')

             timeout_ms = job_config['query'].get('timeoutMs')
             if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
@@ -520,19 +517,16 @@ def run_query(self, query, **kwargs):
             except self.http_error as ex:
                 self.process_http_error(ex)

-        if self.verbose:
-            if query_reply.cache_hit:
-                self._print('Query done.\nCache hit.\n')
-            else:
-                bytes_processed = query_reply.total_bytes_processed or 0
-                bytes_billed = query_reply.total_bytes_billed or 0
-                self._print('Query done.\nProcessed: {} Billed: {}'.format(
-                    self.sizeof_fmt(bytes_processed),
-                    self.sizeof_fmt(bytes_billed)))
-                self._print('Standard price: ${:,.2f} USD\n'.format(
-                    bytes_billed * self.query_price_for_TB))
-
-            self._print('Retrieving results...')
+        if query_reply.cache_hit:
+            logger.debug('Query done.\nCache hit.\n')
+        else:
+            bytes_processed = query_reply.total_bytes_processed or 0
+            bytes_billed = query_reply.total_bytes_billed or 0
+            logger.debug('Query done.\nProcessed: {} Billed: {}'.format(
+                self.sizeof_fmt(bytes_processed),
+                self.sizeof_fmt(bytes_billed)))
+            logger.debug('Standard price: ${:,.2f} USD\n'.format(
+                bytes_billed * self.query_price_for_TB))

         try:
             rows_iter = query_reply.result()
@@ -546,8 +540,8 @@ def run_query(self, query, **kwargs):
                        for field in rows_iter.schema],
         }

-        # print basic query stats
-        self._print('Got {} rows.\n'.format(total_rows))
+        # log basic query stats
+        logger.info('Got {} rows.\n'.format(total_rows))

         return schema, result_rows

@@ -557,18 +551,18 @@ def load_data(
         from pandas_gbq import _load

         total_rows = len(dataframe)
-        self._print("\n\n")
+        logger.info("\n\n")

         try:
             for remaining_rows in _load.load_chunks(
                     self.client, dataframe, dataset_id, table_id,
                     chunksize=chunksize, schema=schema):
-                self._print("\rLoad is {0}% Complete".format(
+                logger.info("\rLoad is {0}% Complete".format(
                     ((total_rows - remaining_rows) * 100) / total_rows))
         except self.http_error as ex:
             self.process_http_error(ex)

-        self._print("\n")
+        logger.info("\n")

     def schema(self, dataset_id, table_id):
         """Retrieve the schema of the table
@@ -687,7 +681,7 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         # be a 120 second delay

         if not self.verify_schema(dataset_id, table_id, table_schema):
-            self._print('The existing table has a different schema. Please '
+            logger.info('The existing table has a different schema. Please '
                         'wait 2 minutes. See Google BigQuery issue #191')
             delay = 120

@@ -729,7 +723,7 @@ def _parse_data(schema, rows):


 def read_gbq(query, project_id=None, index_col=None, col_order=None,
-             reauth=False, verbose=True, private_key=None,
+             reauth=False, verbose=None, private_key=None,
              auth_local_webserver=False, dialect='legacy', **kwargs):
     r"""Load data from Google BigQuery using google-cloud-python

@@ -768,8 +762,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
     reauth : boolean (default False)
         Force Google BigQuery to reauthenticate the user. This is useful
         if multiple accounts are used.
-    verbose : boolean (default True)
-        Verbose output
     private_key : str (optional)
         Service account private key in JSON format. Can be file path
         or string contents. This is useful for remote server
@@ -793,6 +785,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
         compliant with the SQL 2011 standard. For more information
         see `BigQuery SQL Reference
         <https://cloud.google.com/bigquery/sql-reference/>`__
+    verbose : None, deprecated

     **kwargs : Arbitrary keyword arguments
         configuration (dict): query config parameters for job processing.
@@ -809,6 +802,11 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
         DataFrame representing results of query

     """
+    if verbose is not None:
+        warnings.warn(
+            "verbose is deprecated and will be removed in "
+            "a future version. Set logging level in order to vary "
+            "verbosity", FutureWarning, stacklevel=1)

     _test_google_api_imports()
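The verbose keyword is kept only to warn. A self-contained illustration of the same deprecation pattern (read_gbq_stub is a stand-in, not the library function):

import warnings

def read_gbq_stub(query, verbose=None):
    # Same pattern as the diff: warn whenever the old flag is passed.
    if verbose is not None:
        warnings.warn(
            "verbose is deprecated and will be removed in "
            "a future version. Set logging level in order to vary "
            "verbosity", FutureWarning, stacklevel=2)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    read_gbq_stub('SELECT 1', verbose=True)

assert caught and issubclass(caught[-1].category, FutureWarning)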

@@ -819,7 +817,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
         raise ValueError("'{0}' is not valid for dialect".format(dialect))

     connector = GbqConnector(
-        project_id, reauth=reauth, verbose=verbose, private_key=private_key,
+        project_id, reauth=reauth, private_key=private_key,
         dialect=dialect, auth_local_webserver=auth_local_webserver)
     schema, rows = connector.run_query(query, **kwargs)
     final_df = _parse_data(schema, rows)
@@ -853,7 +851,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
             final_df[field['name']] = \
                 final_df[field['name']].astype(type_map[field['type'].upper()])

-    connector.print_elapsed_seconds(
+    connector.log_elapsed_seconds(
         'Total time taken',
         datetime.now().strftime('s.\nFinished at %Y-%m-%d %H:%M:%S.'),
         0
@@ -863,7 +861,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,


 def to_gbq(dataframe, destination_table, project_id, chunksize=None,
-           verbose=True, reauth=False, if_exists='fail', private_key=None,
+           verbose=None, reauth=False, if_exists='fail', private_key=None,
            auth_local_webserver=False, table_schema=None):
     """Write a DataFrame to a Google BigQuery table.

@@ -899,8 +897,6 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=None,
     chunksize : int (default None)
         Number of rows to be inserted in each chunk from the dataframe. Use
         ``None`` to load the dataframe in a single chunk.
-    verbose : boolean (default True)
-        Show percentage complete
     reauth : boolean (default False)
         Force Google BigQuery to reauthenticate the user. This is useful
         if multiple accounts are used.
@@ -930,10 +926,17 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=None,
         of DataFrame columns. See BigQuery API documentation on available
         names of a field.
         .. versionadded:: 0.3.1
+    verbose : None, deprecated
     """

     _test_google_api_imports()

+    if verbose is not None:
+        warnings.warn(
+            "verbose is deprecated and will be removed in "
+            "a future version. Set logging level in order to vary "
+            "verbosity", FutureWarning, stacklevel=1)
+
     if if_exists not in ('fail', 'replace', 'append'):
         raise ValueError("'{0}' is not valid for if_exists".format(if_exists))

@@ -942,7 +945,7 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=None,
             "Invalid Table Name. Should be of the form 'datasetId.tableId' ")

     connector = GbqConnector(
-        project_id, reauth=reauth, verbose=verbose, private_key=private_key,
+        project_id, reauth=reauth, private_key=private_key,
         auth_local_webserver=auth_local_webserver)
     dataset_id, table_id = destination_table.rsplit('.', 1)

@@ -1004,10 +1007,9 @@ def _generate_bq_schema(df, default_type='STRING'):

 class _Table(GbqConnector):

-    def __init__(self, project_id, dataset_id, reauth=False, verbose=False,
-                 private_key=None):
+    def __init__(self, project_id, dataset_id, reauth=False, private_key=None):
         self.dataset_id = dataset_id
-        super(_Table, self).__init__(project_id, reauth, verbose, private_key)
+        super(_Table, self).__init__(project_id, reauth, private_key)

     def exists(self, table_id):
         """ Check if a table exists in Google BigQuery
@@ -1101,10 +1103,8 @@ def delete(self, table_id):

 class _Dataset(GbqConnector):

-    def __init__(self, project_id, reauth=False, verbose=False,
-                 private_key=None):
-        super(_Dataset, self).__init__(project_id, reauth, verbose,
-                                       private_key)
+    def __init__(self, project_id, reauth=False, private_key=None):
+        super(_Dataset, self).__init__(project_id, reauth, private_key)

     def exists(self, dataset_id):
         """ Check if a dataset exists in Google BigQuery
