Merge pull request #8915 from tshauck/generate_bq_schema

jreback · jreback · commit 98ea53babc4e · 2014-12-03T17:38:33.000-05:00
ENH: adds ability to generate bq schema from df
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -3651,6 +3651,14 @@ data quickly, but it is not a direct replacement for a transactional database.
 You can access the management console to determine project id's by:
 <https://code.google.com/apis/console/b/0/?noredirect>
 
+As of 0.15.2, the gbq module has a function ``generate_bq_schema`` which
+produces the dictionary representation of a DataFrame's BigQuery schema.
+
+.. code-block:: python
+
+    df = pandas.DataFrame({'A': [1.0]})
+    gbq.generate_bq_schema(df, default_type='STRING')
+
 .. warning::
 
    To use this module, you will need a valid BigQuery account. See
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -73,6 +73,7 @@ Enhancements
 - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`).
 - ``Timedelta`` now supports arithemtic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`).
 - Added ``Timedelta.to_timedelta64`` method to the public API (:issue:`8884`).
+- Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`).
 
 .. _whatsnew_0152.performance:
 
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
@@ -444,3 +444,31 @@ def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
     dataset_id, table_id = destination_table.rsplit('.',1)
 
     connector.load_data(dataframe, dataset_id, table_id, chunksize, verbose)
+
+def generate_bq_schema(df, default_type='STRING'):
+    """Generate the BigQuery schema dictionary for a DataFrame.
+
+    Parameters
+    ----------
+    df : DataFrame
+        Frame whose column names and dtypes describe the table.
+    default_type : string
+        BigQuery type used for columns whose dtype kind has no entry
+        in the built-in mapping (e.g. unsigned integers, complex).
+
+    Returns
+    -------
+    dict
+        ``{'fields': [{'name': <column>, 'type': <bq type>}, ...]}``,
+        with one entry per column in column order.
+    """
+    # Keyed by numpy ``dtype.kind`` character codes.
+    type_mapping = {'i': 'INTEGER', 'b': 'BOOLEAN', 'f': 'FLOAT',
+                    'O': 'STRING', 'S': 'STRING', 'U': 'STRING',
+                    'M': 'TIMESTAMP'}
+
+    # zip() rather than Series.iteritems(), which newer pandas removed.
+    fields = [{'name': column_name,
+               'type': type_mapping.get(dtype.kind, default_type)}
+              for column_name, dtype in zip(df.columns, df.dtypes)]
+    return {'fields': fields}
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
@@ -277,6 +277,17 @@ def test_google_upload_errors_should_raise_exception(self):
         with tm.assertRaises(gbq.UnknownGBQException):
             gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.new_test', project_id = PROJECT_ID)
 
+    def test_generate_bq_schema(self):
+        # Per the expected schema below, the mixed frame is assumed to
+        # hold float (A, B), string-like (C) and datetime (D) columns.
+        frame = tm.makeMixedDataFrame()
+        expected = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                               {'name': 'B', 'type': 'FLOAT'},
+                               {'name': 'C', 'type': 'STRING'},
+                               {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        result = gbq.generate_bq_schema(frame)
+        self.assertEqual(result, expected)
 
     @classmethod
     def tearDownClass(cls):