Skip to content

Commit fa1364d

Browse files
bnaul authored and TomAugspurger committed
BUG: Fix FastParquetImpl.write for non-existent file (#28326)
* Fix `FastParquetImpl.write` for non-existent file
1 parent 0d69d91 commit fa1364d

File tree

3 files changed

+31
-4
lines changed

3 files changed

+31
-4
lines changed

doc/source/whatsnew/v1.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ I/O
218218
- Bug in :meth:`DataFrame.to_csv` where values were truncated when the length of ``na_rep`` was shorter than the text input data. (:issue:`25099`)
219219
- Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`)
220220
- Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`)
221+
- Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with ``engine='fastparquet'`` if the file did not already exist (:issue:`28326`)
221222

222223
Plotting
223224
^^^^^^^^

pandas/io/parquet.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from pandas import DataFrame, get_option
99

10-
from pandas.io.common import get_filepath_or_buffer, is_s3_url
10+
from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
1111

1212

1313
def get_engine(engine):
@@ -159,12 +159,12 @@ def write(
159159
if partition_cols is not None:
160160
kwargs["file_scheme"] = "hive"
161161

162-
if is_s3_url(path):
163-
# path is s3:// so we need to open the s3file in 'wb' mode.
162+
if is_s3_url(path) or is_gcs_url(path):
163+
# if path is s3:// or gs:// we need to open the file in 'wb' mode.
164164
# TODO: Support 'ab'
165165

166166
path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
167-
# And pass the opened s3file to the fastparquet internal impl.
167+
# And pass the opened file to the fastparquet internal impl.
168168
kwargs["open_with"] = lambda path, _: path
169169
else:
170170
path, _, _, _ = get_filepath_or_buffer(path)

pandas/tests/io/test_gcs.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from io import StringIO
2+
import os
23

34
import numpy as np
45
import pytest
@@ -60,6 +61,31 @@ def open(*args):
6061
assert_frame_equal(df1, df2)
6162

6263

64+
@td.skip_if_no("fastparquet")
65+
@td.skip_if_no("gcsfs")
66+
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
67+
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
68+
df1 = DataFrame(
69+
{
70+
"int": [1, 3],
71+
"float": [2.0, np.nan],
72+
"str": ["t", "s"],
73+
"dt": date_range("2018-06-18", periods=2),
74+
}
75+
)
76+
77+
class MockGCSFileSystem:
78+
def open(self, path, mode="r", *args):
79+
if "w" not in mode:
80+
raise FileNotFoundError
81+
return open(os.path.join(tmpdir, "test.parquet"), mode)
82+
83+
monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
84+
df1.to_parquet(
85+
"gs://test/test.csv", index=True, engine="fastparquet", compression=None
86+
)
87+
88+
6389
@td.skip_if_no("gcsfs")
6490
def test_gcs_get_filepath_or_buffer(monkeypatch):
6591
df1 = DataFrame(

0 commit comments

Comments (0)