Skip to content

Commit fa1364d

Browse files
bnaul authored and TomAugspurger committed
BUG: Fix FastParquetImpl.write for non-existent file (#28326)
* Fix `FastParquetImpl.write` for non-existent file
1 parent 0d69d91 commit fa1364d

File tree

3 files changed

+31
-4
lines changed

3 files changed

+31
-4
lines changed

doc/source/whatsnew/v1.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ I/O
218218
- Bug in :meth:`DataFrame.to_csv` where values were truncated when the length of ``na_rep`` was shorter than the text input data. (:issue:`25099`)
219219
- Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`)
220220
- Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`)
221+
- Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with ``engine='fastparquet'`` if the file did not already exist (:issue:`28326`)
221222

222223
Plotting
223224
^^^^^^^^

pandas/io/parquet.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from pandas import DataFrame, get_option
99

10-
from pandas.io.common import get_filepath_or_buffer, is_s3_url
10+
from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
1111

1212

1313
def get_engine(engine):
@@ -159,12 +159,12 @@ def write(
159159
if partition_cols is not None:
160160
kwargs["file_scheme"] = "hive"
161161

162-
if is_s3_url(path):
163-
# path is s3:// so we need to open the s3file in 'wb' mode.
162+
if is_s3_url(path) or is_gcs_url(path):
163+
# if path is s3:// or gs:// we need to open the file in 'wb' mode.
164164
# TODO: Support 'ab'
165165

166166
path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
167-
# And pass the opened s3file to the fastparquet internal impl.
167+
# And pass the opened file to the fastparquet internal impl.
168168
kwargs["open_with"] = lambda path, _: path
169169
else:
170170
path, _, _, _ = get_filepath_or_buffer(path)

pandas/tests/io/test_gcs.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from io import StringIO
2+
import os
23

34
import numpy as np
45
import pytest
@@ -60,6 +61,31 @@ def open(*args):
6061
assert_frame_equal(df1, df2)
6162

6263

64+
@td.skip_if_no("fastparquet")
65+
@td.skip_if_no("gcsfs")
66+
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
67+
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
68+
df1 = DataFrame(
69+
{
70+
"int": [1, 3],
71+
"float": [2.0, np.nan],
72+
"str": ["t", "s"],
73+
"dt": date_range("2018-06-18", periods=2),
74+
}
75+
)
76+
77+
class MockGCSFileSystem:
78+
def open(self, path, mode="r", *args):
79+
if "w" not in mode:
80+
raise FileNotFoundError
81+
return open(os.path.join(tmpdir, "test.parquet"), mode)
82+
83+
monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
84+
df1.to_parquet(
85+
"gs://test/test.csv", index=True, engine="fastparquet", compression=None
86+
)
87+
88+
6389
@td.skip_if_no("gcsfs")
6490
def test_gcs_get_filepath_or_buffer(monkeypatch):
6591
df1 = DataFrame(

0 commit comments

Comments (0)