@@ -90,6 +90,25 @@ def df_cross_compat():
90
90
return df
91
91
92
92
93
@pytest.fixture
def df_full():
    """Return a three-row DataFrame with one column per basic value kind.

    The columns cover strings (plain, with NaN, with None), bytes, unicode,
    signed and unsigned integers, floats (plain and with NaN), booleans and
    datetimes (plain and with NaT).  Tests in this module extend the frame
    with engine-specific columns before round-tripping it.
    """
    # Build the NaT-containing column up front so the mapping stays compact.
    stamps_with_gap = [
        pd.Timestamp('20130101'),
        pd.NaT,
        pd.Timestamp('20130103'),
    ]
    columns = {
        'string': ['a', 'b', 'c'],
        'string_with_nan': ['a', np.nan, 'c'],
        'string_with_none': ['a', None, 'c'],
        'bytes': [b'foo', b'bar', b'baz'],
        'unicode': [u'foo', u'bar', u'baz'],
        'int': [1, 2, 3],
        'uint': np.arange(3, 6).astype('u1'),
        'float': np.arange(4.0, 7.0, dtype='float64'),
        'float_with_nan': [2., np.nan, 3.],
        'bool': [True, False, True],
        'datetime': pd.date_range('20130101', periods=3),
        'datetime_with_nat': stamps_with_gap,
    }
    return pd.DataFrame(columns)
110
+
111
+
93
112
def test_invalid_engine (df_compat ):
94
113
95
114
with pytest .raises (ValueError ):
@@ -300,27 +319,29 @@ def test_read_columns(self, engine):
300
319
301
320
class TestParquetPyArrow (Base ):
302
321
303
def test_basic(self, pa, df_full):
    """Round-trip the shared frame plus two extra column types via ``pa``."""
    frame = df_full

    # additional supported types for pyarrow
    brussels_stamps = pd.date_range('20130101', periods=3,
                                    tz='Europe/Brussels')
    # Insertion order is kept: timezone-aware datetimes first, then the
    # boolean column containing None.
    frame['datetime_tz'] = brussels_stamps
    frame['bool_with_none'] = [True, None, True]

    self.check_round_trip(frame, pa)
323
332
333
@pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
def test_basic_subset_columns(self, pa, df_full):
    """Write the full frame, read back only ``string`` and ``int``."""
    # GH18628
    frame = df_full

    # additional supported types for pyarrow
    frame['datetime_tz'] = pd.date_range('20130101', periods=3,
                                         tz='Europe/Brussels')

    # The expected result is the written frame restricted to the columns
    # requested at read time; each use gets its own list object.
    subset = ['string', 'int']
    expected_frame = frame[list(subset)]
    reader_options = {'columns': list(subset)}

    self.check_round_trip(frame, pa, expected=expected_frame,
                          read_kwargs=reader_options)
344
+
324
345
def test_duplicate_columns (self , pa ):
325
346
326
347
# not currently able to handle duplicate columns
@@ -363,25 +384,12 @@ def test_categorical_unsupported(self, pa_lt_070):
363
384
364
385
class TestParquetFastParquet (Base ):
365
386
366
def test_basic(self, fp, df_full):
    """Round-trip the shared frame plus a timedelta column via ``fp``."""
    frame = df_full

    # additional supported types for fastparquet
    day_steps = pd.timedelta_range('1 day', periods=3)
    frame['timedelta'] = day_steps

    # Compression is explicitly disabled for the write side.
    writer_options = {'compression': None}
    self.check_round_trip(frame, fp, write_kwargs=writer_options)
387
395
0 commit comments