-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
ENH: Enable automatic writing of dates to Stata files #13710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -432,7 +432,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): | |
d = parse_dates_safe(dates, year=True) | ||
conv_dates = d.year | ||
else: | ||
raise ValueError("fmt %s not understood" % fmt) | ||
raise ValueError("Format %s is not a known Stata date format" % fmt) | ||
|
||
conv_dates = Series(conv_dates, dtype=np.float64) | ||
missing_value = struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0] | ||
|
@@ -1709,7 +1709,7 @@ def _convert_datetime_to_stata_type(fmt): | |
"%tq", "th", "%th", "ty", "%ty"]: | ||
return np.float64 # Stata expects doubles for SIFs | ||
else: | ||
raise ValueError("fmt %s not understood" % fmt) | ||
raise NotImplementedError("Format %s not implemented" % fmt) | ||
|
||
|
||
def _maybe_convert_to_int_keys(convert_dates, varlist): | ||
|
@@ -1721,9 +1721,8 @@ def _maybe_convert_to_int_keys(convert_dates, varlist): | |
new_dict.update({varlist.index(key): convert_dates[key]}) | ||
else: | ||
if not isinstance(key, int): | ||
raise ValueError( | ||
"convert_dates key is not in varlist and is not an int" | ||
) | ||
raise ValueError("convert_dates key must be a " | ||
"column or an integer") | ||
new_dict.update({key: convert_dates[key]}) | ||
return new_dict | ||
|
||
|
@@ -1763,8 +1762,7 @@ def _dtype_to_stata_type(dtype, column): | |
elif dtype == np.int8: | ||
return chr(251) | ||
else: # pragma : no cover | ||
raise ValueError("Data type %s not currently understood. " | ||
"Please report an error to the developers." % dtype) | ||
raise NotImplementedError("Data type %s not supported." % dtype) | ||
|
||
|
||
def _dtype_to_default_stata_fmt(dtype, column): | ||
|
@@ -1801,35 +1799,36 @@ def _dtype_to_default_stata_fmt(dtype, column): | |
elif dtype == np.int8 or dtype == np.int16: | ||
return "%8.0g" | ||
else: # pragma : no cover | ||
raise ValueError("Data type %s not currently understood. " | ||
"Please report an error to the developers." % dtype) | ||
raise NotImplementedError("Data type %s not supported." % dtype) | ||
|
||
|
||
class StataWriter(StataParser): | ||
""" | ||
A class for writing Stata binary dta files from array-like objects | ||
A class for writing Stata binary dta files | ||
|
||
Parameters | ||
---------- | ||
fname : file path or buffer | ||
Where to save the dta file. | ||
data : array-like | ||
Array-like input to save. Pandas objects are also accepted. | ||
fname : str or buffer | ||
String path or file-like object | ||
data : DataFrame | ||
Input to save | ||
convert_dates : dict | ||
Dictionary mapping column of datetime types to the stata internal | ||
format that you want to use for the dates. Options are | ||
'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a | ||
number or a name. | ||
Dictionary mapping columns containing datetime types to stata internal | ||
format to use when writing the dates. Options are 'tc', 'td', 'tm', | ||
'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. | ||
Datetime columns that do not have a conversion type specified will be | ||
converted to 'tc'. Raises NotImplementedError if a datetime column has | ||
timezone information | ||
write_index : bool | ||
Write the index to Stata dataset. | ||
encoding : str | ||
Default is latin-1. Note that Stata does not support unicode. | ||
Default is latin-1. Unicode is not supported | ||
byteorder : str | ||
Can be ">", "<", "little", or "big". The default is None which uses | ||
`sys.byteorder` | ||
Can be ">", "<", "little", or "big". default is `sys.byteorder` | ||
time_stamp : datetime | ||
A date time to use when writing the file. Can be None, in which | ||
case the current time is used. | ||
A datetime to use as file creation date. Default is the current time | ||
dataset_label : str | ||
A label for the data set. Should be 80 characters or smaller. | ||
A label for the data set. Must be 80 characters or smaller. | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
|
@@ -1843,6 +1842,17 @@ class StataWriter(StataParser): | |
The StataWriter instance has a write_file method, which will | ||
write the file to the given `fname`. | ||
|
||
Raises | ||
------ | ||
NotImplementedError | ||
* If datetimes contain timezone information | ||
* Column dtype is not representable in Stata | ||
ValueError | ||
* Columns listed in convert_dates are neither datetime64[ns] | ||
nor datetime.datetime | ||
* Column listed in convert_dates is not in DataFrame | ||
* Categorical label contains more than 32,000 characters | ||
|
||
Examples | ||
-------- | ||
>>> import pandas as pd | ||
|
@@ -1861,7 +1871,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, | |
encoding="latin-1", byteorder=None, time_stamp=None, | ||
data_label=None, variable_labels=None): | ||
super(StataWriter, self).__init__(encoding) | ||
self._convert_dates = convert_dates | ||
self._convert_dates = {} if convert_dates is None else convert_dates | ||
self._write_index = write_index | ||
self._time_stamp = time_stamp | ||
self._data_label = data_label | ||
|
@@ -2041,15 +2051,22 @@ def _prepare_pandas(self, data): | |
self.varlist = data.columns.tolist() | ||
|
||
dtypes = data.dtypes | ||
if self._convert_dates is not None: | ||
self._convert_dates = _maybe_convert_to_int_keys( | ||
self._convert_dates, self.varlist | ||
|
||
# Ensure all date columns are converted | ||
for col in data: | ||
if col in self._convert_dates: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. so maybe you should filter so only datetime64[ns] types pass this There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure needed since it raises NIE on tz-aware columns. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. is_datetime64_dtype is False for tz aware datetimes, so I don't think an explicit check is needed. |
||
continue | ||
if is_datetime64_dtype(data[col]): | ||
self._convert_dates[col] = 'tc' | ||
|
||
self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, | ||
self.varlist) | ||
for key in self._convert_dates: | ||
new_type = _convert_datetime_to_stata_type( | ||
self._convert_dates[key] | ||
) | ||
for key in self._convert_dates: | ||
new_type = _convert_datetime_to_stata_type( | ||
self._convert_dates[key] | ||
) | ||
dtypes[key] = np.dtype(new_type) | ||
dtypes[key] = np.dtype(new_type) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this would blow up if they did convert_dates on a datetime64-tz aware type There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
self.typlist = [] | ||
self.fmtlist = [] | ||
for col, dtype in dtypes.iteritems(): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,17 +11,17 @@ | |
|
||
import nose | ||
import numpy as np | ||
from pandas.tslib import NaT | ||
|
||
import pandas as pd | ||
import pandas.util.testing as tm | ||
from pandas import compat | ||
from pandas.compat import iterkeys | ||
from pandas.core.frame import DataFrame, Series | ||
from pandas.types.common import is_categorical_dtype | ||
from pandas.tslib import NaT | ||
from pandas.io.parsers import read_csv | ||
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, | ||
PossiblePrecisionLoss, StataMissingValue) | ||
from pandas.types.common import is_categorical_dtype | ||
|
||
|
||
class TestStata(tm.TestCase): | ||
|
@@ -1165,6 +1165,52 @@ def test_write_variable_label_errors(self): | |
with tm.ensure_clean() as path: | ||
original.to_stata(path, variable_labels=variable_labels_long) | ||
|
||
def test_default_date_conversion(self): | ||
# GH 12259 | ||
dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000), | ||
dt.datetime(2012, 12, 21, 12, 21, 12, 21000), | ||
dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] | ||
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], | ||
'strs': ['apple', 'banana', 'cherry'], | ||
'dates': dates}) | ||
|
||
with tm.ensure_clean() as path: | ||
original.to_stata(path, write_index=False) | ||
reread = read_stata(path, convert_dates=True) | ||
tm.assert_frame_equal(original, reread) | ||
|
||
original.to_stata(path, | ||
write_index=False, | ||
convert_dates={'dates': 'tc'}) | ||
direct = read_stata(path, convert_dates=True) | ||
tm.assert_frame_equal(reread, direct) | ||
|
||
def test_unsupported_type(self): | ||
original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]}) | ||
|
||
with tm.assertRaises(NotImplementedError): | ||
with tm.ensure_clean() as path: | ||
original.to_stata(path) | ||
|
||
def test_unsupported_datetype(self): | ||
dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000), | ||
dt.datetime(2012, 12, 21, 12, 21, 12, 21000), | ||
dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add in a tz-aware dtype There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Works as expected -- raises |
||
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], | ||
'strs': ['apple', 'banana', 'cherry'], | ||
'dates': dates}) | ||
|
||
with tm.assertRaises(NotImplementedError): | ||
with tm.ensure_clean() as path: | ||
original.to_stata(path, convert_dates={'dates': 'tC'}) | ||
|
||
dates = pd.date_range('1-1-1990',periods=3,tz='Asia/Hong_Kong') | ||
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], | ||
'strs': ['apple', 'banana', 'cherry'], | ||
'dates': dates}) | ||
with tm.assertRaises(NotImplementedError): | ||
with tm.ensure_clean() as path: | ||
original.to_stata(path) | ||
|
||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this where you validate the supported format? (e.g. the tc and such). I think this would be a ValueError if it's not in the list (yes unsupported sort of means this, but more likely it was a typo by the user). And datetime w/tz can be a not implemented error.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reverted this to ValueError since it should only raise if the type doesn't exist.