Skip to content

EA: revert treatment of i8values #24623

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ def _concat_same_type(cls, to_concat):

def copy(self, deep=False):
values = self.asi8.copy()
return type(self)(values, dtype=self.dtype, freq=self.freq)
return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq)

def _values_for_factorize(self):
return self.asi8, iNaT
Expand Down
100 changes: 34 additions & 66 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,83 +236,51 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin,
_dtype = None # type: Union[np.dtype, DatetimeTZDtype]
_freq = None

def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
if isinstance(values, (ABCSeries, ABCIndexClass)):
values = values._values

if isinstance(values, type(self)):
# validation
dtz = getattr(dtype, 'tz', None)
if dtz and values.tz is None:
dtype = DatetimeTZDtype(tz=dtype.tz)
elif dtz and values.tz:
if not timezones.tz_compare(dtz, values.tz):
msg = (
"Timezone of the array and 'dtype' do not match. "
"'{}' != '{}'"
)
raise TypeError(msg.format(dtz, values.tz))
elif values.tz:
dtype = values.dtype
# freq = validate_values_freq(values, freq)
if freq is None:
freq = values.freq
values = values._data
def __init__(self, values, dtype=None, freq=None, copy=False):
if freq == "infer":
raise ValueError(
"Frequency inference not allowed in DatetimeArray.__init__. "
"Use 'pd.array()' instead.")

if not isinstance(values, np.ndarray):
msg = (
"Unexpected type '{}'. 'values' must be a DatetimeArray "
if not hasattr(values, "dtype"):
# e.g. list
raise ValueError(
"Unexpected type '{vals}'. 'values' must be a DatetimeArray "
"ndarray, or Series or Index containing one of those."
)
raise ValueError(msg.format(type(values).__name__))
.format(vals=type(values).__name__))

if values.dtype == 'i8':
# for compat with datetime/timedelta/period shared methods,
# we can sometimes get here with int64 values. These represent
# nanosecond UTC (or tz-naive) unix timestamps
values = values.view(_NS_DTYPE)

if values.dtype != _NS_DTYPE:
msg = (
if values.dtype == np.bool_:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do u only check for book here? shouldn’t this. e inside _from_sequeneve?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because up until moments ago I thought we had a test checking that pd.to_datetime(False) returned NaT. No idea where I got that idea... Just moved this check down.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well, the good news is that I'm not hallucinating non-existent test cases. the relevant test was for to_datetime(False, errors="coerce"), so I'm moving this check back up to __init__. To be clear, I don't think the three checks in __init__ should be there at all, but for the moment I'm trying to minimize behavior changes and just get the tests passing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not because to_datetime has that behaviour that there should be a check for that here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not because to_datetime has that behaviour that there should be a check for that here?

It's because to_datetime has a coerce kwarg that specified fallback behavior. If we want to disallow bools (and other dtypes presumably, bools were just the only one tested in 24024) in DatetimeArray but not in sequence_to_dt64ns, the check needs to be here.

raise ValueError(
"The dtype of 'values' is incorrect. Must be 'datetime64[ns]'."
" Got {} instead."
)
raise ValueError(msg.format(values.dtype))
" Got {dtype} instead." .format(dtype=values.dtype))

dtype = _validate_dt64_dtype(dtype)

if freq == "infer":
msg = (
"Frequency inference not allowed in DatetimeArray.__init__. "
"Use 'pd.array()' instead."
)
raise ValueError(msg)

if copy:
values = values.copy()
if freq:
freq = to_offset(freq)
if getattr(dtype, 'tz', None):
# https://github.com/pandas-dev/pandas/issues/18595
# Ensure that we have a standard timezone for pytz objects.
# Without this, things like adding an array of timedeltas and
# a tz-aware Timestamp (with a tz specific to its datetime) will
# be incorrect(ish?) for the array as a whole
dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))

self._data = values
self._dtype = dtype
self._freq = freq
arr = type(self)._from_sequence(values, dtype=dtype,
freq=freq, copy=copy)
self._data = arr._data
self._freq = arr._freq
self._dtype = arr._dtype

@classmethod
def _simple_new(cls, values, freq=None, tz=None):
def _simple_new(cls, values, freq=None, tz=None, dtype=None):
"""
we require the we have a dtype compat for the values
if we are passed a non-dtype compat, then coerce using the constructor
"""
dtype = DatetimeTZDtype(tz=tz) if tz else _NS_DTYPE

return cls(values, freq=freq, dtype=dtype)
if tz is not None:
# TODO: get tz out of here altogether
assert dtype is None
tz = timezones.tz_standardize(tz)
dtype = DatetimeTZDtype(tz=tz)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this tz normalization needs to be applied to the case of user-provided dtype=DatetimeTZDtype(...) as well. I'll try to construct a failing test case.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, this (roughly) passes on master, fails here

def test_foo():
    idx = pd.DatetimeIndex(['2000', '2001'], tz='US/Central')
    t1 = pd.DatetimeTZDtype(tz=idx.tz)
    t2 = pd.DatetimeTZDtype(tz=idx[0].tz)

    x = pd.arrays.DatetimeArray._simple_new(idx.view("i8"), dtype=t1)
    y = pd.arrays.DatetimeArray._simple_new(idx.view("i8"), dtype=t2)
    assert x.dtype.tz == y.dtype.tz    

I think it should pass.

elif dtype is None:
dtype = _NS_DTYPE

assert isinstance(values, np.ndarray), type(values)

result = object.__new__(cls)
result._data = values.view('datetime64[ns]')
result._freq = freq
result._dtype = dtype
return result

@classmethod
def _from_sequence(cls, data, dtype=None, copy=False,
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4377,7 +4377,8 @@ def _maybe_casted_values(index, labels=None):
values, mask, np.nan)

if issubclass(values_type, DatetimeLikeArray):
values = values_type(values, dtype=values_dtype)
values = values_type._simple_new(values,
dtype=values_dtype)

return values

Expand Down
8 changes: 4 additions & 4 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,18 +316,18 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):
tz = validate_tz_from_dtype(dtype, tz)
dtype = DatetimeTZDtype(tz=tz)
elif dtype is None:
dtype = _NS_DTYPE
dtype = values.dtype

values = DatetimeArray(values, freq=freq, dtype=dtype)
tz = values.tz
freq = values.freq
values = values._data
else:
tz = tz or getattr(dtype, 'tz', None)

# DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes
if isinstance(values, DatetimeIndex):
values = values._data
assert isinstance(values, np.ndarray)
dtarr = DatetimeArray._simple_new(values, freq=freq, tz=tz)
assert isinstance(dtarr, DatetimeArray)

result = object.__new__(cls)
result._data = dtarr
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3078,7 +3078,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values):
# TODO: This is no longer hit internally; does it need to be retained
# for e.g. pyarrow?
values = DatetimeArray(values, dtype)
values = DatetimeArray(values.view('i8'), dtype)

return klass(values, ndim=ndim, placement=placement)

Expand Down
4 changes: 3 additions & 1 deletion pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,9 @@ def create_block(b):
if is_datetime64tz_dtype(b[u'dtype']):
assert isinstance(values, np.ndarray), type(values)
assert values.dtype == 'M8[ns]', values.dtype
values = DatetimeArray(values, dtype=b[u'dtype'])
# These values are interpreted as unix timestamps, so we
# view as i8
values = DatetimeArray(values.view('i8'), dtype=b[u'dtype'])

return make_block(values=values,
klass=getattr(internals, b[u'klass']),
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def test_from_array_keeps_base(self):
arr = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]')
dta = DatetimeArray(arr)

assert dta._data is arr
assert dta._data.base is arr
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why this change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

because a view of arr is taken, so the assertion is untrue as-is.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then why was it passing on master? Because you changed it to take a view? Is it important to do that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because I changed it back to use _from_sequence, which uses sequence_to_dt64ns, which yes takes a view. And it is important in as much as in the PR implementing sequence_to_dt64ns when the original version only took a view if it was necessary, there was a request to do it unconditionally to save a line

dta = DatetimeArray(arr[:0])
assert dta._data.base is arr

Expand Down
67 changes: 65 additions & 2 deletions pandas/tests/arrays/test_datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,68 @@


class TestDatetimeArrayConstructor(object):

@pytest.mark.parametrize('tz', [None, 'Asia/Singapore'])
def test_constructor_equivalence(self, tz):
# GH#24623 check that DatetimeArray.__init__ behavior matches:
# Timestamp.__new__ for int64
# DatetimeArray._from_sequence for int64, datetime64[ns]
# DatetimeArray._simple_new for int64
# DatetimeIndex.__new__ for int64, datetime64[ns]
# DatetimeIndex._simple_new for int64, datetime64[ns]
#
# and that DatetimeArray._simple_new behaves like
# DatetimeIndex._simple_new for both int64 and datetime64[ns] inputs
arr = np.random.randint(-10**9, 10**9, size=5, dtype=np.int64)
dti = pd.date_range('1960-01-01', periods=1, tz=tz)

v1 = DatetimeArray._simple_new(arr.view('i8'), dtype=dti.dtype)
v2 = DatetimeArray(arr.view('i8'), dtype=dti.dtype)
v3 = DatetimeArray._from_sequence(arr.view('i8'), dtype=dti.dtype)
v4 = pd.DatetimeIndex._simple_new(arr.view('i8'), tz=dti.tz)
v5 = pd.DatetimeIndex(arr.view('i8'), tz=dti.tz)
v6 = pd.to_datetime(arr, utc=True).tz_convert(dti.tz)

# when dealing with _simple_new, i8 and M8[ns] are interchangeable
v7 = DatetimeArray._simple_new(arr.view('M8[ns]'), dtype=dti.dtype)
v8 = pd.DatetimeIndex._simple_new(arr.view('M8[ns]'), dtype=dti.dtype)

tm.assert_datetime_array_equal(v1, v2)
tm.assert_datetime_array_equal(v1, v3)
tm.assert_datetime_array_equal(v1, v4._data)
tm.assert_datetime_array_equal(v1, v5._data)
tm.assert_datetime_array_equal(v1, v6._data)
tm.assert_datetime_array_equal(v1, v7)
tm.assert_datetime_array_equal(v1, v8._data)

expected = [pd.Timestamp(i8, tz=dti.tz) for i8 in arr]
assert list(v1) == expected

# The guarantees for datetime64 data are fewer
dt64arr = arr.view('datetime64[ns]')
v1 = DatetimeArray(dt64arr, dtype=dti.dtype)
v2 = DatetimeArray._from_sequence(dt64arr, dtype=dti.dtype)
v3 = DatetimeArray._from_sequence(dt64arr, tz=dti.tz)
v4 = pd.DatetimeIndex(dt64arr, dtype=dti.dtype)
v5 = pd.DatetimeIndex(dt64arr, tz=dti.tz)

tm.assert_datetime_array_equal(v1, v2)
tm.assert_datetime_array_equal(v1, v3)
tm.assert_datetime_array_equal(v1, v4._data)
tm.assert_datetime_array_equal(v1, v5._data)

def test_freq_validation(self):
# GH#24623 check that invalid instances cannot be created with the
# public constructor
arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9

msg = ("Inferred frequency H from passed values does not "
"conform to passed frequency W-SUN")
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, freq="W")

def test_from_pandas_array(self):
# GH#24623, GH#24615
arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9

result = DatetimeArray._from_sequence(arr, freq='infer')
Expand All @@ -28,7 +89,8 @@ def test_mismatched_timezone_raises(self):
arr = DatetimeArray(np.array(['2000-01-01T06:00:00'], dtype='M8[ns]'),
dtype=DatetimeTZDtype(tz='US/Central'))
dtype = DatetimeTZDtype(tz='US/Eastern')
with pytest.raises(TypeError, match='Timezone of the array'):
with pytest.raises(TypeError,
match='data is already tz-aware US/Central'):
DatetimeArray(arr, dtype=dtype)

def test_non_array_raises(self):
Expand All @@ -51,10 +113,11 @@ def test_freq_infer_raises(self):
def test_copy(self):
data = np.array([1, 2, 3], dtype='M8[ns]')
arr = DatetimeArray(data, copy=False)
assert arr._data is data
assert arr._data.base is data

arr = DatetimeArray(data, copy=True)
assert arr._data is not data
assert arr._data.base is not data


class TestDatetimeArrayComparisons(object):
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,7 +1241,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype):
# tz-aware Datetime
(DatetimeArray(np.array(['2000-01-01T12:00:00',
'2000-01-02T12:00:00'],
dtype='M8[ns]'),
dtype='M8[ns]').view('i8'),
dtype=DatetimeTZDtype(tz="US/Central")),
'_data'),
])
Expand All @@ -1255,7 +1255,7 @@ def test_array(array, attr, box):
array = getattr(array, attr)
result = getattr(result, attr)

assert result is array
assert result is array or result.base is array.base


def test_array_multiindex_raises():
Expand All @@ -1282,7 +1282,7 @@ def test_array_multiindex_raises():
# tz-aware stays tz`-aware
(DatetimeArray(np.array(['2000-01-01T06:00:00',
'2000-01-02T06:00:00'],
dtype='M8[ns]'),
dtype='M8[ns]').view('i8'),
dtype=DatetimeTZDtype(tz='US/Central')),
np.array([pd.Timestamp('2000-01-01', tz='US/Central'),
pd.Timestamp('2000-01-02', tz='US/Central')])),
Expand Down