Refactor with maybe_convert_for_categorical

TomAugspurger · TomAugspurger · commit b02882714601 · 2017-09-26T15:50:31.000-05:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -469,6 +469,8 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
+.. versionadded:: 0.21.0
+
 Specifying ``dtype='category'`` will result in an unordered ``Categorical``
 whose ``categories`` are the unique values observed in the data. For more
 control on the categories and order, create a
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -48,7 +48,8 @@ from pandas.core.dtypes.common import (
 from pandas.core.categorical import Categorical, _recode_for_categories
 from pandas.core.algorithms import take_1d
 from pandas.core.dtypes.concat import union_categoricals
-from pandas import Index, to_numeric, to_datetime, to_timedelta
+from pandas.core.dtypes.cast import maybe_convert_for_categorical
+from pandas import Index
 
 import pandas.io.common as com
 
@@ -1274,19 +1275,7 @@ cdef class TextReader:
                 na_hashset, self.c_encoding)
             cats = Index(cats)
 
-            # Determine if we should convert inferred string
-            # categories to a specialized type
-            if (isinstance(dtype, CategoricalDtype) and
-                    dtype.categories is not None):
-                if dtype.categories.is_numeric():
-                    # is ignore correct?
-                    cats = to_numeric(cats, errors='ignore')
-                elif dtype.categories.is_all_dates:
-                    # is ignore correct?
-                    if is_datetime64_dtype(dtype.categories):
-                        cats = to_datetime(cats, errors='ignore')
-                    else:
-                        cats = to_timedelta(cats, errors='ignore')
+            cats = maybe_convert_for_categorical(cats, dtype)
 
             if (isinstance(dtype, CategoricalDtype) and
                     dtype.categories is not None):
@@ -1298,8 +1287,7 @@ cdef class TextReader:
                 # sort categories and recode if necessary
                 unsorted = cats.copy()
                 categories = cats.sort_values()
-                indexer = categories.get_indexer(unsorted)
-                codes = take_1d(indexer, codes, fill_value=-1)
+                codes = _recode_for_categories(codes, unsorted, categories)
                 ordered = False
             else:
                 categories = cats
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -24,7 +24,8 @@
                      _ensure_int32, _ensure_int64,
                      _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE,
                      _POSSIBLY_CAST_DTYPES)
-from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype
+from .dtypes import (ExtensionDtype, DatetimeTZDtype, PeriodDtype,
+                     CategoricalDtype)
 from .generic import (ABCDatetimeIndex, ABCPeriodIndex,
                       ABCSeries)
 from .missing import isna, notna
@@ -604,6 +605,41 @@ def conv(r, dtype):
     return [conv(r, dtype) for r, dtype in zip(result, dtypes)]
 
 
+def maybe_convert_for_categorical(categories, dtype):
+    """Convert ``categories`` depending on ``dtype``.
+
+    Converts to numeric, datetime, or timedelta types, when ``dtype`` is
+    a CategoricalDtype with known, non-object categories.
+
+    Parameters
+    ----------
+    categories : array-like
+    dtype : CategoricalDtype
+
+    Returns
+    -------
+    new_categories : array or Index
+
+    Examples
+    --------
+    >>> maybe_convert_for_categorical(['1', '2'], CategoricalDtype([1, 2]))
+    array([1, 2])
+    >>> maybe_convert_for_categorical([1, 'a'], CategoricalDtype([1, 2]))
+    array([  1.,  nan])
+    """
+    if isinstance(dtype, CategoricalDtype) and dtype.categories is not None:
+        from pandas import to_numeric, to_datetime, to_timedelta
+
+        if dtype.categories.is_numeric():
+            categories = to_numeric(categories, errors='coerce')
+        elif is_datetime64_dtype(dtype.categories):
+            categories = to_datetime(categories, errors='coerce')
+        elif is_timedelta64_dtype(dtype.categories):
+            categories = to_timedelta(categories, errors='coerce')
+
+    return categories
+
+
 def astype_nansafe(arr, dtype, copy=True):
     """ return a view if copy is False, but
         need to be very careful as the result shape could change! """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -12,19 +12,19 @@
 
 import numpy as np
 
-from pandas import compat, to_numeric, to_timedelta
+from pandas import compat
 from pandas.compat import (range, lrange, PY3, StringIO, lzip,
                            zip, string_types, map, u)
 from pandas.core.dtypes.common import (
     is_integer, _ensure_object,
     is_list_like, is_integer_dtype,
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
-    is_scalar, is_categorical_dtype,
-    is_datetime64_dtype, is_timedelta64_dtype)
+    is_scalar, is_categorical_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
-from pandas.core.dtypes.cast import astype_nansafe
+from pandas.core.dtypes.cast import (astype_nansafe,
+                                     maybe_convert_for_categorical)
 from pandas.core.index import (Index, MultiIndex, RangeIndex,
                                _ensure_index_from_sequences)
 from pandas.core.series import Series
@@ -1609,21 +1609,16 @@ def _cast_types(self, values, cast_type, column):
             # as strings
             known_cats = (isinstance(cast_type, CategoricalDtype) and
                           cast_type.categories is not None)
-            str_values = is_object_dtype(values)
-
-            if known_cats and str_values:
-                if cast_type.categories.is_numeric():
-                    values = to_numeric(values, errors='ignore')
-                elif is_datetime64_dtype(cast_type.categories):
-                    values = tools.to_datetime(values, errors='ignore')
-                elif is_timedelta64_dtype(cast_type.categories):
-                    values = to_timedelta(values, errors='ignore')
-                values = Categorical(values, categories=cast_type.categories,
-                                     ordered=cast_type.ordered)
+
+            categories = ordered = None
+            if known_cats:
+                values = maybe_convert_for_categorical(values, cast_type)
+                categories = cast_type.categories
+                ordered = cast_type.ordered
             elif not is_object_dtype(values):
                 values = astype_nansafe(values, str)
-            else:
-                values = Categorical(values)
+            values = Categorical(values, categories=categories,
+                                 ordered=ordered)
         else:
             try:
                 values = astype_nansafe(values, cast_type, copy=True)
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
@@ -16,6 +16,7 @@
 from pandas.core.dtypes.cast import (
     maybe_downcast_to_dtype,
     maybe_convert_objects,
+    maybe_convert_for_categorical,
     cast_scalar_to_array,
     infer_dtype_from_scalar,
     infer_dtype_from_array,
@@ -25,7 +26,8 @@
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
     DatetimeTZDtype,
-    PeriodDtype)
+    PeriodDtype,
+    CategoricalDtype)
 from pandas.core.dtypes.common import (
     is_dtype_equal)
 from pandas.util import testing as tm
@@ -299,6 +301,44 @@ def test_maybe_infer_to_datetimelike(self):
                                      [NaT, 'b', 1]]))
         assert result.size == 6
 
+    def test_maybe_convert_for_categorical_noop(self):
+        expected = ['1', '2']
+        result = maybe_convert_for_categorical(expected, None)
+        assert result == expected
+
+        result = maybe_convert_for_categorical(expected, CategoricalDtype())
+        assert result == expected
+
+        result = maybe_convert_for_categorical(expected, 'category')
+        assert result == expected
+
+    @pytest.mark.parametrize('categories, dtype, expected', [
+        (['1', '2'], [1, 2, 3], np.array([1, 2])),
+        (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan])),
+    ])
+    def test_maybe_convert_for_categorical(self, categories, dtype, expected):
+        dtype = CategoricalDtype(dtype)
+        result = maybe_convert_for_categorical(categories, dtype)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize('categories, dtype, expected', [
+        (['2016', '2017'], pd.to_datetime(['2016', '2017']),
+         pd.to_datetime(['2016', '2017'])),
+        (['2016', '2017', 'bad'], pd.to_datetime(['2016', '2017']),
+         pd.to_datetime(['2016', '2017', 'NaT'])),
+
+        (['1H', '2H'], pd.to_timedelta(['1H', '2H']),
+         pd.to_timedelta(['1H', '2H'])),
+        (['1H', '2H', 'bad'], pd.to_timedelta(['1H', '2H']),
+         pd.to_timedelta(['1H', '2H', 'NaT'])),
+
+    ])
+    def test_maybe_convert_for_categorical_dates(self, categories, dtype,
+                                                 expected):
+        dtype = CategoricalDtype(dtype)
+        result = maybe_convert_for_categorical(categories, dtype)
+        tm.assert_index_equal(result, expected)
+
 
 class TestConvert(object):