REF: move categorical inference from TextReader into Categorical._from_inferred_categories

TomAugspurger · TomAugspurger · commit 8600c505b2ec · 2017-09-28T08:17:58.000-05:00
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1273,30 +1273,9 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
-            cats = Index(cats)
-
-            cats = maybe_convert_for_categorical(cats, dtype)
-
-            if (isinstance(dtype, CategoricalDtype) and
-                    dtype.categories is not None):
-                # recode for dtype.categories
-                categories = dtype.categories
-                codes = _recode_for_categories(codes, cats, categories)
-                ordered = dtype.ordered
-            elif not cats.is_monotonic_increasing:
-                # sort categories and recode if necessary
-                unsorted = cats.copy()
-                categories = cats.sort_values()
-                codes = _recode_for_categories(codes, unsorted, categories)
-                ordered = False
-            else:
-                categories = cats
-                ordered = False
-
-            cat = Categorical(codes, categories=categories, ordered=ordered,
-                              fastpath=True)
-
+            cat = Categorical._from_inferred_categories(cats, codes, dtype)
             return cat, na_count
+
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -509,6 +509,49 @@ def base(self):
         """ compat, we are always our own object """
         return None
 
+    @classmethod
+    def _from_inferred_categories(cls, inferred_categories, inferred_codes,
+                                  dtype):
+        """Construct a Categorical from inferred categories and codes.
+
+        If `dtype` has no categories (e.g. None or 'category'), the inferred
+        categories are sorted. Otherwise they are cast for `dtype` and the
+        codes are recoded against `dtype.categories`.
+
+        Parameters
+        ----------
+        inferred_categories : Index or list-like
+        inferred_codes : array-like of int
+        dtype : CategoricalDtype, 'category' or None
+
+        Returns
+        -------
+        Categorical
+        """
+        from pandas.core.dtypes.cast import maybe_convert_for_categorical
+        from pandas import Index
+
+        cats = Index(inferred_categories)
+        cats = maybe_convert_for_categorical(cats, dtype)
+
+        if (isinstance(dtype, CategoricalDtype) and
+                dtype.categories is not None):
+            # dtype supplies the categories: recode the codes against them
+            categories = dtype.categories
+            codes = _recode_for_categories(inferred_codes, cats, categories)
+        elif not cats.is_monotonic_increasing:
+            # no categories given and inferred ones unsorted: sort and recode
+            unsorted = cats.copy()
+            categories = cats.sort_values()
+            codes = _recode_for_categories(inferred_codes, unsorted,
+                                           categories)
+            dtype = CategoricalDtype(categories, ordered=False)
+        else:
+            dtype = CategoricalDtype(cats, ordered=False)
+            codes = inferred_codes
+
+        return cls(codes, dtype=dtype, fastpath=True)
+
     @classmethod
     def from_array(cls, data, **kwargs):
         """
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -560,6 +560,40 @@ def f():
             codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
             pd.Categorical.from_codes(codes, categories=["train", "test"])
 
+    @pytest.mark.parametrize('dtype', [None, 'category'])
+    def test_from_inferred_categories(self, dtype):
+        cats = ['a', 'b']  # already sorted, so codes pass through as-is
+        codes = [0, 0, 1, 1]
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical.from_codes(codes, cats)
+        tm.assert_categorical_equal(result, expected)
+
+    @pytest.mark.parametrize('dtype', [None, 'category'])
+    def test_from_inferred_categories_sorts(self, dtype):
+        cats = ['b', 'a']  # unsorted: categories get sorted, codes remapped
+        codes = [0, 1, 1, 1]
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_from_inferred_categories_dtype(self):
+        cats = ['a', 'b', 'd']  # 'd' is not in the dtype's categories
+        codes = [0, 1, 0, 2]
+        dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical(['a', 'b', 'a', 'd'],
+                               categories=['c', 'b', 'a'],
+                               ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_from_inferred_categories_coerces(self):
+        cats = ['1', '2', 'bad']  # string categories against a numeric dtype
+        codes = [0, 0, 1, 2]
+        dtype = CategoricalDtype([1, 2])
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical([1, 1, 2, np.nan])
+        tm.assert_categorical_equal(result, expected)
+
     def test_validate_ordered(self):
         # see gh-14058
         exp_msg = "'ordered' must either be 'True' or 'False'"