Skip to content

Commit 8600c50

Browse files
committed
REF: refactor to new method
1 parent d100f0c commit 8600c50

File tree

3 files changed

+79
-23
lines changed

3 files changed

+79
-23
lines changed

pandas/_libs/parsers.pyx

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1273,30 +1273,9 @@ cdef class TextReader:
12731273
codes, cats, na_count = _categorical_convert(
12741274
self.parser, i, start, end, na_filter,
12751275
na_hashset, self.c_encoding)
1276-
cats = Index(cats)
1277-
1278-
cats = maybe_convert_for_categorical(cats, dtype)
1279-
1280-
if (isinstance(dtype, CategoricalDtype) and
1281-
dtype.categories is not None):
1282-
# recode for dtype.categories
1283-
categories = dtype.categories
1284-
codes = _recode_for_categories(codes, cats, categories)
1285-
ordered = dtype.ordered
1286-
elif not cats.is_monotonic_increasing:
1287-
# sort categories and recode if necessary
1288-
unsorted = cats.copy()
1289-
categories = cats.sort_values()
1290-
codes = _recode_for_categories(codes, unsorted, categories)
1291-
ordered = False
1292-
else:
1293-
categories = cats
1294-
ordered = False
1295-
1296-
cat = Categorical(codes, categories=categories, ordered=ordered,
1297-
fastpath=True)
1298-
1276+
cat = Categorical._from_inferred_categories(cats, codes, dtype)
12991277
return cat, na_count
1278+
13001279
elif is_object_dtype(dtype):
13011280
return self._string_convert(i, start, end, na_filter,
13021281
na_hashset)

pandas/core/categorical.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,49 @@ def base(self):
509509
""" compat, we are always our own object """
510510
return None
511511

512+
@classmethod
def _from_inferred_categories(cls, inferred_categories, inferred_codes,
                              dtype):
    """Construct a Categorical from inferred categories and codes.

    When `dtype` carries no explicit categories (for example when it is
    None), the inferred categories are sorted and the codes are recoded
    to follow the sorted order. When `dtype` is a CategoricalDtype with
    explicit categories, the inferred categories are cast to the
    appropriate type and the codes are recoded against
    ``dtype.categories``.

    Parameters
    ----------
    inferred_categories : list-like
        Categories in the order they were inferred; wrapped in an Index.
    inferred_codes : list-like of int
        Codes referring to positions in `inferred_categories`.
    dtype : CategoricalDtype, 'category' or None
        Target dtype. Only a CategoricalDtype whose categories are set
        is treated as explicit.

    Returns
    -------
    Categorical
    """
    from pandas import Index
    from pandas.core.dtypes.cast import maybe_convert_for_categorical

    observed = maybe_convert_for_categorical(Index(inferred_categories),
                                             dtype)

    explicit = (isinstance(dtype, CategoricalDtype) and
                dtype.categories is not None)
    if explicit:
        # The caller's dtype is used unchanged; only the codes are
        # remapped from the inferred order onto dtype.categories.
        recoded = _recode_for_categories(inferred_codes, observed,
                                         dtype.categories)
        return cls(recoded, dtype=dtype, fastpath=True)

    if observed.is_monotonic_increasing:
        # Already sorted, so the inferred codes can be used as they are.
        return cls(inferred_codes,
                   dtype=CategoricalDtype(observed, ordered=False),
                   fastpath=True)

    # Sort the categories and remap the codes from inferred order to
    # sorted order.
    original_order = observed.copy()
    sorted_cats = observed.sort_values()
    recoded = _recode_for_categories(inferred_codes, original_order,
                                     sorted_cats)
    return cls(recoded,
               dtype=CategoricalDtype(sorted_cats, ordered=False),
               fastpath=True)
554+
512555
@classmethod
513556
def from_array(cls, data, **kwargs):
514557
"""

pandas/tests/test_categorical.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,40 @@ def f():
560560
codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
561561
pd.Categorical.from_codes(codes, categories=["train", "test"])
562562

563+
@pytest.mark.parametrize('dtype', [None, 'category'])
def test_from_inferred_categories(self, dtype):
    """Already-sorted categories give the same result as from_codes."""
    categories = ['a', 'b']
    codes = [0, 0, 1, 1]

    actual = Categorical._from_inferred_categories(categories, codes,
                                                   dtype)

    # The same codes and categories are expected back unchanged.
    tm.assert_categorical_equal(
        actual, Categorical.from_codes(codes, categories))
570+
571+
@pytest.mark.parametrize('dtype', [None, 'category'])
def test_from_inferred_categories_sorts(self, dtype):
    """Unsorted inferred categories are sorted and the codes recoded."""
    unsorted_categories = ['b', 'a']
    codes = [0, 1, 1, 1]

    actual = Categorical._from_inferred_categories(unsorted_categories,
                                                   codes, dtype)

    # After sorting, 'a' is position 0 and 'b' is position 1, so every
    # code flips relative to the input.
    wanted = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
    tm.assert_categorical_equal(actual, wanted)
578+
579+
def test_from_inferred_categories_dtype(self):
    """An explicit CategoricalDtype dictates categories and ordering."""
    inferred = ['a', 'b', 'd']
    codes = [0, 1, 0, 2]
    # 'd' is inferred from the data but is not one of the declared
    # categories; 'c' is declared but never observed.
    target = CategoricalDtype(['c', 'b', 'a'], ordered=True)

    actual = Categorical._from_inferred_categories(inferred, codes,
                                                   target)

    wanted = Categorical(['a', 'b', 'a', 'd'],
                         categories=['c', 'b', 'a'],
                         ordered=True)
    tm.assert_categorical_equal(actual, wanted)
588+
589+
def test_from_inferred_categories_coerces(self):
    """String categories are coerced to match a numeric dtype."""
    inferred = ['1', '2', 'bad']
    codes = [0, 0, 1, 2]
    numeric_dtype = CategoricalDtype([1, 2])

    actual = Categorical._from_inferred_categories(inferred, codes,
                                                   numeric_dtype)

    # '1' and '2' map onto the numeric categories; the expected result
    # has NaN where the code pointed at 'bad'.
    wanted = Categorical([1, 1, 2, np.nan])
    tm.assert_categorical_equal(actual, wanted)
596+
563597
def test_validate_ordered(self):
564598
# see gh-14058
565599
exp_msg = "'ordered' must either be 'True' or 'False'"

0 commit comments

Comments
 (0)