Skip to content

ENH: add from_dummies #31795

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ Other enhancements
- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
- :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`)
- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
- We have added a :meth:`pandas.from_dummies`, which is an inverse transformation of :meth:`pandas.get_dummies` (:issue:`8745`)
- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
- :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
- Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
Expand Down
1 change: 1 addition & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@
get_dummies,
cut,
qcut,
from_dummies,
)

import pandas.api
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
from pandas.core.reshape.melt import lreshape, melt, wide_to_long
from pandas.core.reshape.merge import merge, merge_asof, merge_ordered
from pandas.core.reshape.pivot import crosstab, pivot, pivot_table
from pandas.core.reshape.reshape import get_dummies
from pandas.core.reshape.reshape import from_dummies, get_dummies
from pandas.core.reshape.tile import cut, qcut
147 changes: 146 additions & 1 deletion pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import itertools
from typing import List, Optional, Union
from typing import List, Optional, Set, Union

import numpy as np

import pandas._libs.algos as libalgos
import pandas._libs.reshape as libreshape
from pandas._libs.sparse import IntIndex
from pandas._typing import Dtype
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import maybe_promote
Expand Down Expand Up @@ -727,6 +728,149 @@ def _convert_level_number(level_num, columns):
return result


def from_dummies(
data: "DataFrame",
prefix: Optional[Union[str, List[str]]] = None,
prefix_sep: str = "_",
dtype: Dtype = "category",
) -> "DataFrame":
"""
The inverse transformation of ``pandas.get_dummies``.

.. versionadded:: 1.1.0

Parameters
----------
data : DataFrame
Data which contains dummy indicators.
prefix : list-like, default None
How to name the decoded groups of columns. If there are columns
containing `prefix_sep`, then the part of their name preceding
`prefix_sep` will be used (see examples below).
prefix_sep : str, default '_'
Separator between original column name and dummy variable.
dtype : dtype, default 'category'
Data dtype for new columns - only a single data type is allowed.

Returns
-------
DataFrame
Decoded data.

See Also
--------
get_dummies : The inverse operation.

Examples
--------
Say we have a dataframe where some variables have been dummified:

>>> df = pd.DataFrame(
... {
... "baboon": [0, 0, 1],
... "lemur": [0, 1, 0],
... "zebra": [1, 0, 0],
... }
... )
>>> df
baboon lemur zebra
0 0 0 1
1 0 1 0
2 1 0 0

We can recover the original dataframe using `from_dummies`:

>>> pd.from_dummies(df, prefix='animal')
animal
0 zebra
1 lemur
2 baboon

If our dataframe already has columns with `prefix_sep` in them,
we don't need to pass in the `prefix` argument:

>>> df = pd.DataFrame(
... {
... "animal_baboon": [0, 0, 1],
... "animal_lemur": [0, 1, 0],
... "animal_zebra": [1, 0, 0],
... "other": ['a', 'b', 'c'],
... }
... )
>>> df
animal_baboon animal_lemur animal_zebra other
0 0 0 1 a
1 0 1 0 b
2 1 0 0 c

>>> pd.from_dummies(df)
other animal
0 a zebra
1 b lemur
2 c baboon
"""
if dtype is None:
dtype = "category"

columns_to_decode = [i for i in data.columns if prefix_sep in i]
if not columns_to_decode:
if prefix is None:
raise ValueError(
"If no columns contain `prefix_sep`, you must "
"pass a value to `prefix` with which to name "
"the decoded columns."
)
# If no column contains `prefix_sep`, we prepend `prefix` and
# `prefix_sep` to each column.
out = data.rename(columns=lambda x: f"{prefix}{prefix_sep}{x}").copy()
columns_to_decode = out.columns
else:
out = data.copy()

data_to_decode = out[columns_to_decode]

if prefix is None:
# If no prefix has been passed, extract it from columns containing
# `prefix_sep`
seen: Set[str] = set()
prefix = []
for i in columns_to_decode:
i = i.split(prefix_sep)[0]
if i in seen:
continue
seen.add(i)
prefix.append(i)
elif isinstance(prefix, str):
prefix = [prefix]

# Check each row sums to 1 or 0
def _validate_values(data):
if (data.sum(axis=1) != 1).any():
raise ValueError(
"Data cannot be decoded! Each row must contain only 0s and "
"1s, and each row may have at most one 1."
)

for prefix_ in prefix:
cols, labels = (
[
i.replace(x, "")
for i in data_to_decode.columns
if prefix_ + prefix_sep in i
]
for x in ["", prefix_ + prefix_sep]
)
if not cols:
continue
_validate_values(data_to_decode[cols])
out = out.drop(cols, axis=1)
out[prefix_] = Series(
np.array(labels)[np.argmax(data_to_decode[cols].to_numpy(), axis=1)],
dtype=dtype,
)
return out


def get_dummies(
data,
prefix=None,
Expand Down Expand Up @@ -777,6 +921,7 @@ def get_dummies(
See Also
--------
Series.str.get_dummies : Convert Series to dummy codes.
from_dummies : The inverse operation.

Examples
--------
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/api/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ class TestPDApi(Base):
"eval",
"factorize",
"get_dummies",
"from_dummies",
"infer_freq",
"isna",
"isnull",
Expand Down
60 changes: 60 additions & 0 deletions pandas/tests/reshape/test_from_dummies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.mark.parametrize(
"dtype, expected_dict",
[
("str", {"col1": ["a", "a", "b"]}),
(str, {"col1": ["a", "a", "b"]},),
("category", {"col1": ["a", "a", "b"]}),
],
)
def test_dtype(dtype, expected_dict):
df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]})
result = pd.from_dummies(df, dtype=dtype)
expected = pd.DataFrame(expected_dict, dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_malformed():
df = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [1, 0, 1]})
msg = (
"Data cannot be decoded! Each row must contain only 0s and 1s"
", and each row may have at most one 1"
)
with pytest.raises(ValueError, match=msg):
pd.from_dummies(df)


@pytest.mark.parametrize(
"prefix_sep, input_dict",
[
("_", {"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]}),
("*", {"col1*a": [1, 1, 0], "col1*b": [0, 0, 1]}),
(".", {"col1.a": [1, 1, 0], "col1.b": [0, 0, 1]}),
],
)
def test_prefix_sep(prefix_sep, input_dict):
df = pd.DataFrame(input_dict)
result = pd.from_dummies(df, prefix_sep=prefix_sep)
expected = pd.DataFrame({"col1": ["a", "a", "b"]}, dtype="category")
tm.assert_frame_equal(result, expected)


def test_no_prefix():
df = pd.DataFrame({"a": [1, 1, 0], "b": [0, 0, 1]})
result = pd.from_dummies(df, prefix="letter")
expected = pd.DataFrame({"letter": ["a", "a", "b"]}, dtype="category")
tm.assert_frame_equal(result, expected)


def test_multiple_columns():
df = pd.DataFrame(
{"col1_a": [1, 0], "col1_b": [0, 1], "col2_a": [0, 0], "col2_c": [1, 1]}
)
result = pd.from_dummies(df)
expected = pd.DataFrame({"col1": ["a", "b"], "col2": ["c", "c"]}, dtype="category")
tm.assert_frame_equal(result, expected)