Implement DataFrame.value_counts #27350

Closed · wants to merge 20 commits · changes shown from 16 commits
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
@@ -176,6 +176,7 @@ Computations / descriptive stats
DataFrame.std
DataFrame.var
DataFrame.nunique
DataFrame.value_counts

Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
@@ -67,7 +67,7 @@ Other API changes
^^^^^^^^^^^^^^^^^

- :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`)
-
- Added :meth:`pandas.core.frame.DataFrame.value_counts` (:issue:`5377`).
-

.. _whatsnew_1000.deprecations:
128 changes: 127 additions & 1 deletion pandas/core/frame.py
@@ -90,7 +90,7 @@
from pandas.core.index import Index, ensure_index, ensure_index_from_sequences
from pandas.core.indexes import base as ibase
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.multi import maybe_droplevels, MultiIndex
Member: `MultiIndex` should go before `maybe_droplevels`.

Author: Done.
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
@@ -8417,6 +8417,132 @@ def isin(self, values):
self.columns,
)

def value_counts(
self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
):
"""
Return a Series containing counts of unique rows in the DataFrame.

.. versionadded:: 1.0.0

The returned Series will have a MultiIndex with one level per input
column.

Contributor: We need a `subset=` argument (as the first argument) to define the columns to group on, defaulting to all columns; this is in line with many other DataFrame methods.

By default, rows that contain any NaN value are omitted from the
results.

By default, the resulting series will be in descending order so that the
first element is the most frequently-occurring row.

Parameters
----------
normalize : bool, default False
If True, the returned Series will contain the relative
frequencies of the unique rows instead of their counts.
sort : bool, default True
Sort by frequencies.
ascending : bool, default False
Sort in ascending order.
bins : int, optional
Rather than count values, group them into half-open bins,
a convenience for ``pd.cut``; only works with single-column numeric
data.
dropna : bool, default True
Don't include counts of rows containing NaN values.

Returns
-------
counts : Series

See Also
--------
Series.value_counts: Equivalent method on Series.
Contributor: We have more options on Series.value_counts, `dropna` for example; these need to be implemented.

Author: There's no option in groupby to not drop rows containing a NaN. How do I go about implementing that case?

Member: I would be OK with raising a NotImplementedError for that case.

Author: Added. This changed the method pretty significantly. PTAL.

Author: The single-column case now works, but the code raises NotImplementedError for the multi-column case.


Examples
--------

>>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
... 'num_wings': [2, 0, 0, 0]},
... index=['falcon', 'dog', 'cat', 'ant'])
>>> df
num_legs num_wings
falcon 2 2
dog 4 0
cat 4 0
ant 6 0

>>> df.value_counts()
num_legs num_wings
4 0 2
6 0 1
2 2 1
dtype: int64

>>> df.value_counts(sort=False)
num_legs num_wings
2 2 1
4 0 2
6 0 1
dtype: int64

>>> df.value_counts(ascending=True)
num_legs num_wings
2 2 1
6 0 1
4 0 2
dtype: int64

>>> df.value_counts(normalize=True)
num_legs num_wings
4 0 0.50
6 0 0.25
2 2 0.25
dtype: float64

>>> single_col_df = df[['num_legs']]
>>> single_col_df.value_counts(bins=4)
num_legs
(3.0, 4.0] 2
(5.0, 6.0] 1
(1.995, 3.0] 1
(4.0, 5.0] 0
dtype: int64
"""

# Delegate to Series.value_counts for single-column data frames.
Member: Hmm @TomAugspurger, this is the special casing you were referring to, right? Sorry, kind of tough to tell from the history at this point. Is there a particular reason why we would want to do this? I think the type of the output should match the input.

Contributor: I think the problem was that groupby returns a regular Index when you're grouping by a single column, and in all other cases you get a MultiIndex. So we need some kind of special condition for Series to ensure we get a 1-level MultiIndex back. I think it'd be clearer to just do `counts = self.groupby(self.columns.tolist()).size()` and then follow it up with:

if len(self.columns) == 1:
    counts.index = pd.MultiIndex.from_arrays([counts.index])

Author: Done. This was leftover from the special casing of parameters for single columns.
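The asymmetry the reviewers describe can be seen directly; a minimal sketch of groupby's index behavior, using the docstring's example frame:

```python
import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]})

# Grouping by two keys puts a MultiIndex on the result...
multi = df.groupby(["num_legs", "num_wings"]).size()

# ...but grouping by a single key yields a flat Index.
flat = df.groupby(["num_legs"]).size()

# Wrapping the flat Index, as the reviewer suggests, restores a
# consistent 1-level MultiIndex regardless of column count.
fixed = flat.copy()
fixed.index = pd.MultiIndex.from_arrays([flat.index])
```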

if len(self.columns) == 1:
series = self[self.columns[0]].value_counts(
normalize=normalize,
sort=sort,
ascending=ascending,
bins=bins,
dropna=dropna,
)
# Move series name into its index, as happens in multi-column case.
return Series(
data=series.values,
index=MultiIndex.from_arrays(
[series.index.values], names=[series.name]
),
)

# Some features are only supported for single-column data.
Contributor: I think these checks should be done first. I'd rather have

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df.value_counts(dropna=False)
df[['A']].value_counts(dropna=False)

both raise. That way, the behavior doesn't depend on the shape.

Author: Done. I had to keep the special casing of single columns, since otherwise you get an index that's not a MultiIndex.

if not dropna:
raise NotImplementedError(
"`dropna=False` not yet supported for multi-column dataframes."
)
if bins is not None:
raise ValueError(
"`bins` parameter not supported for multi-column dataframes."
)

counts = self.groupby(self.columns.tolist()).size()
Contributor: `subset` gets incorporated here; it must be list-like (or None).
if sort:
counts.sort_values(ascending=ascending, inplace=True)
if normalize:
counts /= counts.sum()
return counts
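Putting the pieces of the diff together, the algorithm can be sketched as a standalone function. The `subset` parameter is the reviewer's request, not something the diff above implements, so its handling here is hypothetical; the sketch also assumes a nonempty frame.

```python
import pandas as pd

def value_counts_sketch(df, subset=None, normalize=False, sort=True,
                        ascending=False, dropna=True):
    # `subset` (requested in review, hypothetical here) restricts
    # counting to the given columns; default is all columns.
    if subset is None:
        subset = df.columns.tolist()
    if not dropna:
        # Matches the diff: groupby drops NaN keys, so this case
        # raises rather than silently miscounting.
        raise NotImplementedError("`dropna=False` not yet supported.")

    # Count unique rows by grouping on the chosen columns.
    counts = df.groupby(subset).size()
    if len(subset) == 1:
        # groupby on one key yields a flat Index; wrap it so the
        # result always carries a MultiIndex.
        counts.index = pd.MultiIndex.from_arrays([counts.index])
    if sort:
        counts = counts.sort_values(ascending=ascending)
    if normalize:
        counts = counts / counts.sum()
    return counts
```

On the docstring's example frame this reproduces the documented counts (2, 1, 1) and, with `normalize=True`, the relative frequencies (0.50, 0.25, 0.25).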

# ----------------------------------------------------------------------
# Add plotting methods to DataFrame
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
120 changes: 120 additions & 0 deletions pandas/tests/frame/test_analytics.py
@@ -2766,3 +2766,123 @@ def test_multiindex_column_lookup(self):
result = df.nlargest(3, ("x", "b"))
expected = df.iloc[[3, 2, 1]]
tm.assert_frame_equal(result, expected)

Contributor: Go ahead and make a new file, test_value_counts.py, and put it in pandas/tests/frame/analytics/test_value_counts.py (we will split/move analytics later).

def test_data_frame_value_counts_unsorted(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
result = df.value_counts(sort=False)
expected = pd.Series(
data=[1, 2, 1],
index=pd.MultiIndex.from_arrays(
[(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
),
)
tm.assert_series_equal(result, expected)

def test_data_frame_value_counts_ascending(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
result = df.value_counts(ascending=True)
expected = pd.Series(
data=[1, 1, 2],
index=pd.MultiIndex.from_arrays(
[(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
),
)
tm.assert_series_equal(result, expected)

def test_data_frame_value_counts_default(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
result = df.value_counts()
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
),
)
tm.assert_series_equal(result, expected)

def test_data_frame_value_counts_normalize(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
result = df.value_counts(normalize=True)
expected = pd.Series(
data=[0.5, 0.25, 0.25],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
),
)
tm.assert_series_equal(result, expected)

def test_data_frame_value_counts_dropna_not_supported_yet(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
with pytest.raises(NotImplementedError, match="not yet supported"):
df.value_counts(dropna=False)

def test_data_frame_value_counts_bins_not_supported(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
with pytest.raises(ValueError, match="not supported"):
df.value_counts(bins=2)

def test_data_frame_value_counts_single_col_default(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
df_single_col = df[["num_legs"]]
result = df_single_col.value_counts()
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]),
)
tm.assert_series_equal(result, expected)

def test_data_frame_value_counts_single_col_bins(self):
df = pd.DataFrame(
{"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
index=["falcon", "dog", "cat", "ant"],
)
df_single_col = df[["num_legs"]]
result = df_single_col.value_counts(bins=4)
expected = pd.Series(
data=[2, 1, 1, 0],
index=pd.MultiIndex.from_arrays(
[
[
pd.Interval(3, 4),
pd.Interval(5, 6),
pd.Interval(1.995, 3),
pd.Interval(4, 5),
]
],
names=["num_legs"],
),
)
tm.assert_series_equal(result, expected)

def test_data_frame_value_counts_empty(self):
df_no_cols = pd.DataFrame()
result = df_no_cols.value_counts()
expected = pd.Series([], dtype=np.int64)
tm.assert_series_equal(result, expected)

def test_data_frame_value_counts_empty_normalize(self):
df_no_cols = pd.DataFrame()
result = df_no_cols.value_counts(normalize=True)
expected = pd.Series([], dtype=np.float64)
tm.assert_series_equal(result, expected)