API: change default behaviour of str.match from deprecated extract to match (GH5224)

jorisvandenbossche · jorisvandenbossche · commit e91341359742 · 2017-01-29T22:45:02.000+01:00
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -364,18 +364,6 @@ or match a pattern:
 The distinction between ``match`` and ``contains`` is strictness: ``match``
 relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
 
-.. warning::
-
-   In previous versions, ``match`` was for *extracting* groups,
-   returning a not-so-convenient Series of tuples. The new method ``extract``
-   (described in the previous section) is now preferred.
-
-   This old, deprecated behavior of ``match`` is still the default. As
-   demonstrated above, use the new behavior by setting ``as_indexer=True``.
-   In this mode, ``match`` is analogous to ``contains``, returning a boolean
-   Series. The new behavior will become the default behavior in a future
-   release.
-
 Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
  an extra ``na`` argument so missing values can be considered True or False:
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -327,7 +327,14 @@ Other API Changes
 - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`)
 - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
 - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
- - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
+- ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
+- The default behaviour of ``Series.str.match`` has changed from extracting
+  groups to matching the pattern. The extracting behaviour was deprecated
+  since pandas version 0.13.0 and can be done with the ``Series.str.extract``
+  method (:issue:`5224`).
+
+
+
 .. _whatsnew_0200.deprecations:
 
 Deprecations
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -429,11 +429,9 @@ def rep(x, r):
         return result
 
 
-def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
+def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=None):
     """
-    Deprecated: Find groups in each string in the Series/Index
-    using passed regular expression.
-    If as_indexer=True, determine if each string matches a regular expression.
+    Determine if each string matches a regular expression.
 
     Parameters
     ----------
@@ -444,60 +442,33 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
     flags : int, default 0 (no flags)
         re module flags, e.g. re.IGNORECASE
     na : default NaN, fill value for missing values.
-    as_indexer : False, by default, gives deprecated behavior better achieved
-        using str_extract. True return boolean indexer.
+    as_indexer : ignored
 
     Returns
     -------
     Series/array of boolean values
-        if as_indexer=True
-    Series/Index of tuples
-        if as_indexer=False, default but deprecated
 
     See Also
     --------
     contains : analogous, but less strict, relying on re.search instead of
         re.match
-    extract : now preferred to the deprecated usage of match (as_indexer=False)
+    extract : extract matched groups
 
-    Notes
-    -----
-    To extract matched groups, which is the deprecated behavior of match, use
-    str.extract.
     """
-
     if not case:
         flags |= re.IGNORECASE
 
     regex = re.compile(pat, flags=flags)
 
-    if (not as_indexer) and regex.groups > 0:
-        # Do this first, to make sure it happens even if the re.compile
-        # raises below.
-        warnings.warn("In future versions of pandas, match will change to"
-                      " always return a bool indexer.", FutureWarning,
-                      stacklevel=3)
-
-    if as_indexer and regex.groups > 0:
-        warnings.warn("This pattern has match groups. To actually get the"
-                      " groups, use str.extract.", UserWarning, stacklevel=3)
-
-    # If not as_indexer and regex.groups == 0, this returns empty lists
-    # and is basically useless, so we will not warn.
+    if as_indexer is not None:
+        # Previously, this keyword was used for changing the default but
+        # deprecated behaviour. This keyword is now no longer needed.
+        warnings.warn("'as_indexer' keyword was specified but will be ignored;"
+                      " match now returns a boolean indexer by default.",
+                      UserWarning, stacklevel=3)
 
-    if (not as_indexer) and regex.groups > 0:
-        dtype = object
-
-        def f(x):
-            m = regex.match(x)
-            if m:
-                return m.groups()
-            else:
-                return []
-    else:
-        # This is the new behavior of str_match.
-        dtype = bool
-        f = lambda x: bool(regex.match(x))
+    dtype = bool
+    f = lambda x: bool(regex.match(x))
 
     return _na_map(f, arr, na, dtype=dtype)
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -504,64 +504,39 @@ def test_repeat(self):
         exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')])
         tm.assert_series_equal(result, exp)
 
-    def test_deprecated_match(self):
-        # Old match behavior, deprecated (but still default) in 0.13
+    def test_match(self):
+        # New match behavior introduced in 0.13
         values = Series(['fooBAD__barBAD', NA, 'foo'])
-
-        with tm.assert_produces_warning():
-            result = values.str.match('.*(BAD[_]+).*(BAD)')
-        exp = Series([('BAD__', 'BAD'), NA, []])
-        tm.assert_series_equal(result, exp)
-
-        # mixed
-        mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
-                        'foo', None, 1, 2.])
-
-        with tm.assert_produces_warning():
-            rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
-        xp = Series([('BAD_', 'BAD'), NA, ('BAD_', 'BAD'),
-                     NA, NA, [], NA, NA, NA])
-        tm.assertIsInstance(rs, Series)
-        tm.assert_series_equal(rs, xp)
-
-        # unicode
-        values = Series([u('fooBAD__barBAD'), NA, u('foo')])
-
-        with tm.assert_produces_warning():
-            result = values.str.match('.*(BAD[_]+).*(BAD)')
-        exp = Series([(u('BAD__'), u('BAD')), NA, []])
+        result = values.str.match('.*(BAD[_]+).*(BAD)')
+        exp = Series([True, NA, False])
         tm.assert_series_equal(result, exp)
 
-    def test_match(self):
-        # New match behavior introduced in 0.13
         values = Series(['fooBAD__barBAD', NA, 'foo'])
-        with tm.assert_produces_warning():
-            result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+        result = values.str.match('.*BAD[_]+.*BAD')
         exp = Series([True, NA, False])
         tm.assert_series_equal(result, exp)
 
-        # If no groups, use new behavior even when as_indexer is False.
-        # (Old behavior is pretty much useless in this case.)
+        # test passing as_indexer still works but is ignored
         values = Series(['fooBAD__barBAD', NA, 'foo'])
-        result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
         exp = Series([True, NA, False])
+        with tm.assert_produces_warning(UserWarning):
+            result = values.str.match('.*BAD[_]+.*BAD', as_indexer=True)
+        tm.assert_series_equal(result, exp)
+        with tm.assert_produces_warning(UserWarning):
+            result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
         tm.assert_series_equal(result, exp)
 
         # mixed
         mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
                         'foo', None, 1, 2.])
-
-        with tm.assert_produces_warning():
-            rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+        rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
         xp = Series([True, NA, True, NA, NA, False, NA, NA, NA])
         tm.assertIsInstance(rs, Series)
         tm.assert_series_equal(rs, xp)
 
         # unicode
         values = Series([u('fooBAD__barBAD'), NA, u('foo')])
-
-        with tm.assert_produces_warning():
-            result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+        result = values.str.match('.*(BAD[_]+).*(BAD)')
         exp = Series([True, NA, False])
         tm.assert_series_equal(result, exp)
 
@@ -2555,11 +2530,12 @@ def test_match_findall_flags(self):
 
         pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
 
-        with tm.assert_produces_warning(FutureWarning):
-            result = data.str.match(pat, flags=re.IGNORECASE)
-
+        result = data.str.extract(pat, flags=re.IGNORECASE)
         self.assertEqual(result[0], ('dave', 'google', 'com'))
 
+        result = data.str.match(pat, flags=re.IGNORECASE)
+        self.assertEqual(result[0], True)
+
         result = data.str.findall(pat, flags=re.IGNORECASE)
         self.assertEqual(result[0][0], ('dave', 'google', 'com'))