Skip to content

Commit e913413

Browse files
API: change default behaviour of str.match from deprecated extract to match (GH5224)
1 parent be32852 commit e913413

File tree

4 files changed

+37
-95
lines changed

4 files changed

+37
-95
lines changed

doc/source/text.rst

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -364,18 +364,6 @@ or match a pattern:
364364
The distinction between ``match`` and ``contains`` is strictness: ``match``
365365
relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
366366

367-
.. warning::
368-
369-
In previous versions, ``match`` was for *extracting* groups,
370-
returning a not-so-convenient Series of tuples. The new method ``extract``
371-
(described in the previous section) is now preferred.
372-
373-
This old, deprecated behavior of ``match`` is still the default. As
374-
demonstrated above, use the new behavior by setting ``as_indexer=True``.
375-
In this mode, ``match`` is analogous to ``contains``, returning a boolean
376-
Series. The new behavior will become the default behavior in a future
377-
release.
378-
379367
Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
380368
an extra ``na`` argument so missing values can be considered True or False:
381369

doc/source/whatsnew/v0.20.0.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,14 @@ Other API Changes
327327
- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`)
328328
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
329329
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
330-
- ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
330+
- ``DataFrame.asof()`` will return a null filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
331+
- The default behaviour of ``Series.str.match`` has changed from extracting
332+
groups to matching the pattern. The extracting behaviour has been deprecated
333+
since pandas version 0.13.0 and can be done with the ``Series.str.extract``
334+
method (:issue:`5224`).
335+
336+
337+
331338
.. _whatsnew_0200.deprecations:
332339

333340
Deprecations

pandas/core/strings.py

Lines changed: 12 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -429,11 +429,9 @@ def rep(x, r):
429429
return result
430430

431431

432-
def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
432+
def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=None):
433433
"""
434-
Deprecated: Find groups in each string in the Series/Index
435-
using passed regular expression.
436-
If as_indexer=True, determine if each string matches a regular expression.
434+
Determine if each string matches a regular expression.
437435
438436
Parameters
439437
----------
@@ -444,60 +442,33 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
444442
flags : int, default 0 (no flags)
445443
re module flags, e.g. re.IGNORECASE
446444
na : default NaN, fill value for missing values.
447-
as_indexer : False, by default, gives deprecated behavior better achieved
448-
using str_extract. True return boolean indexer.
445+
as_indexer : ignored
449446
450447
Returns
451448
-------
452449
Series/array of boolean values
453-
if as_indexer=True
454-
Series/Index of tuples
455-
if as_indexer=False, default but deprecated
456450
457451
See Also
458452
--------
459453
contains : analogous, but less strict, relying on re.search instead of
460454
re.match
461-
extract : now preferred to the deprecated usage of match (as_indexer=False)
455+
extract : extract matched groups
462456
463-
Notes
464-
-----
465-
To extract matched groups, which is the deprecated behavior of match, use
466-
str.extract.
467457
"""
468-
469458
if not case:
470459
flags |= re.IGNORECASE
471460

472461
regex = re.compile(pat, flags=flags)
473462

474-
if (not as_indexer) and regex.groups > 0:
475-
# Do this first, to make sure it happens even if the re.compile
476-
# raises below.
477-
warnings.warn("In future versions of pandas, match will change to"
478-
" always return a bool indexer.", FutureWarning,
479-
stacklevel=3)
480-
481-
if as_indexer and regex.groups > 0:
482-
warnings.warn("This pattern has match groups. To actually get the"
483-
" groups, use str.extract.", UserWarning, stacklevel=3)
484-
485-
# If not as_indexer and regex.groups == 0, this returns empty lists
486-
# and is basically useless, so we will not warn.
463+
if as_indexer is not None:
464+
# Previously, this keyword was used for changing the default but
465+
# deprecated behaviour. This keyword is now no longer needed.
466+
warnings.warn("'as_indexer' keyword was specified but will be ignored;"
467+
" match now returns a boolean indexer by default.",
468+
UserWarning, stacklevel=3)
487469

488-
if (not as_indexer) and regex.groups > 0:
489-
dtype = object
490-
491-
def f(x):
492-
m = regex.match(x)
493-
if m:
494-
return m.groups()
495-
else:
496-
return []
497-
else:
498-
# This is the new behavior of str_match.
499-
dtype = bool
500-
f = lambda x: bool(regex.match(x))
470+
dtype = bool
471+
f = lambda x: bool(regex.match(x))
501472

502473
return _na_map(f, arr, na, dtype=dtype)
503474

pandas/tests/test_strings.py

Lines changed: 17 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -504,64 +504,39 @@ def test_repeat(self):
504504
exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')])
505505
tm.assert_series_equal(result, exp)
506506

507-
def test_deprecated_match(self):
508-
# Old match behavior, deprecated (but still default) in 0.13
507+
def test_match(self):
508+
# New match behavior introduced in 0.13
509509
values = Series(['fooBAD__barBAD', NA, 'foo'])
510-
511-
with tm.assert_produces_warning():
512-
result = values.str.match('.*(BAD[_]+).*(BAD)')
513-
exp = Series([('BAD__', 'BAD'), NA, []])
514-
tm.assert_series_equal(result, exp)
515-
516-
# mixed
517-
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
518-
'foo', None, 1, 2.])
519-
520-
with tm.assert_produces_warning():
521-
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
522-
xp = Series([('BAD_', 'BAD'), NA, ('BAD_', 'BAD'),
523-
NA, NA, [], NA, NA, NA])
524-
tm.assertIsInstance(rs, Series)
525-
tm.assert_series_equal(rs, xp)
526-
527-
# unicode
528-
values = Series([u('fooBAD__barBAD'), NA, u('foo')])
529-
530-
with tm.assert_produces_warning():
531-
result = values.str.match('.*(BAD[_]+).*(BAD)')
532-
exp = Series([(u('BAD__'), u('BAD')), NA, []])
510+
result = values.str.match('.*(BAD[_]+).*(BAD)')
511+
exp = Series([True, NA, False])
533512
tm.assert_series_equal(result, exp)
534513

535-
def test_match(self):
536-
# New match behavior introduced in 0.13
537514
values = Series(['fooBAD__barBAD', NA, 'foo'])
538-
with tm.assert_produces_warning():
539-
result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
515+
result = values.str.match('.*BAD[_]+.*BAD')
540516
exp = Series([True, NA, False])
541517
tm.assert_series_equal(result, exp)
542518

543-
# If no groups, use new behavior even when as_indexer is False.
544-
# (Old behavior is pretty much useless in this case.)
519+
# test passing as_indexer still works but is ignored
545520
values = Series(['fooBAD__barBAD', NA, 'foo'])
546-
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
547521
exp = Series([True, NA, False])
522+
with tm.assert_produces_warning(UserWarning):
523+
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=True)
524+
tm.assert_series_equal(result, exp)
525+
with tm.assert_produces_warning(UserWarning):
526+
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
548527
tm.assert_series_equal(result, exp)
549528

550529
# mixed
551530
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
552531
'foo', None, 1, 2.])
553-
554-
with tm.assert_produces_warning():
555-
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
532+
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
556533
xp = Series([True, NA, True, NA, NA, False, NA, NA, NA])
557534
tm.assertIsInstance(rs, Series)
558535
tm.assert_series_equal(rs, xp)
559536

560537
# unicode
561538
values = Series([u('fooBAD__barBAD'), NA, u('foo')])
562-
563-
with tm.assert_produces_warning():
564-
result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
539+
result = values.str.match('.*(BAD[_]+).*(BAD)')
565540
exp = Series([True, NA, False])
566541
tm.assert_series_equal(result, exp)
567542

@@ -2555,11 +2530,12 @@ def test_match_findall_flags(self):
25552530

25562531
pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
25572532

2558-
with tm.assert_produces_warning(FutureWarning):
2559-
result = data.str.match(pat, flags=re.IGNORECASE)
2560-
2533+
result = data.str.extract(pat, flags=re.IGNORECASE)
25612534
self.assertEqual(result[0], ('dave', 'google', 'com'))
25622535

2536+
result = data.str.match(pat, flags=re.IGNORECASE)
2537+
self.assertEqual(result[0], True)
2538+
25632539
result = data.str.findall(pat, flags=re.IGNORECASE)
25642540
self.assertEqual(result[0][0], ('dave', 'google', 'com'))
25652541

0 commit comments

Comments
 (0)