Skip to content

Fix #61072: inconsistent fullmatch results with regex alternation #61343

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
8 changes: 4 additions & 4 deletions pandas/_libs/tslibs/offsets.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5108,8 +5108,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str:
warnings.warn(
f"\'{name}\' is deprecated and will be removed "
f"in a future version, please use "
f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\'"
f" instead.",
f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' "
f"instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
Expand All @@ -5122,8 +5122,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str:
warnings.warn(
f"\'{name}\' is deprecated and will be removed "
f"in a future version, please use "
f"\'{_name}\'"
f" instead.",
f"\'{_name}\' "
f"instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
Expand Down
91 changes: 81 additions & 10 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,8 +1431,10 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
Determine if each string entirely matches a regular expression.

Checks if each string in the Series or Index fully matches the
specified regular expression pattern. This function is useful when the
requirement is for an entire string to conform to a pattern, such as
specified regular expression pattern.
This function is useful when the
requirement is for an entire string to conform
to a pattern, such as
validating formats like phone numbers or email addresses.

Parameters
Expand All @@ -1458,19 +1460,88 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):

See Also
--------
match : Similar, but also returns `True` when only a *prefix* of the string
matches the regular expression.
extract : Extract matched groups.
re.fullmatch : Match the entire string using a regular expression.

Notes
-----
This method enforces consistent behavior between Python's string dtype
and PyArrow-backed string arrays when using regular expressions
containing alternation (|). For regex
patterns with alternation operators,
the method ensures proper grouping by
wrapping the pattern in parentheses
when using PyArrow-backed string arrays.

Examples
--------
>>> ser = pd.Series(["cat", "duck", "dove"])
>>> ser.str.fullmatch(r"d.+")
0 False
1 True
2 True
>>> s = pd.Series(["foo", "bar", "foobar", ""])
>>> s.str.fullmatch("foo")
0 True
1 False
2 False
3 False
dtype: bool

>>> s.str.fullmatch(".*")
0 True
1 True
2 True
3 True
dtype: bool

Using regular expressions with flags:

>>> import re
>>> s = pd.Series(["FOO", "foo", "FoO"])
>>> s.str.fullmatch("foo", flags=re.IGNORECASE)
0 True
1 True
2 True
dtype: bool
"""
is_pyarrow = False
arr = self._data.array
arr_type = type(arr).__name__
is_pyarrow = arr_type == "ArrowStringArray"
if not is_pyarrow:
is_pyarrow = "Arrow" in arr_type
if not is_pyarrow and hasattr(arr, "dtype"):
dtype_str = str(arr.dtype)
is_pyarrow = (
"pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
)
if is_pyarrow and "|" in pat:

def _is_fully_wrapped(pattern):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to be an improper solution to me. Can you explain why the current code path fails?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have already fixed the error in the pull request, why do u think is it a improper solution?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're introducing character-by-character parsing with a lot of branching logic.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review. From what I can tell, the issue seems to be caused by how pyarrow handles regex alternation. Do you think there’s anything I should do on the pandas side, or is this something that should be fixed upstream?

if not (pattern.startswith("(") and pattern.endswith(")")):
return False
inner = pattern[1:-1]
level = 0
escape = False
in_char_class = False
for char in inner:
if escape:
escape = False
continue
if char == "\\":
escape = True
elif not in_char_class and char == "[":
in_char_class = True
elif in_char_class and char == "]":
in_char_class = False
elif not in_char_class:
if char == "(":
level += 1
elif char == ")":
if level == 0:
return False
level -= 1
return level == 0

if not (
pat.startswith("(") and pat.endswith(")") and _is_fully_wrapped(pat)
):
pat = f"({pat})"
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
return self._wrap_result(result, fill_value=na, returns_string=False)

Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/strings/test_pyarrow_format_behavior.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pytest

from pandas import Series


@pytest.mark.parametrize("dtype", [str])
def test_string_array(dtype):
test_series = Series(["asdf", "as"], dtype=dtype)
regex = r"((as)|(as))"
regex2 = r"(as)|(as)"
assert list(test_series.str.fullmatch(regex)) == [False, True]
assert list(test_series.str.fullmatch(regex2)) == [False, True]


@pytest.mark.parametrize(
"data, pattern, expected",
[
(["cat", "duck", "dove"], r"d.+", [False, True, True]),
],
)
def test_string_match(data, pattern, expected):
ser = Series(data)
assert list(ser.str.fullmatch(pattern)) == expected


@pytest.mark.parametrize("dtype", [str])
@pytest.mark.parametrize(
"pattern, expected",
[
(r"(foo)|((as)(df)?)", [True, True, True]),
("foo|as", [False, True, True]),
],
)
def test_string_alternation_patterns(dtype, pattern, expected):
ser = Series(["asdf", "foo", "as"], dtype=dtype)
assert list(ser.str.fullmatch(pattern)) == expected
Loading