-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
ENH: Implement more str accessor methods for ArrowDtype #52614
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
mroeschke
merged 12 commits into
pandas-dev:main
from
mroeschke:enh/str/more_arrow_string
Apr 21, 2023
Merged
Changes from 10 commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
345e812
Add more str arrow functions
mroeschke 8ee20fa
Merge remote-tracking branch 'upstream/main' into enh/str/more_arrow_…
mroeschke a992ea5
Finish functions
mroeschke 4fb0748
finish methods and add tests
mroeschke 30e441e
Merge remote-tracking branch 'upstream/main' into enh/str/more_arrow_…
mroeschke 85f4242
Merge remote-tracking branch 'upstream/main' into enh/str/more_arrow_…
mroeschke 449d23e
Finish implementing
mroeschke d3f4752
Merge remote-tracking branch 'upstream/main' into enh/str/more_arrow_…
mroeschke 555b12f
Fix >3.8 compat
mroeschke eeff7ed
Merge remote-tracking branch 'upstream/main' into enh/str/more_arrow_…
mroeschke 9eb232d
Merge remote-tracking branch 'upstream/main' into enh/str/more_arrow_…
mroeschke c168aa4
Create helper function
mroeschke File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,8 @@ | |
|
||
import operator | ||
import re | ||
import sys | ||
import textwrap | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
|
@@ -10,6 +12,7 @@ | |
Sequence, | ||
cast, | ||
) | ||
import unicodedata | ||
|
||
import numpy as np | ||
|
||
|
@@ -1868,13 +1871,29 @@ def _str_join(self, sep: str): | |
return type(self)(pc.binary_join(self._pa_array, sep)) | ||
|
||
def _str_partition(self, sep: str, expand: bool):
    """
    Split each element at the first occurrence of *sep*.

    Mirrors ``str.partition``: each non-null value becomes a 3-tuple of
    (head, sep, tail). There is no pyarrow compute kernel for this, so each
    value is round-tripped through a Python str via ``as_py()``.
    ``expand`` is accepted for interface compatibility but unused here.
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; partition everything else.
                    None if val.as_py() is None else val.as_py().partition(sep)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_rpartition(self, sep: str, expand: bool):
    """
    Split each element at the last occurrence of *sep*.

    Mirrors ``str.rpartition``: each non-null value becomes a 3-tuple of
    (head, sep, tail). No pyarrow kernel exists, so values fall back to
    Python-level ``str.rpartition`` via ``as_py()``.
    ``expand`` is accepted for interface compatibility but unused here.
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; rpartition everything else.
                    None if val.as_py() is None else val.as_py().rpartition(sep)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_slice( | ||
|
@@ -1964,14 +1983,31 @@ def _str_rstrip(self, to_strip=None): | |
return type(self)(result) | ||
|
||
def _str_removeprefix(self, prefix: str):
    """
    Remove *prefix* from the start of each element, like ``str.removeprefix``.

    Values that do not start with *prefix* are returned unchanged; nulls
    stay null.
    """
    # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
    # starts_with = pc.starts_with(self._pa_array, pattern=prefix)
    # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
    # result = pc.if_else(starts_with, removed, self._pa_array)
    # return type(self)(result)
    if sys.version_info < (3, 9):
        # str.removeprefix only exists on Python >= 3.9, so use the
        # backported helper on older interpreters.
        # NOTE pyupgrade will remove this when we run it with --py39-plus
        # so don't remove the unnecessary `else` statement below
        from pandas.util._str_methods import removeprefix

    else:
        removeprefix = lambda arg, prefix: arg.removeprefix(prefix)
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; strip the prefix from everything else.
                    None
                    if val.as_py() is None
                    else removeprefix(val.as_py(), prefix)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_removesuffix(self, suffix: str): | ||
ends_with = pc.ends_with(self._pa_array, pattern=suffix) | ||
|
@@ -1980,48 +2016,124 @@ def _str_removesuffix(self, suffix: str): | |
return type(self)(result) | ||
|
||
def _str_casefold(self):
    """
    Casefold each element (aggressive lowercasing per ``str.casefold``).

    No pyarrow kernel exists, so each non-null value is converted to a
    Python str via ``as_py()`` and casefolded; nulls stay null.
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    None if val.as_py() is None else val.as_py().casefold()
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_encode(self, encoding: str, errors: str = "strict"):
    """
    Encode each element to bytes using *encoding*, like ``str.encode``.

    Parameters
    ----------
    encoding : str
        Target encoding passed to ``str.encode``.
    errors : str, default "strict"
        Error-handling scheme passed to ``str.encode``.
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; encode everything else.
                    None
                    if val.as_py() is None
                    else val.as_py().encode(encoding, errors)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): | ||
raise NotImplementedError( | ||
"str.extract not supported with pd.ArrowDtype(pa.string())." | ||
) | ||
|
||
def _str_findall(self, pat: str, flags: int = 0):
    """
    Find all non-overlapping matches of *pat* in each element.

    Mirrors ``re.findall``: each non-null value maps to a list of matches.
    The pattern is compiled once, outside the per-value loop.
    """
    regex = re.compile(pat, flags=flags)
    return type(self)(
        pa.chunked_array(
            [
                [
                    None if val.as_py() is None else regex.findall(val.as_py())
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_get_dummies(self, sep: str = "|"):
    """
    One-hot encode *sep*-delimited values.

    Returns
    -------
    tuple
        (boolean array with one column per unique token, sorted list of
        the unique tokens). Null rows map to an all-False row.
    """
    split = pc.split_pattern(self._pa_array, sep).combine_chunks()
    uniques = split.flatten().unique()
    # Sort the vocabulary so column order is deterministic.
    uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
    result_data = []
    for lst in split.to_pylist():
        if lst is None:
            # Null input row -> no token present in any column.
            result_data.append([False] * len(uniques_sorted))
        else:
            # Membership of each vocabulary token in this row's token set.
            res = pc.is_in(uniques_sorted, pa.array(set(lst)))
            result_data.append(res.to_pylist())
    result = type(self)(pa.array(result_data))
    return result, uniques_sorted.to_pylist()
def _str_index(self, sub: str, start: int = 0, end: int | None = None):
    """
    Return the lowest index of *sub* in each element, like ``str.index``.

    Raises
    ------
    ValueError
        Propagated from ``str.index`` when *sub* is not found in a value.
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; look up the substring in everything else.
                    None
                    if val.as_py() is None
                    else val.as_py().index(sub, start, end)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
    """
    Return the highest index of *sub* in each element, like ``str.rindex``.

    Raises
    ------
    ValueError
        Propagated from ``str.rindex`` when *sub* is not found in a value.
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; look up the substring in everything else.
                    None
                    if val.as_py() is None
                    else val.as_py().rindex(sub, start, end)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_normalize(self, form: str):
    """
    Apply Unicode normalization *form* to each element.

    Parameters
    ----------
    form : str
        Normal form passed to ``unicodedata.normalize``
        (e.g. "NFC", "NFKC", "NFD", "NFKD").
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; normalize everything else.
                    None
                    if val.as_py() is None
                    else unicodedata.normalize(form, val.as_py())
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_rfind(self, sub: str, start: int = 0, end=None):
    """
    Return the highest index of *sub* in each element, like ``str.rfind``.

    Unlike ``_str_rindex``, a missing substring yields -1 rather than
    raising (``str.rfind`` semantics).
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    # Preserve nulls; search everything else.
                    None
                    if val.as_py() is None
                    else val.as_py().rfind(sub, start, end)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_split( | ||
|
@@ -2046,14 +2158,32 @@ def _str_rsplit(self, pat: str | None = None, n: int | None = -1): | |
pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) | ||
) | ||
|
||
def _str_translate(self, table: dict[int, str]):
    """
    Map characters of each element through *table*, like ``str.translate``.

    Parameters
    ----------
    table : dict[int, str]
        Translation table mapping Unicode ordinals to replacement strings,
        as produced by e.g. ``str.maketrans``.
    """
    return type(self)(
        pa.chunked_array(
            [
                [
                    None if val.as_py() is None else val.as_py().translate(table)
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
def _str_wrap(self, width: int, **kwargs):
    """
    Wrap each element to lines of at most *width* characters.

    Uses ``textwrap.TextWrapper`` and joins the wrapped lines with
    newlines. Extra ``TextWrapper`` options may be passed via ``**kwargs``.
    """
    kwargs["width"] = width
    # Build the wrapper once; reuse it for every value.
    tw = textwrap.TextWrapper(**kwargs)
    return type(self)(
        pa.chunked_array(
            [
                [
                    None if val.as_py() is None else "\n".join(tw.wrap(val.as_py()))
                    for val in chunk
                ]
                # Iterate per chunk to keep the original chunking structure.
                for chunk in self._pa_array.iterchunks()
            ]
        )
    )
@property | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it looks like this will maintain the chunking structure. is there a reason not to chain these together and end up with a single chunk?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I saw this related issue about ops not maintaining the underlying chunking structure and thought it best to keep it here: #42357