Skip to content

Commit 04999d3

Browse files
committed
add regex support to _replace_coerce
1 parent dfee1d9 commit 04999d3

File tree

1 file changed

+61
-49
lines changed

1 file changed

+61
-49
lines changed

pandas/core/internals.py

Lines changed: 61 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -1690,7 +1690,8 @@ def _nanpercentile(values, q, axis, **kw):
16901690
placement=np.arange(len(result)),
16911691
ndim=ndim)
16921692

1693-
def _replace_coerce(self, mask=None, dst=None, convert=False):
1693+
def _replace_coerce(self, mask=None, src=None, dst=None, inplace=True,
1694+
convert=False, regex=False, mgr=None):
16941695
"""
16951696
Replace value corresponding to the given boolean array with another
16961697
value.
@@ -1699,18 +1700,32 @@ def _replace_coerce(self, mask=None, dst=None, convert=False):
16991700
----------
17001701
mask : array_like of bool
17011702
The mask of values to replace.
1703+
src : object
1704+
The value to replace. It is ignored if regex is False.
17021705
dst : object
17031706
The value to be replaced with.
17041707
convert : bool
1705-
It is used in ObjectBlocks. It is here for API compatibility.
1708+
If true, try to coerce any object types to better types.
1709+
regex : bool
1710+
If true, search for element matching with the pattern in src.
1711+
Masked element is ignored.
1712+
mgr : BlockPlacement, optional
17061713
17071714
Returns
17081715
-------
17091716
A new block if there is anything to replace or the original block.
17101717
"""
1718+
17111719
if mask.any():
1712-
self = self.coerce_to_target_dtype(dst)
1713-
return self.putmask(mask, dst, inplace=True)
1720+
if not regex:
1721+
self = self.coerce_to_target_dtype(dst)
1722+
return self.putmask(mask, dst, inplace=inplace)
1723+
else:
1724+
return self._replace_single(src, dst, inplace=inplace,
1725+
regex=regex,
1726+
convert=convert,
1727+
mask=mask,
1728+
mgr=mgr)
17141729
return self
17151730

17161731

@@ -2488,7 +2503,7 @@ def replace(self, to_replace, value, inplace=False, filter=None,
24882503
regex=regex, mgr=mgr)
24892504

24902505
def _replace_single(self, to_replace, value, inplace=False, filter=None,
2491-
regex=False, convert=True, mgr=None):
2506+
regex=False, convert=True, mgr=None, mask=None):
24922507

24932508
inplace = validate_bool_kwarg(inplace, 'inplace')
24942509

@@ -2555,15 +2570,19 @@ def re_replacer(s):
25552570
else:
25562571
filt = self.mgr_locs.isin(filter).nonzero()[0]
25572572

2558-
new_values[filt] = f(new_values[filt])
2573+
if mask is None:
2574+
new_values[filt] = f(new_values[filt])
2575+
else:
2576+
new_values[filt][mask] = f(new_values[filt][mask])
25592577

25602578
# convert
25612579
block = self.make_block(new_values)
25622580
if convert:
25632581
block = block.convert(by_item=True, numeric=False)
25642582
return block
25652583

2566-
def _replace_coerce(self, mask=None, dst=None, convert=False):
2584+
def _replace_coerce(self, mask=None, src=None, dst=None, inplace=True,
2585+
convert=False, regex=False, mgr=None):
25672586
"""
25682587
Replace value corresponding to the given boolean array with another
25692588
value.
@@ -2572,17 +2591,29 @@ def _replace_coerce(self, mask=None, dst=None, convert=False):
25722591
----------
25732592
mask : array_like of bool
25742593
The mask of values to replace.
2594+
src : object
2595+
The value to replace. It is ignored if regex is False.
25752596
dst : object
25762597
The value to be replaced with.
25772598
convert : bool
25782599
If true, try to coerce any object types to better types.
2600+
regex : bool
2601+
If true, search for element matching with the pattern in src.
2602+
Masked element is ignored.
2603+
mgr : BlockPlacement, optional
25792604
25802605
Returns
25812606
-------
25822607
A new block if there is anything to replace or the original block.
25832608
"""
25842609
if mask.any():
2585-
block = super(ObjectBlock, self)._replace_coerce(mask, dst)
2610+
block = super(ObjectBlock, self)._replace_coerce(mask=mask,
2611+
src=src,
2612+
dst=dst,
2613+
inplace=inplace,
2614+
convert=convert,
2615+
regex=regex,
2616+
mgr=mgr)
25862617
if convert:
25872618
block = [b.convert(by_item=True, numeric=False, copy=True)
25882619
for b in block]
@@ -3779,17 +3810,17 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False,
37793810
# figure out our mask a-priori to avoid repeated replacements
37803811
values = self.as_array()
37813812

3782-
# only support equality comparision, regex comparision support
3783-
# is needed in the future
3784-
def comp(s):
3813+
def comp(s, reg=False):
37853814
if isna(s):
37863815
return isna(values)
37873816
if hasattr(s, 'asm8'):
37883817
return _maybe_compare(maybe_convert_objects(values),
3789-
getattr(s, 'asm8'), operator.eq)
3790-
return _maybe_compare(values, s, operator.eq)
3818+
getattr(s, 'asm8'), reg)
3819+
if reg and is_re_compilable(s):
3820+
return _maybe_compare(values, s, reg)
3821+
return _maybe_compare(values, s, reg)
37913822

3792-
masks = [comp(s) for i, s in enumerate(src_list)]
3823+
masks = [comp(s, regex) for i, s in enumerate(src_list)]
37933824

37943825
result_blocks = []
37953826
src_len = len(src_list) - 1
@@ -3801,39 +3832,16 @@ def comp(s):
38013832
for i, (s, d) in enumerate(zip(src_list, dest_list)):
38023833
new_rb = []
38033834
for b in rb:
3804-
# regular expression support needs to be improved.
3805-
# If the replacement for the previous pattern
3806-
# matches the next pattern, the value will be replaced
3807-
# again with a different value from dest_list
3808-
# i.e. when values is ['a', 'b']
3809-
# src_list: [r'a*', r'b*'], dest_list: ['b', 'a']
3810-
# result will be ['b', b'] after searching for pattern r'a'
3811-
# and then changed to ['a', 'a'] for pattern r'b*'
3812-
if regex:
3813-
if is_object_dtype(b.dtype):
3814-
convert = i == src_len
3815-
result = b.replace(s, d, inplace=inplace,
3816-
regex=regex,
3817-
mgr=mgr, convert=convert)
3818-
new_rb = _extend_blocks(result, new_rb)
3819-
else:
3820-
# get our mask for this element, sized to this
3821-
# particular block
3822-
m = masks[i][b.mgr_locs.indexer]
3823-
if m.any():
3824-
b = b.coerce_to_target_dtype(d)
3825-
new_rb.extend(b.putmask(m, d, inplace=True))
3826-
else:
3827-
new_rb.append(b)
3835+
m = masks[i][b.mgr_locs.indexer]
3836+
convert = i == src_len
3837+
result = b._replace_coerce(mask=m, src=s, dst=d,
3838+
inplace=inplace,
3839+
convert=convert, regex=regex,
3840+
mgr=mgr)
3841+
if m.any():
3842+
new_rb = _extend_blocks(result, new_rb)
38283843
else:
3829-
m = masks[i][b.mgr_locs.indexer]
3830-
convert = i == src_len
3831-
result = b._replace_coerce(mask=m, dst=d,
3832-
convert=convert)
3833-
if m.any():
3834-
new_rb = _extend_blocks(result, new_rb)
3835-
else:
3836-
new_rb.append(b)
3844+
new_rb.append(b)
38373845
rb = new_rb
38383846
result_blocks.extend(rb)
38393847

@@ -5207,7 +5215,12 @@ def _vstack(to_stack, dtype):
52075215
return np.vstack(to_stack)
52085216

52095217

5210-
def _maybe_compare(a, b, op):
5218+
def _maybe_compare(a, b, regex=False):
5219+
if not regex:
5220+
op = lambda x: operator.eq(x, b)
5221+
else:
5222+
op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str)
5223+
else False)
52115224

52125225
is_a_array = isinstance(a, np.ndarray)
52135226
is_b_array = isinstance(b, np.ndarray)
@@ -5219,9 +5232,8 @@ def _maybe_compare(a, b, op):
52195232
# numpy deprecation warning if comparing numeric vs string-like
52205233
elif is_numeric_v_string_like(a, b):
52215234
result = False
5222-
52235235
else:
5224-
result = op(a, b)
5236+
result = op(a)
52255237

52265238
if is_scalar(result) and (is_a_array or is_b_array):
52275239
type_names = [type(a).__name__, type(b).__name__]

0 commit comments

Comments
 (0)