Skip to content

Commit ca049e0

Browse files
committed
Merge branch 'PHP-5.6'
* PHP-5.6: updated NEWS Fixed Bug #53823 (preg_replace: * qualifier on unicode replace garbles the string)
2 parents 95b6575 + 1334722 commit ca049e0

File tree

3 files changed

+87
-4
lines changed

3 files changed

+87
-4
lines changed

ext/pcre/php_pcre.c

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,25 @@ static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce)
233233
}
234234
/* }}} */
235235

236+
/* {{{ static calculate_unit_length */
237+
/* Returns the byte length of the character unit that begins at `start`.
 * When the pattern was compiled with PCRE_UTF8, the lead byte is stepped over and
 * every following byte of the form 10xxxxxx (a UTF-8 continuation byte) is counted
 * together with it; otherwise the unit is a single byte. The result is always >= 1.
 * Assumes valid UTF-8 for PCRE_UTF8: the scan takes no length limit and only stops
 * at the first byte that is not a continuation byte, so such a byte must follow
 * `start` inside the buffer. */
238+
static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
239+
{
240+
int unit_len;
241+
242+
if (pce->compile_options & PCRE_UTF8) {
243+
char *end = start;
244+
245+
/* step past the lead byte first (pre-increment), then skip continuation bytes (10xxxxxx); the loop body is intentionally empty */
246+
while ((*++end & 0xC0) == 0x80);
247+
unit_len = end - start;
248+
} else {
249+
unit_len = 1;
250+
}
251+
return unit_len;
252+
}
253+
/* }}} */
254+
236255
/* {{{ pcre_get_compiled_regex_cache
237256
*/
238257
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
@@ -854,8 +873,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
854873
the start offset, and continue. Fudge the offset values
855874
to achieve this, unless we're already at the end of the string. */
856875
if (g_notempty != 0 && start_offset < subject_len) {
876+
int unit_len = calculate_unit_length(pce, subject + start_offset);
877+
857878
offsets[0] = (int)start_offset;
858-
offsets[1] = (int)(start_offset + 1);
879+
offsets[1] = (int)(start_offset + unit_len);
859880
} else
860881
break;
861882
} else {
@@ -1247,10 +1268,12 @@ PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *su
12471268
the start offset, and continue. Fudge the offset values
12481269
to achieve this, unless we're already at the end of the string. */
12491270
if (g_notempty != 0 && start_offset < subject_len) {
1271+
int unit_len = calculate_unit_length(pce, piece);
1272+
12501273
offsets[0] = start_offset;
1251-
offsets[1] = start_offset + 1;
1252-
memcpy(&result->val[result_len], piece, 1);
1253-
result_len++;
1274+
offsets[1] = start_offset + unit_len;
1275+
memcpy(&result->val[result_len], piece, unit_len);
1276+
result_len += unit_len;
12541277
} else {
12551278
if (!result && subject_str) {
12561279
result = zend_string_copy(subject_str);

ext/pcre/tests/bug53823.phpt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
--TEST--
2+
Bug #53823 - preg_replace: * qualifier on unicode replace garbles the string
3+
--FILE--
4+
<?php
5+
var_dump(preg_replace('/[^\pL\pM]*/iu', '', 'áéíóú'));
6+
// invalid UTF-8
7+
var_dump(preg_replace('/[^\pL\pM]*/iu', '', "\xFCáéíóú"));
8+
var_dump(preg_replace('/[^\pL\pM]*/iu', '', "áéíóú\xFC"));
9+
?>
10+
--EXPECT--
11+
string(10) "áéíóú"
12+
NULL
13+
NULL

ext/pcre/tests/bug66121.phpt

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
--TEST--
2+
Bug #66121 - UTF-8 lookbehinds match bytes instead of characters
3+
--FILE--
4+
<?php
5+
// Sinhala characters
6+
var_dump(preg_replace('/(?<!ක)/u', '*', 'ක'));
7+
var_dump(preg_replace('/(?<!ක)/u', '*', 'ම'));
8+
// English characters
9+
var_dump(preg_replace('/(?<!k)/u', '*', 'k'));
10+
var_dump(preg_replace('/(?<!k)/u', '*', 'm'));
11+
// Sinhala characters
12+
preg_match_all('/(?<!ක)/u', 'ම', $matches, PREG_OFFSET_CAPTURE);
13+
var_dump($matches);
14+
// invalid UTF-8
15+
var_dump(preg_replace('/(?<!ක)/u', '*', "\xFC"));
16+
var_dump(preg_replace('/(?<!ක)/u', '*', "\xFC"));
17+
var_dump(preg_match_all('/(?<!ක)/u', "\xFC", $matches, PREG_OFFSET_CAPTURE));
18+
var_dump(preg_match_all('/(?<!ක)/u', "\xFC", $matches, PREG_OFFSET_CAPTURE));
19+
?>
20+
--EXPECT--
21+
string(4) "*ක"
22+
string(5) "*ම*"
23+
string(2) "*k"
24+
string(3) "*m*"
25+
array(1) {
26+
[0]=>
27+
array(2) {
28+
[0]=>
29+
array(2) {
30+
[0]=>
31+
string(0) ""
32+
[1]=>
33+
int(0)
34+
}
35+
[1]=>
36+
array(2) {
37+
[0]=>
38+
string(0) ""
39+
[1]=>
40+
int(3)
41+
}
42+
}
43+
}
44+
NULL
45+
NULL
46+
bool(false)
47+
bool(false)

0 commit comments

Comments
 (0)