Skip to content

Commit 7da210c

Browse files
committed
Fix multibyte bugs in my_instr_mb (mysql#123) (mysql#136)
The my_instr_mb function makes the incorrect assumption in multiple places that byte lengths for the input strings can be used to short cut certain decisions. In the case of multibyte character sets and collations, this can't be done though since characters with a different byte length can be considered equal under collation rules. Take for example 'a' and 'Å' which are considered equal under the default MySQL 8 collation utf8mb4_0900_ai_ci: mysql> select 'a' = 'Å'; +-------------+ | 'a' = 'Å' | +-------------+ | 1 | +-------------+ 1 row in set (0.00 sec) The character 'a' is though encoded with 1 byte, but 'Å' is encoded with 3 bytes. When using LOCATE, it should also find the character: mysql> select locate('a', 'Å'); +--------------------+ | locate('a', 'Å') | +--------------------+ | 0 | +--------------------+ 1 row in set (0.00 sec) It shows here that it's wrong, and it doesn't consider the character a substring of the other character, which mismatches the behavior seen when checking equality above. This is due to a number of bugs. First of all, there is this check at the beginning: if (s_length <= b_length) { To be explicit, both length variables here are byte lengths, not character lengths of the inputs. When this check does not match and the byte length of needle is longer than the haystack, the assumption is made that no result can be found. This is wrong, because in case here of a needle of 'Å' it will have 3 bytes, but the haystack of 'a' has 1 byte. We still need to search through with the proper collation settings. Therefore the change here removes this check as it's an incorrect optimization trying to short circuit. There's another attempt at optimization here to reduce the maximum string length searched: end = b + b_length - s_length + 1; This is also not correct because of the same reason. We can't assume byte lengths match character lengths, so this is changed to: end = b + b_length to ensure that the whole haystack string is searched. Lastly, there's another bug. The call to strnncoll again assumes that you can truncate the haystack to the byte length. This goes wrong if the needle is 'a' and the haystack is 'Å'. In that case, the haystack string of 'Å' is 3 bytes long and will be cut off at 1 byte. This results in an invalid UTF-8 input and no match on the search. Instead, strnncoll has a prefix option, so we use that and pass in the full string lenghts so that a proper search for a prefix is made. The higher level Item_func_locate function also has a bug where incorrectly byte length is used. The following is incorrect: return start + 1; As start here is just updated from the character position to the byte position. Instead, we need to use start0 here to return the correct offset. These changes resolve the given bugs around multibyte characters and substring searching. Signed-off-by: Dirkjan Bussink <[email protected]>
1 parent c985ed4 commit 7da210c

File tree

4 files changed

+67
-32
lines changed

4 files changed

+67
-32
lines changed

mysql-test/r/func_str.result

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5801,3 +5801,28 @@ SELECT QUOTE(x'80');
58015801
ERROR HY000: Cannot convert string '\x80' from binary to utf8mb4
58025802
SELECT QUOTE(_utf8mb4 x'80');
58035803
ERROR HY000: Invalid utf8mb4 character string: '80'
5804+
#
5805+
# Bug: The locate function does not find correct substring
5806+
#
5807+
set names utf8mb4;
5808+
select locate('a', 'Å');
5809+
locate('a', 'Å')
5810+
1
5811+
select locate('Å', 'a');
5812+
locate('Å', 'a')
5813+
1
5814+
select locate('aÅ', 'aÅ');
5815+
locate('aÅ', 'aÅ')
5816+
1
5817+
select locate('Åa', 'Åa');
5818+
locate('Åa', 'Åa')
5819+
1
5820+
select locate('ss', 'ß');
5821+
locate('ss', 'ß')
5822+
1
5823+
select locate('ß', 'ss');
5824+
locate('ß', 'ss')
5825+
1
5826+
select locate("", "ÅÅ", 2);
5827+
locate("", "ÅÅ", 2)
5828+
2

mysql-test/t/func_str.test

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2628,3 +2628,15 @@ SELECT QUOTE(NULL);
26282628
SELECT QUOTE(x'80');
26292629
--error ER_INVALID_CHARACTER_STRING
26302630
SELECT QUOTE(_utf8mb4 x'80');
2631+
2632+
--echo #
2633+
--echo # Bug: The locate function does not find correct substring
2634+
--echo #
2635+
set names utf8mb4;
2636+
select locate('a', 'Å');
2637+
select locate('Å', 'a');
2638+
select locate('aÅ', 'aÅ');
2639+
select locate('Åa', 'Åa');
2640+
select locate('ss', 'ß');
2641+
select locate('ß', 'ss');
2642+
select locate("", "ÅÅ", 2);

sql/item_func.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4156,7 +4156,7 @@ longlong Item_func_locate::val_int() {
41564156
}
41574157

41584158
if (!b->length()) // Found empty string at start
4159-
return start + 1;
4159+
return start0 + 1;
41604160

41614161
if (!cs->coll->strstr(cs, a->ptr() + start,
41624162
static_cast<uint>(a->length() - start), b->ptr(),

strings/ctype-mb.cc

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -364,42 +364,40 @@ uint my_instr_mb(const CHARSET_INFO *cs, const char *b, size_t b_length,
364364
const char *end, *b0;
365365
int res = 0;
366366

367-
if (s_length <= b_length) {
368-
if (!s_length) {
369-
if (nmatch) {
370-
match->beg = 0;
371-
match->end = 0;
372-
match->mb_len = 0;
373-
}
374-
return 1; /* Empty string is always found */
367+
if (!s_length) {
368+
if (nmatch) {
369+
match->beg = 0;
370+
match->end = 0;
371+
match->mb_len = 0;
375372
}
373+
return 1; /* Empty string is always found */
374+
}
376375

377-
b0 = b;
378-
end = b + b_length - s_length + 1;
379-
380-
while (b < end) {
381-
int mb_len;
382-
383-
if (!cs->coll->strnncoll(cs, pointer_cast<const uchar *>(b), s_length,
384-
pointer_cast<const uchar *>(s), s_length,
385-
false)) {
386-
if (nmatch) {
387-
match[0].beg = 0;
388-
match[0].end = (uint)(b - b0);
389-
match[0].mb_len = res;
390-
if (nmatch > 1) {
391-
match[1].beg = match[0].end;
392-
match[1].end = match[0].end + (uint)s_length;
393-
match[1].mb_len = 0; /* Not computed */
394-
}
376+
b0 = b;
377+
end = b + b_length;
378+
379+
while (b < end) {
380+
int mb_len;
381+
382+
if (!cs->coll->strnncoll(cs, pointer_cast<const uchar *>(b), b_length,
383+
pointer_cast<const uchar *>(s), s_length,
384+
true)) {
385+
if (nmatch) {
386+
match[0].beg = 0;
387+
match[0].end = (uint)(b - b0);
388+
match[0].mb_len = res;
389+
if (nmatch > 1) {
390+
match[1].beg = match[0].end;
391+
match[1].end = match[0].end + (uint)s_length;
392+
match[1].mb_len = 0; /* Not computed */
395393
}
396-
return 2;
397394
}
398-
mb_len = (mb_len = my_ismbchar(cs, b, end)) ? mb_len : 1;
399-
b += mb_len;
400-
b_length -= mb_len;
401-
res++;
395+
return 2;
402396
}
397+
mb_len = (mb_len = my_ismbchar(cs, b, end)) ? mb_len : 1;
398+
b += mb_len;
399+
b_length -= mb_len;
400+
res++;
403401
}
404402
return 0;
405403
}

0 commit comments

Comments
 (0)