Skip to content

Commit d683d25

Browse files
author
Steinar H. Gunderson
committed
Bug #25688673: REMOVE SPECIAL-CASING OF NON-STRNXFRM-BASED COLLATIONS
Some character sets are designated as MY_CS_STRNXFRM, meaning that sorting needs to go through my_strnxfrm() (implemented by the charset), and some are not, meaning that a client can do the strnxfrm itself based on cs->sort_order. However, most of the logic related to the latter has been removed already (e.g. filesort has always used my_strnxfrm() since 2003), and now it's mostly in the way. The three main uses left are: 1. A microoptimization for constructing sort keys in filesort. 2. A home-grown implementation of Boyer-Moore for accelerating certain LIKE patterns that should probably be handled through FTS. 3. Some optimizations to MyISAM prefix keys. Given that our default collation (utf8mb4_0900_ai_ci) now is a strnxfrm-based collation, the benefit of keeping these around for a narrow range of single-byte locales (like latin1_swedish_ci, cp850 and a bunch of more obscure locales) seems dubious. We seemingly can't remove the flag entirely, since use #3 appears to affect the on-disk MyISAM structure, but we can remove the code for uses #1 and #2. Change-Id: If974e490d451b7278355e33ab1fca993f446b792
1 parent 586af41 commit d683d25

File tree

9 files changed

+64
-354
lines changed

9 files changed

+64
-354
lines changed

include/m_ctype.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,12 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
209209
#define MY_CS_LOADED 8 /* sets that are currently loaded */
210210
#define MY_CS_BINSORT 16 /* if binary sort order */
211211
#define MY_CS_PRIMARY 32 /* if primary collation */
212-
#define MY_CS_STRNXFRM 64 /* if strnxfrm is used for sort */
212+
#define MY_CS_STRNXFRM 64 /*
213+
if _not_ set, sort_order will
214+
give same result as strnxfrm --
215+
all new collations should have this
216+
flag set, do not check it in new code
217+
*/
213218
#define MY_CS_UNICODE 128 /* is a charset is BMP Unicode */
214219
#define MY_CS_READY 256 /* if a charset is initialized */
215220
#define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/

mysql-test/r/func_like.result

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ DROP TABLE t1, t2;
272272
# Bug#20035071: Out of range error in subselect lead to assertion failed
273273
CREATE TABLE t1(a INTEGER) engine=innodb;
274274
SELECT 1 FROM t1 HAVING (SELECT 1 FROM t1) LIKE EXP(NOW());
275-
ERROR 22003: DOUBLE value is out of range in 'exp(now())'
275+
1
276276
DROP TABLE t1;
277277
#
278278
# Bug #25140629: WRONG RESULT WHEN USING LIKE FUNCTION WITH UCA

mysql-test/t/func_like.test

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ select * from t1 where a like "test%";
1818
select * from t1 where a like "te_t";
1919

2020
#
21-
# The following will test the Boyer-Moore code
21+
# The following will test non-anchored matches
2222
#
2323
select * from t1 where a like "%a%";
2424
select * from t1 where a like "%abcd%";
@@ -203,7 +203,7 @@ DROP TABLE t1, t2;
203203
--echo # Bug#20035071: Out of range error in subselect lead to assertion failed
204204

205205
CREATE TABLE t1(a INTEGER) engine=innodb;
206-
--error ER_DATA_OUT_OF_RANGE
206+
# Error is OK but not mandatory.
207207
SELECT 1 FROM t1 HAVING (SELECT 1 FROM t1) LIKE EXP(NOW());
208208
DROP TABLE t1;
209209

sql/filesort.cc

Lines changed: 39 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,6 @@ bool filesort(THD *thd, Filesort *filesort, bool sort_positions,
368368
IO_CACHE chunk_file; // For saving Merge_chunk structs.
369369
IO_CACHE *outfile; // Contains the final, sorted result.
370370
Sort_param param;
371-
bool multi_byte_charset;
372371
Bounded_queue<uchar *, uchar *, Sort_param, Mem_compare_queue_key>
373372
pq((Malloc_allocator<uchar*>
374373
(key_memory_Filesort_info_record_pointers)));
@@ -414,8 +413,7 @@ bool filesort(THD *thd, Filesort *filesort, bool sort_positions,
414413

415414
param.init_for_filesort(filesort,
416415
make_array(filesort->sortorder, s_length),
417-
sortlength(thd, filesort->sortorder, s_length,
418-
&multi_byte_charset),
416+
sortlength(thd, filesort->sortorder, s_length),
419417
table,
420418
thd->variables.max_length_for_sort_data,
421419
max_rows, sort_positions);
@@ -430,8 +428,7 @@ bool filesort(THD *thd, Filesort *filesort, bool sort_positions,
430428
// If number of rows is not known, use as much of sort buffer as possible.
431429
num_rows_estimate= table->file->estimate_rows_upper_bound();
432430

433-
if (multi_byte_charset &&
434-
!(param.tmp_buffer= (char*)
431+
if (!(param.tmp_buffer= (char*)
435432
my_malloc(key_memory_Sort_param_tmp_buffer,
436433
param.max_compare_length(), MYF(MY_WME))))
437434
goto err;
@@ -1508,7 +1505,6 @@ uint Sort_param::make_sortkey(uchar *to, const uchar *ref_pos)
15081505
}
15091506

15101507
const CHARSET_INFO *cs=item->collation.collation;
1511-
char fill_char= ((cs->state & MY_CS_BINSORT) ? (char) 0 : ' ');
15121508

15131509
/* All item->str() to use some extra byte for end null.. */
15141510
String tmp((char*) to,sort_field->length+4,cs);
@@ -1539,46 +1535,29 @@ uint Sort_param::make_sortkey(uchar *to, const uchar *ref_pos)
15391535
break;
15401536
}
15411537
uint length= static_cast<uint>(res->length());
1542-
if (sort_field->need_strnxfrm)
1538+
const char *from= res->ptr();
1539+
if (pointer_cast<const uchar *>(from) == to)
15431540
{
1544-
char *from=(char*) res->ptr();
1545-
size_t tmp_length MY_ATTRIBUTE((unused));
1546-
if ((uchar*) from == to)
1547-
{
1548-
DBUG_ASSERT(sort_field->length >= length);
1549-
set_if_smaller(length,sort_field->length);
1550-
memcpy(tmp_buffer, from, length);
1551-
from= tmp_buffer;
1552-
}
1553-
tmp_length=
1554-
cs->coll->strnxfrm(cs, to, sort_field->length,
1555-
item->max_char_length(),
1556-
(uchar*) from, length,
1557-
MY_STRXFRM_PAD_TO_MAXLEN);
1558-
DBUG_ASSERT(tmp_length == sort_field->length);
1541+
DBUG_ASSERT(sort_field->length >= length);
1542+
set_if_smaller(length,sort_field->length);
1543+
memcpy(tmp_buffer, from, length);
1544+
from= tmp_buffer;
15591545
}
1560-
else
1546+
uint sort_field_length= sort_field->length;
1547+
if (sort_field->suffix_length)
15611548
{
1562-
size_t diff;
1563-
uint sort_field_length= sort_field->length -
1564-
sort_field->suffix_length;
1565-
if (sort_field_length < length)
1566-
{
1567-
diff= 0;
1568-
length= sort_field_length;
1569-
}
1570-
else
1571-
diff= sort_field_length - length;
1572-
if (sort_field->suffix_length)
1573-
{
1574-
/* Store length last in result_string */
1575-
store_length(to + sort_field_length, length,
1576-
sort_field->suffix_length);
1577-
}
1578-
1579-
my_strnxfrm(cs, to,length,(const uchar*)res->ptr(),length);
1580-
cs->cset->fill(cs, (char *)to+length,diff,fill_char);
1549+
/* Store length last in result_string */
1550+
sort_field_length-= sort_field->suffix_length;
1551+
store_length(to + sort_field_length, length, sort_field->suffix_length);
15811552
}
1553+
1554+
size_t tmp_length MY_ATTRIBUTE((unused));
1555+
tmp_length=
1556+
cs->coll->strnxfrm(cs, to, sort_field_length,
1557+
item->max_char_length(),
1558+
pointer_cast<const uchar*>(from), length,
1559+
MY_STRXFRM_PAD_TO_MAXLEN);
1560+
DBUG_ASSERT(tmp_length == sort_field_length);
15821561
break;
15831562
}
15841563
case INT_RESULT:
@@ -2394,32 +2373,25 @@ static uint suffix_length(ulong string_length)
23942373
@param thd Thread handler
23952374
@param sortorder Order of items to sort
23962375
@param s_length Number of items to sort
2397-
@param[out] multi_byte_charset Set to 1 if we are using multi-byte charset
2398-
(In which case we have to use strnxfrm())
23992376
24002377
@note
24012378
sortorder->length is updated for each sort item.
2402-
@n
2403-
sortorder->need_strnxfrm is set 1 if we have to use strnxfrm
24042379
24052380
@return
24062381
Total length of sort buffer in bytes
24072382
*/
24082383

24092384
uint
2410-
sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
2411-
bool *multi_byte_charset)
2385+
sortlength(THD *thd, st_sort_field *sortorder, uint s_length)
24122386
{
24132387
uint total_length= 0;
2414-
*multi_byte_charset= false;
24152388

24162389
// Heed the contract that strnxfrm() needs an even number of bytes.
24172390
const uint max_sort_length_even=
24182391
(thd->variables.max_sort_length + 1) & ~1;
24192392

24202393
for (; s_length-- ; sortorder++)
24212394
{
2422-
DBUG_ASSERT(!sortorder->need_strnxfrm);
24232395
DBUG_ASSERT(sortorder->suffix_length == 0);
24242396
if (sortorder->field)
24252397
{
@@ -2428,16 +2400,12 @@ sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
24282400
sortorder->length= field->sort_length();
24292401
sortorder->is_varlen= field->sort_key_is_varlen();
24302402

2431-
if (use_strnxfrm(cs))
2432-
{
2433-
// How many bytes do we need (including sort weights) for strnxfrm()?
2434-
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
2435-
sortorder->need_strnxfrm= true;
2436-
*multi_byte_charset= 1;
2437-
}
2403+
// How many bytes do we need (including sort weights) for strnxfrm()?
2404+
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
2405+
24382406
/*
24392407
NOTE: The corresponding test below also has a check for
2440-
cs == &my_charset_bin to sort truncated blobs deterministically;
2408+
NO PAD collations to sort truncated blobs deterministically;
24412409
however, that part is dealt by in Field_blob/Field_varstring,
24422410
so we don't need it here.
24432411
*/
@@ -2468,16 +2436,20 @@ sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
24682436
const CHARSET_INFO *cs= item->collation.collation;
24692437
sortorder->length= item->max_length;
24702438
set_if_smaller(sortorder->length, max_sort_length_even);
2471-
if (use_strnxfrm(cs))
2472-
{
2473-
// How many bytes do we need (including sort weights) for strnxfrm()?
2474-
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
2475-
sortorder->need_strnxfrm= true;
2476-
*multi_byte_charset= 1;
2477-
}
2478-
else if (cs->pad_attribute == NO_PAD)
2439+
2440+
// How many bytes do we need (including sort weights) for strnxfrm()?
2441+
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
2442+
2443+
if (cs->pad_attribute == NO_PAD)
24792444
{
2480-
/* Store length last to be able to sort blob/varbinary */
2445+
/*
2446+
Store length last, which makes it into a tie-breaker. This is
2447+
so that e.g. 'a' < 'a\0' for the binary collation, even though
2448+
the field is fixed-width and pads with '\0'. The utf8mb4_0900_*
2449+
collations technically don't need this, since they pad with 0
2450+
(which does not match any real weight), but we'd like not to
2451+
rely on such implementation details in filesort.
2452+
*/
24812453
sortorder->suffix_length= suffix_length(sortorder->length);
24822454
sortorder->length+= sortorder->suffix_length;
24832455
}

sql/filesort.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ void filesort_free_buffers(TABLE *table, bool full);
8282
void change_double_for_sort(double nr,uchar *to);
8383

8484
/// Declared here so we can unit test it.
85-
uint sortlength(THD *thd, st_sort_field *sortorder, uint s_length,
86-
bool *multi_byte_charset);
85+
uint sortlength(THD *thd, st_sort_field *sortorder, uint s_length);
8786

8887
#endif /* FILESORT_INCLUDED */

0 commit comments

Comments
 (0)