Skip to content

Commit 13c3277

Browse files
author
Alexander Barkov
committed
Backporting WL#1213
1 parent b5936f7 commit 13c3277

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+19307
-1074
lines changed

config/ac-macros/character_sets.m4

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ define(CHARSETS_AVAILABLE1,armscii8 ascii big5 cp1250 cp1251 cp1256 cp1257)
1313
define(CHARSETS_AVAILABLE2,cp850 cp852 cp866 cp932 dec8 eucjpms euckr gb2312 gbk geostd8)
1414
define(CHARSETS_AVAILABLE3,greek hebrew hp8 keybcs2 koi8r koi8u)
1515
define(CHARSETS_AVAILABLE4,latin1 latin2 latin5 latin7 macce macroman)
16-
define(CHARSETS_AVAILABLE5,sjis swe7 tis620 ucs2 ujis utf8)
16+
define(CHARSETS_AVAILABLE5,sjis swe7 tis620 ucs2 ujis utf8mb4 utf8 utf16 utf32)
1717

1818
DEFAULT_CHARSET=latin1
1919
CHARSETS_AVAILABLE="CHARSETS_AVAILABLE0 CHARSETS_AVAILABLE1 CHARSETS_AVAILABLE2 CHARSETS_AVAILABLE3 CHARSETS_AVAILABLE4 CHARSETS_AVAILABLE5"
20-
CHARSETS_COMPLEX="big5 cp1250 cp932 eucjpms euckr gb2312 gbk latin1 latin2 sjis tis620 ucs2 ujis utf8"
20+
CHARSETS_COMPLEX="big5 cp1250 cp932 eucjpms euckr gb2312 gbk latin1 latin2 sjis tis620 ucs2 ujis utf8mb4 utf8 utf16 utf32"
2121

2222
AC_DIVERT_POP
2323

@@ -50,7 +50,7 @@ AC_ARG_WITH(extra-charsets,
5050

5151
AC_MSG_CHECKING("character sets")
5252

53-
CHARSETS="$default_charset latin1 utf8"
53+
CHARSETS="$default_charset latin1 utf8mb4 utf8"
5454

5555
if test "$extra_charsets" = no; then
5656
CHARSETS="$CHARSETS"
@@ -195,8 +195,23 @@ do
195195
AC_DEFINE([USE_MB], [1], [Use multi-byte character routines])
196196
AC_DEFINE(USE_MB_IDENT, 1)
197197
;;
198+
utf8mb4)
199+
AC_DEFINE(HAVE_CHARSET_utf8mb4, 1, [Define to enable utf8mb4])
200+
AC_DEFINE([USE_MB], 1, [Use multi-byte character routines])
201+
AC_DEFINE(USE_MB_IDENT, 1)
202+
;;
198203
utf8)
199-
AC_DEFINE(HAVE_CHARSET_utf8, 1, [Define to enable ut8])
204+
AC_DEFINE(HAVE_CHARSET_utf8, 1, [Define to enable utf8])
205+
AC_DEFINE([USE_MB], 1, [Use multi-byte character routines])
206+
AC_DEFINE(USE_MB_IDENT, 1)
207+
;;
208+
utf16)
209+
AC_DEFINE(HAVE_CHARSET_utf16, 1, [Define to enable utf16])
210+
AC_DEFINE([USE_MB], 1, [Use multi-byte character routines])
211+
AC_DEFINE(USE_MB_IDENT, 1)
212+
;;
213+
utf32)
214+
AC_DEFINE(HAVE_CHARSET_utf32, 1, [Define to enable utf32])
200215
AC_DEFINE([USE_MB], 1, [Use multi-byte character routines])
201216
AC_DEFINE(USE_MB_IDENT, 1)
202217
;;
@@ -381,6 +396,48 @@ case $default_charset in
381396
fi
382397
default_charset_collations="$UTFC"
383398
;;
399+
utf8mb4)
400+
default_charset_default_collation="utf8mb4_general_ci"
401+
define(UTFC1, utf8mb4_general_ci utf8mb4_bin)
402+
define(UTFC2, utf8mb4_czech_ci utf8mb4_danish_ci)
403+
define(UTFC3, utf8mb4_esperanto_ci utf8mb4_estonian_ci utf8mb4_hungarian_ci)
404+
define(UTFC4, utf8mb4_icelandic_ci utf8mb4_latvian_ci utf8mb4_lithuanian_ci)
405+
define(UTFC5, utf8mb4_persian_ci utf8mb4_polish_ci utf8mb4_romanian_ci)
406+
define(UTFC6, utf8mb4_sinhala_ci utf8mb4_slovak_ci utf8mb4_slovenian_ci)
407+
define(UTFC7, utf8mb4_spanish2_ci utf8mb4_spanish_ci)
408+
define(UTFC8, utf8mb4_swedish_ci utf8mb4_turkish_ci)
409+
define(UTFC9, utf8mb4_unicode_ci)
410+
UTFC="UTFC1 UTFC2 UTFC3 UTFC4 UTFC5 UTFC6 UTFC7 UTFC8 UTFC9"
411+
default_charset_collations="$UTFC"
412+
;;
413+
utf16)
414+
default_charset_default_collation="utf16_general_ci"
415+
define(UTFC1, utf16_general_ci utf16_bin)
416+
define(UTFC2, utf16_czech_ci utf16_danish_ci)
417+
define(UTFC3, utf16_esperanto_ci utf16_estonian_ci utf16_hungarian_ci)
418+
define(UTFC4, utf16_icelandic_ci utf16_latvian_ci utf16_lithuanian_ci)
419+
define(UTFC5, utf16_persian_ci utf16_polish_ci utf16_romanian_ci)
420+
define(UTFC6, utf16_sinhala_ci utf16_slovak_ci utf16_slovenian_ci)
421+
define(UTFC7, utf16_spanish2_ci utf16_spanish_ci)
422+
define(UTFC8, utf16_swedish_ci utf16_turkish_ci)
423+
define(UTFC9, utf16_unicode_ci)
424+
UTFC="UTFC1 UTFC2 UTFC3 UTFC4 UTFC5 UTFC6 UTFC7 UTFC8 UTFC9"
425+
default_charset_collations="$UTFC"
426+
;;
427+
utf32)
428+
default_charset_default_collation="utf32_general_ci"
429+
define(UTFC1, utf32_general_ci utf32_bin)
430+
define(UTFC2, utf32_czech_ci utf32_danish_ci)
431+
define(UTFC3, utf32_esperanto_ci utf32_estonian_ci utf32_hungarian_ci)
432+
define(UTFC4, utf32_icelandic_ci utf32_latvian_ci utf32_lithuanian_ci)
433+
define(UTFC5, utf32_persian_ci utf32_polish_ci utf32_romanian_ci)
434+
define(UTFC6, utf32_sinhala_ci utf32_slovak_ci utf32_slovenian_ci)
435+
define(UTFC7, utf32_spanish2_ci utf32_spanish_ci)
436+
define(UTFC8, utf32_swedish_ci utf32_turkish_ci)
437+
define(UTFC9, utf32_unicode_ci)
438+
UTFC="UTFC1 UTFC2 UTFC3 UTFC4 UTFC5 UTFC6 UTFC7 UTFC8 UTFC9"
439+
default_charset_collations="$UTFC"
440+
;;
384441
*)
385442
AC_MSG_ERROR([Charset $cs not available. (Available are: $CHARSETS_AVAILABLE).
386443
See the Installation chapter in the Reference Manual.])

include/config-win.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,9 @@ inline ulonglong double2ulonglong(double d)
432432
#define HAVE_CHARSET_ucs2 1
433433
#define HAVE_CHARSET_ujis 1
434434
#define HAVE_CHARSET_utf8 1
435+
#define HAVE_CHARSET_utf8mb4 1
436+
#define HAVE_CHARSET_utf16 1
437+
#define HAVE_CHARSET_utf32 1
435438

436439
#define HAVE_UCA_COLLATIONS 1
437440
#define HAVE_BOOL 1

include/m_ctype.h

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,21 +98,21 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
9898
#define MY_CS_BINSORT 16 /* if binary sort order */
9999
#define MY_CS_PRIMARY 32 /* if primary collation */
100100
#define MY_CS_STRNXFRM 64 /* if strnxfrm is used for sort */
101-
#define MY_CS_UNICODE 128 /* is a charset is full unicode */
101+
#define MY_CS_UNICODE 128 /* if a charset is BMP Unicode */
102102
#define MY_CS_READY 256 /* if a charset is initialized */
103103
#define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/
104104
#define MY_CS_CSSORT 1024 /* if case sensitive sort order */
105105
#define MY_CS_HIDDEN 2048 /* don't display in SHOW */
106106
#define MY_CS_PUREASCII 4096 /* if a charset is pure ascii */
107107
#define MY_CS_NONASCII 8192 /* if not ASCII-compatible */
108+
#define MY_CS_UNICODE_SUPPLEMENT 16384 /* Non-BMP Unicode characters */
108109
#define MY_CHARSET_UNDEFINED 0
109110

110111
/* Character repertoire flags */
111112
#define MY_REPERTOIRE_ASCII 1 /* Pure ASCII U+0000..U+007F */
112113
#define MY_REPERTOIRE_EXTENDED 2 /* Extended characters: U+0080..U+FFFF */
113114
#define MY_REPERTOIRE_UNICODE30 3 /* ASCII | EXTENDED: U+0000..U+FFFF */
114115

115-
116116
typedef struct my_uni_idx_st
117117
{
118118
uint16 from;
@@ -304,10 +304,14 @@ typedef struct charset_info_st
304304

305305

306306
extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_bin;
307+
extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_latin1;
308+
extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_filename;
309+
307310
extern CHARSET_INFO my_charset_big5_chinese_ci;
308311
extern CHARSET_INFO my_charset_big5_bin;
309312
extern CHARSET_INFO my_charset_cp932_japanese_ci;
310313
extern CHARSET_INFO my_charset_cp932_bin;
314+
extern CHARSET_INFO my_charset_cp1250_czech_ci;
311315
extern CHARSET_INFO my_charset_eucjpms_japanese_ci;
312316
extern CHARSET_INFO my_charset_eucjpms_bin;
313317
extern CHARSET_INFO my_charset_euckr_korean_ci;
@@ -316,7 +320,6 @@ extern CHARSET_INFO my_charset_gb2312_chinese_ci;
316320
extern CHARSET_INFO my_charset_gb2312_bin;
317321
extern CHARSET_INFO my_charset_gbk_chinese_ci;
318322
extern CHARSET_INFO my_charset_gbk_bin;
319-
extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_latin1;
320323
extern CHARSET_INFO my_charset_latin1_german2_ci;
321324
extern CHARSET_INFO my_charset_latin1_bin;
322325
extern CHARSET_INFO my_charset_latin2_czech_ci;
@@ -329,11 +332,22 @@ extern CHARSET_INFO my_charset_ucs2_bin;
329332
extern CHARSET_INFO my_charset_ucs2_unicode_ci;
330333
extern CHARSET_INFO my_charset_ujis_japanese_ci;
331334
extern CHARSET_INFO my_charset_ujis_bin;
335+
extern CHARSET_INFO my_charset_utf16_bin;
336+
extern CHARSET_INFO my_charset_utf16_general_ci;
337+
extern CHARSET_INFO my_charset_utf16_unicode_ci;
338+
extern CHARSET_INFO my_charset_utf32_bin;
339+
extern CHARSET_INFO my_charset_utf32_general_ci;
340+
extern CHARSET_INFO my_charset_utf32_unicode_ci;
341+
332342
extern CHARSET_INFO my_charset_utf8_general_ci;
333343
extern CHARSET_INFO my_charset_utf8_unicode_ci;
334344
extern CHARSET_INFO my_charset_utf8_bin;
335-
extern CHARSET_INFO my_charset_cp1250_czech_ci;
336-
extern MYSQL_PLUGIN_IMPORT CHARSET_INFO my_charset_filename;
345+
extern CHARSET_INFO my_charset_utf8mb4_bin;
346+
extern CHARSET_INFO my_charset_utf8mb4_general_ci;
347+
extern CHARSET_INFO my_charset_utf8mb4_unicode_ci;
348+
#define MY_UTF8MB3 "utf8"
349+
#define MY_UTF8MB4 "utf8mb4"
350+
337351

338352
/* declarations for simple charsets */
339353
extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t,
@@ -430,6 +444,19 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
430444
char *min_str, char *max_str,
431445
size_t *min_length, size_t *max_length);
432446

447+
my_bool my_like_range_utf16(CHARSET_INFO *cs,
448+
const char *ptr, size_t ptr_length,
449+
pbool escape, pbool w_one, pbool w_many,
450+
size_t res_length,
451+
char *min_str, char *max_str,
452+
size_t *min_length, size_t *max_length);
453+
454+
my_bool my_like_range_utf32(CHARSET_INFO *cs,
455+
const char *ptr, size_t ptr_length,
456+
pbool escape, pbool w_one, pbool w_many,
457+
size_t res_length,
458+
char *min_str, char *max_str,
459+
size_t *min_length, size_t *max_length);
433460

434461
int my_wildcmp_8bit(CHARSET_INFO *,
435462
const char *str,const char *str_end,
@@ -480,6 +507,31 @@ uint my_instr_mb(struct charset_info_st *,
480507
const char *s, size_t s_length,
481508
my_match_t *match, uint nmatch);
482509

510+
int my_strnncoll_mb_bin(CHARSET_INFO * cs,
511+
const uchar *s, size_t slen,
512+
const uchar *t, size_t tlen,
513+
my_bool t_is_prefix);
514+
515+
int my_strnncollsp_mb_bin(CHARSET_INFO *cs,
516+
const uchar *a, size_t a_length,
517+
const uchar *b, size_t b_length,
518+
my_bool diff_if_only_endspace_difference);
519+
520+
int my_wildcmp_mb_bin(CHARSET_INFO *cs,
521+
const char *str,const char *str_end,
522+
const char *wildstr,const char *wildend,
523+
int escape, int w_one, int w_many);
524+
525+
int my_strcasecmp_mb_bin(CHARSET_INFO * cs __attribute__((unused)),
526+
const char *s, const char *t);
527+
528+
void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
529+
const uchar *key, size_t len,ulong *nr1, ulong *nr2);
530+
531+
size_t my_strnxfrm_unicode(CHARSET_INFO *,
532+
uchar *dst, size_t dstlen,
533+
const uchar *src, size_t srclen);
534+
483535
int my_wildcmp_unicode(CHARSET_INFO *cs,
484536
const char *str, const char *str_end,
485537
const char *wildstr, const char *wildend,

mysql-test/include/ctype_datetime.inc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#
2+
# Bug#32390 Character sets: casting utf32 to/from date doesn't work
3+
#
4+
CREATE TABLE t1 AS SELECT repeat('a',20) AS s1 LIMIT 0;
5+
SET timestamp=1216359724;
6+
INSERT INTO t1 VALUES (current_date);
7+
INSERT INTO t1 VALUES (current_time);
8+
INSERT INTO t1 VALUES (current_timestamp);
9+
SELECT s1, hex(s1) FROM t1;
10+
DROP TABLE t1;
11+
SET timestamp=0;

mysql-test/include/ctype_like.inc

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
select @@collation_connection;
2+
3+
#
4+
# Create a table with a nullable varchar(10) column
5+
# using current character_set_connection.
6+
create table t1 as select repeat(' ',10) as a union select null;
7+
alter table t1 add key(a);
8+
show create table t1;
9+
insert into t1 values ("a"),("abc"),("abcd"),("hello"),("test");
10+
explain select * from t1 where a like 'abc%';
11+
explain select * from t1 where a like concat('abc','%');
12+
select * from t1 where a like "abc%";
13+
select * from t1 where a like concat("abc","%");
14+
select * from t1 where a like "ABC%";
15+
select * from t1 where a like "test%";
16+
select * from t1 where a like "te_t";
17+
select * from t1 where a like "%a%";
18+
select * from t1 where a like "%abcd%";
19+
select * from t1 where a like "%abc\d%";
20+
drop table t1;
21+
22+
#
23+
# Bug #2619 ucs2 LIKE comparison fails in some cases
24+
#
25+
26+
select 'AA' like 'AA';
27+
select 'AA' like 'A%A';
28+
select 'AA' like 'A%%A';
29+
select 'AA' like 'AA%';
30+
select 'AA' like '%AA%';
31+
select 'AA' like '%A';
32+
select 'AA' like '%AA';
33+
select 'AA' like 'A%A%';
34+
select 'AA' like '_%_%';
35+
select 'AA' like '%A%A';
36+
select 'AAA'like 'A%A%A';
37+
38+
select 'AZ' like 'AZ';
39+
select 'AZ' like 'A%Z';
40+
select 'AZ' like 'A%%Z';
41+
select 'AZ' like 'AZ%';
42+
select 'AZ' like '%AZ%';
43+
select 'AZ' like '%Z';
44+
select 'AZ' like '%AZ';
45+
select 'AZ' like 'A%Z%';
46+
select 'AZ' like '_%_%';
47+
select 'AZ' like '%A%Z';
48+
select 'AZ' like 'A_';
49+
select 'AZ' like '_Z';
50+
select 'AMZ'like 'A%M%Z';

mysql-test/include/have_utf16.inc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- require r/have_utf16.require
2+
disable_query_log;
3+
show collation like 'utf16_general_ci';
4+
enable_query_log;

mysql-test/include/have_utf32.inc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- require r/have_utf32.require
2+
disable_query_log;
3+
show collation like 'utf32_general_ci';
4+
enable_query_log;

mysql-test/include/have_utf8mb4.inc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
--require r/have_utf8mb4.require
2+
3+
--disable_query_log
4+
5+
SHOW COLLATION LIKE 'utf8mb4_general_ci';
6+
7+
--enable_query_log

mysql-test/r/ctype_ldml.result

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,33 @@ select * from t1 where c1='b';
5353
c1
5454
a
5555
drop table t1;
56+
show collation like 'utf8mb4_test_ci';
57+
Collation Charset Id Default Compiled Sortlen
58+
utf8mb4_test_ci utf8mb4 326 8
59+
create table t1 (c1 char(1) character set utf8mb4 collate utf8mb4_test_ci);
60+
insert into t1 values ('a');
61+
select * from t1 where c1='b';
62+
c1
63+
a
64+
drop table t1;
65+
show collation like 'utf16_test_ci';
66+
Collation Charset Id Default Compiled Sortlen
67+
utf16_test_ci utf16 327 8
68+
create table t1 (c1 char(1) character set utf16 collate utf16_test_ci);
69+
insert into t1 values ('a');
70+
select * from t1 where c1='b';
71+
c1
72+
a
73+
drop table t1;
74+
show collation like 'utf32_test_ci';
75+
Collation Charset Id Default Compiled Sortlen
76+
utf32_test_ci utf32 391 8
77+
create table t1 (c1 char(1) character set utf32 collate utf32_test_ci);
78+
insert into t1 values ('a');
79+
select * from t1 where c1='b';
80+
c1
81+
a
82+
drop table t1;
5683
CREATE TABLE t1 (
5784
col1 varchar(100) character set utf8 collate utf8_test_ci
5885
);
@@ -373,16 +400,22 @@ select "foo" = "foo " collate latin1_test;
373400
The following tests check that two-byte collation IDs work
374401
select * from information_schema.collations where id>256 order by id;
375402
COLLATION_NAME CHARACTER_SET_NAME ID IS_DEFAULT IS_COMPILED SORTLEN
403+
utf8mb4_test_ci utf8mb4 326 8
404+
utf16_test_ci utf16 327 8
376405
utf8_phone_ci utf8 352 8
377406
utf8_test_ci utf8 353 8
378407
ucs2_test_ci ucs2 358 8
379408
ucs2_vn_ci ucs2 359 8
409+
utf32_test_ci utf32 391 8
380410
utf8_maxuserid_ci utf8 2047 8
381411
show collation like '%test%';
382412
Collation Charset Id Default Compiled Sortlen
383413
latin1_test latin1 99 Yes 1
384414
utf8_test_ci utf8 353 8
385415
ucs2_test_ci ucs2 358 8
416+
utf8mb4_test_ci utf8mb4 326 8
417+
utf16_test_ci utf16 327 8
418+
utf32_test_ci utf32 391 8
386419
show collation like 'ucs2_vn_ci';
387420
Collation Charset Id Default Compiled Sortlen
388421
ucs2_vn_ci ucs2 359 8

0 commit comments

Comments
 (0)