Skip to content

Commit 8c22cb2

Browse files
author
Xing Zhang
committed
WL#10480: Add Japanese kana sensitive collation to utf8mb4
Add collation utf8mb4_ja_0900_as_cs_ks to provide a kana-sensitive collation for Japanese. A kana-sensitive collation needs to distinguish katakana characters from hiragana characters on the quaternary level. We assign a quaternary weight only to katakana and hiragana, and not to other characters such as Latin letters or kanji, because those characters can already be distinguished by the first three levels of weight. This makes katakana and hiragana equal on the first three levels, and katakana greater than hiragana on the quaternary level. For the iteration marks and the length mark, which usually follow another katakana / hiragana character, we also assign a quaternary weight, the same as that of the kana character they follow. Also add the _ks collation to the Japanese collation test. Benchmark results: BM_JapaneseUTF8MB4 507 ns/iter, 473.81 MB/sec; BM_Japanese_AS_CS 3825 ns/iter, 80.77 MB/sec; BM_Japanese_AS_CS_KS 5008 ns/iter, 61.69 MB/sec. Change-Id: Ic211e95454d8da21a836c41087470029ae9f856c
1 parent c69b382 commit 8c22cb2

File tree

12 files changed

+50362
-24901
lines changed

12 files changed

+50362
-24901
lines changed

mysql-test/r/ctype_ldml.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ utf8mb4_hu_0900_as_cs utf8mb4 297 Yes 24 NO PAD
497497
utf8mb4_hr_0900_as_cs utf8mb4 298 Yes 24 NO PAD
498498
utf8mb4_vi_0900_as_cs utf8mb4 300 Yes 24 NO PAD
499499
utf8mb4_ja_0900_as_cs utf8mb4 303 Yes 24 NO PAD
500+
utf8mb4_ja_0900_as_cs_ks utf8mb4 304 Yes 24 NO PAD
500501
utf8mb4_test_ci utf8mb4 326 8 PAD SPACE
501502
utf16_test_ci utf16 327 8 PAD SPACE
502503
utf8mb4_test_400_ci utf8mb4 328 8 PAD SPACE

mysql-test/r/ctype_unicode900_as_cs.result

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1901,6 +1901,102 @@ D7FB Hangul
19011901
A000 Other
19021902
9FFF Other
19031903
DROP TABLE t1;
1904+
CREATE TABLE t1(a CHAR, description VARCHAR(30)) COLLATE utf8mb4_ja_0900_as_cs_ks;
1905+
INSERT INTO t1 VALUES('a', 'Latin'), ('A', 'Latin'), (_utf16 0x02AC, 'Latin'),
1906+
(_utf16 0x02AD, 'Latin'), (_utf16 0x03B1, 'Greak'), (_utf16 0x2C81, 'Coptic'),
1907+
(_utf16 0x0430, 'Cyrillic'), (_utf16 0xD7FB, 'Hangul'),
1908+
(_utf16 0x3041, 'Hiragana'), (_utf16 0x3105, 'Bopomofo'),
1909+
(_utf16 0x2F00, 'Other Han'), (_utf16 0x4E00, 'Japanese Han'),
1910+
(_utf16 0x9FFF, 'Other'), (_utf16 0xA000, 'Other'),
1911+
(_utf16 0x9FD5, 'Other Han'), (_utf16 0xFA0E, 'Other Han'),
1912+
(_utf16 0x3400, 'Other Han'), (_utf16 0x4E9C, 'Japanese Han'),
1913+
(_utf16 0x7199, 'Japanese Han'), (_utf16 0x6190, 'Japanese Han'),
1914+
(_utf16 0x30F3, 'Katakana'), (_utf16 0x306F, 'Hiragana HA'),
1915+
(_utf16 0x3070, 'Hiragana BA'), (_utf16 0x3071, 'Hiragana PA');
1916+
SELECT HEX(CONVERT(a USING utf16)), description FROM t1 ORDER BY a;
1917+
HEX(CONVERT(a USING utf16)) description
1918+
0061 Latin
1919+
0041 Latin
1920+
02AC Latin
1921+
02AD Latin
1922+
3041 Hiragana
1923+
306F Hiragana HA
1924+
3070 Hiragana BA
1925+
3071 Hiragana PA
1926+
30F3 Katakana
1927+
4E9C Japanese Han
1928+
4E00 Japanese Han
1929+
6190 Japanese Han
1930+
7199 Japanese Han
1931+
2F00 Other Han
1932+
9FD5 Other Han
1933+
FA0E Other Han
1934+
3400 Other Han
1935+
03B1 Greak
1936+
2C81 Coptic
1937+
0430 Cyrillic
1938+
D7FB Hangul
1939+
3105 Bopomofo
1940+
A000 Other
1941+
9FFF Other
1942+
DROP TABLE t1;
1943+
SET @s1 = CONVERT(_utf16 0x304D30853046 USING utf8mb4);
1944+
SET @s2 = CONVERT(_utf16 0x30AD30E530A6 USING utf8mb4);
1945+
SET @s3 = CONVERT(_utf16 0x304D30863046 USING utf8mb4);
1946+
SET @s4 = CONVERT(_utf16 0x30AD30E630A6 USING utf8mb4);
1947+
SELECT STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs, @s2 COLLATE utf8mb4_ja_0900_as_cs);
1948+
STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs, @s2 COLLATE utf8mb4_ja_0900_as_cs)
1949+
0
1950+
SELECT STRCMP(@s2 COLLATE utf8mb4_ja_0900_as_cs, @s3 COLLATE utf8mb4_ja_0900_as_cs);
1951+
STRCMP(@s2 COLLATE utf8mb4_ja_0900_as_cs, @s3 COLLATE utf8mb4_ja_0900_as_cs)
1952+
-1
1953+
SELECT STRCMP(@s3 COLLATE utf8mb4_ja_0900_as_cs, @s4 COLLATE utf8mb4_ja_0900_as_cs);
1954+
STRCMP(@s3 COLLATE utf8mb4_ja_0900_as_cs, @s4 COLLATE utf8mb4_ja_0900_as_cs)
1955+
0
1956+
SELECT STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs_ks, @s2 COLLATE utf8mb4_ja_0900_as_cs_ks);
1957+
STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs_ks, @s2 COLLATE utf8mb4_ja_0900_as_cs_ks)
1958+
-1
1959+
SELECT STRCMP(@s2 COLLATE utf8mb4_ja_0900_as_cs_ks, @s3 COLLATE utf8mb4_ja_0900_as_cs_ks);
1960+
STRCMP(@s2 COLLATE utf8mb4_ja_0900_as_cs_ks, @s3 COLLATE utf8mb4_ja_0900_as_cs_ks)
1961+
-1
1962+
SELECT STRCMP(@s3 COLLATE utf8mb4_ja_0900_as_cs_ks, @s4 COLLATE utf8mb4_ja_0900_as_cs_ks);
1963+
STRCMP(@s3 COLLATE utf8mb4_ja_0900_as_cs_ks, @s4 COLLATE utf8mb4_ja_0900_as_cs_ks)
1964+
-1
1965+
SET @s1 = CONVERT(_utf16 0x309D USING utf8mb4);
1966+
SET @s2 = CONVERT(_utf16 0x30FD USING utf8mb4);
1967+
SELECT STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs, @s2 COLLATE utf8mb4_ja_0900_as_cs);
1968+
STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs, @s2 COLLATE utf8mb4_ja_0900_as_cs)
1969+
0
1970+
SELECT STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs_ks, @s2 COLLATE utf8mb4_ja_0900_as_cs_ks);
1971+
STRCMP(@s1 COLLATE utf8mb4_ja_0900_as_cs_ks, @s2 COLLATE utf8mb4_ja_0900_as_cs_ks)
1972+
-1
1973+
CREATE TABLE t1(a VARCHAR(20)) COLLATE utf8mb4_ja_0900_as_cs_ks;
1974+
INSERT INTO t1 VALUES(_utf16 0x30FC), (_utf16 0x30A230FC), (_utf16 0x304230FC),
1975+
(_utf16 0x65E5672C8A9E), (_utf16 0x30443059309E), (_utf16 0x30443059305A),
1976+
(_utf16 0x30A430B930FE), (_utf16 0x30A430B930BA),
1977+
(_utf16 0x65E5672C8A9E30CB30DB30F330B4);
1978+
SELECT HEX(CONVERT(a USING utf16)), HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a;
1979+
HEX(CONVERT(a USING utf16)) HEX(WEIGHT_STRING(a))
1980+
30FC 1C0E000000200000000200000008
1981+
304230FC 1FB61FB60000002000200000000E000C0021000000020002
1982+
30A230FC 1FB61FB60000002000200000000E000C0021000000080008
1983+
30443059309E 1FB71FC31FC3000000200020002000370000000E000E000E000100210000000200020002
1984+
30A430B930FE 1FB71FC31FC3000000200020002000370000000E000E000E000100210000000800080008
1985+
30443059305A 1FB71FC31FC3000000200020002000370000000E000E000E00020000000200020002
1986+
30A430B930BA 1FB71FC31FC3000000200020002000370000000E000E000E00020000000800080008
1987+
65E5672C8A9E 5D135EC957DF00000020002000200000000200020002
1988+
65E5672C8A9E30CB30DB30F330B4 5D135EC957DF1FCC1FD41FE71FC00000002000200020002000200020002000370000000200020002000E000E000E000E000200000008000800080008
1989+
DROP TABLE t1;
1990+
CREATE TABLE t1(a VARCHAR(20), KEY a (a)) COLLATE utf8mb4_ja_0900_as_cs_ks
1991+
PARTITION BY KEY (a) PARTITIONS 3;
1992+
INSERT INTO t1 VALUES(_utf16 0x30FC), (_utf16 0x30A230FC), (_utf16 0x304230FC),
1993+
(_utf16 0x65E5672C8A9E), (_utf16 0x30443059309E), (_utf16 0x30443059305A),
1994+
(_utf16 0x30A430B930FE), (_utf16 0x30A430B930BA),
1995+
(_utf16 0x65E5672C8A9E30CB30DB30F330B4);
1996+
SELECT HEX(CONVERT(a USING utf16)) FROM t1 WHERE a = _utf16 0x30443059305A;
1997+
HEX(CONVERT(a USING utf16))
1998+
30443059305A
1999+
DROP TABLE t1;
19042000
#
19052001
# End of 5.8 tests
19062002
#

0 commit comments

Comments
 (0)