Skip to content

Commit 843ef36

Browse files
author
Shaohua Wang
committed
WL#6607 : InnoDB FULLTEXT SEARCH: CJK support
We implement two fulltext parsers: 1. ngram parser for CJK; 2. mecab parser for Japanese. We use different partition schema for CJK fulltext index tables. Reviewed-by: Jimmy Yang <[email protected]> Reviewed-by: Marko Makela <[email protected]> Reviewed-by: Tor Didriksen <[email protected]> RB: 7328
1 parent e75ab78 commit 843ef36

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+10709
-871
lines changed

include/ft_global.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ struct _ft_vft_ext
5858

5959
#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID"
6060

61+
#define FTS_NGRAM_PARSER_NAME "ngram"
62+
6163
#ifndef FT_CORE
6264
struct st_ft_info
6365
{

mysql-test/include/have_mecab.inc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#
2+
# Check if server has support for loading plugins
3+
#
4+
if (`SELECT @@have_dynamic_loading != 'YES'`) {
5+
# Report the plugin this include actually guards (mecab), not the simple parser it was copied from.
--skip mecab requires dynamic loading
6+
}
7+
8+
#
9+
# Check if the variable MECAB is set
10+
#
11+
if (!$MECAB) {
12+
--skip mecab requires the environment variable \$MECAB to be set (normally done by mtr)
13+
}
14+
15+
#
16+
# Check if --plugin-dir was set up for the mecab plugin
17+
#
18+
if (`SELECT CONCAT('--plugin-dir=', REPLACE(@@plugin_dir, '\\\\', '/')) != '$MECAB_OPT/'`) {
19+
# The server's plugin_dir (backslashes normalized to '/') must equal the --plugin-dir option held in $MECAB_OPT;
# the skip message names that variable so a skipped run points at the right setting.
--skip mecab requires that --plugin-dir is set to the mecab plugin dir (either the .opt file does not contain \$MECAB_OPT or another plugin is in use)
20+
}

mysql-test/include/have_ngram.inc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Skip the calling test unless INFORMATION_SCHEMA.PLUGINS lists a plugin named 'ngram' with plugin_type 'ftparser'.
if (`SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PLUGINS WHERE plugin_name = 'ngram' AND plugin_type = 'ftparser'`)
2+
{
3+
--skip Test requires ngram plugin.
4+
}

mysql-test/include/mysqld--help.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ perl;
2525
@build_specific=qw/auto-generate-certs/;
2626

2727
# Plugins which may or may not be there:
28-
@plugins=qw/innodb ndb ndbinfo archive blackhole federated partition ndbcluster debug temp-pool ssl des-key-file
28+
@plugins=qw/innodb ngram mecab ndb ndbinfo archive blackhole federated partition ndbcluster debug temp-pool ssl des-key-file
2929
thread-concurrency super-large-pages mutex-deadlock-detector null-audit
3030
sha256-password-private-key-path sha256-password-public-key-path
3131
sha256-password-auto-generate-rsa-keys/;

mysql-test/include/plugin.defs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ ha_archive storage/archive ARCHIVE_PLUGIN
4040
ha_blackhole storage/blackhole BLACKHOLE_PLUGIN
4141
ha_federated storage/federated FEDERATED_PLUGIN
4242
mypluglib plugin/fulltext SIMPLE_PARSER
43+
libpluginmecab plugin/fulltext MECAB
4344
adt_null plugin/audit_null AUDIT_NULL
4445
libdaemon_example plugin/daemon_example DAEMONEXAMPLE
4546
replication_observers_example_plugin plugin/replication_observers_example RPL_OBS_EXAMPLE
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#
2+
#-----------------------------------------------------------------------------
3+
# wl6607 : InnoDB FULLTEXT SEARCH: CJK support (mecab parser)
4+
# Adding FTS check with mecab parser
5+
# - Table with mecab parser
6+
# - Basic FTS query ( mainly BOOLEAN MODE check)
7+
# - FTS index with single and two columns
8+
# - external parser index using alter table
9+
#
10+
#------------------------------------------------------------------------------
11+
12+
# restart does not work with embedded
13+
-- source include/not_embedded.inc
14+
15+
call mtr.add_suppression("unknown variable 'loose_mecab_rc_file");
16+
call mtr.add_suppression("Fulltext index charset 'eucjpms' doesn't match mecab charset");
17+
call mtr.add_suppression("Mecab: param.cpp");
18+
call mtr.add_suppression("Plugin 'mecab' init function returned error");
19+
20+
-- disable_query_log
21+
22+
-- error 0, ER_CANT_INITIALIZE_UDF
23+
eval INSTALL PLUGIN mecab SONAME '$MECAB';
24+
25+
let $mecab_charset=`SELECT variable_value FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME='mecab_charset'`;
26+
# Fallback path: the mecab_charset status value is empty after the first INSTALL PLUGIN attempt,
# so generate an rc file, restart the server with it, and try the install once more.
if ($mecab_charset == '') {
27+
# Restart with package dictionary.
28+
let $MYSQL_DATADIR=`select @@datadir`;
29+
let $MYSQL_BASEDIR=`select @@basedir`;
30+
31+
let $mecabrc = $MYSQL_DATADIR/mecabrc;
32+
# NOTE(review): $ipadic_charset is not assigned anywhere in the visible script — presumably set by the including test; confirm.
let $dicdir = $MYSQL_BASEDIR/lib/mecab/dic/ipadic_$ipadic_charset;
33+
34+
# Write a one-line rc file in the data directory that points at the dictionary directory.
-- exec echo "dicdir=$dicdir" > $mecabrc
35+
36+
-- source include/shutdown_mysqld.inc
37+
# The expect file carries the restart request together with the extra server options (rc file, plugin options, min token size 2).
-- exec echo "restart: --loose_mecab_rc_file=$mecabrc $MECAB_OPT --innodb_ft_min_token_size=2" >$MYSQLTEST_VARDIR/tmp/mysqld.1.expect
38+
-- enable_reconnect
39+
-- source include/wait_until_connected_again.inc
40+
-- disable_reconnect
41+
42+
-- disable_query_log
43+
44+
# Either success (0) or ER_CANT_INITIALIZE_UDF is accepted here; the outcome is judged from mecab_charset below.
-- error 0, ER_CANT_INITIALIZE_UDF
45+
eval INSTALL PLUGIN mecab SONAME '$MECAB';
46+
47+
# Re-read the status value so the checks after this block see the result of the second attempt.
let $mecab_charset=`SELECT variable_value FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME='mecab_charset'`;
48+
}
49+
50+
if ($mecab_charset == '') {
51+
-- skip Test fail to load mecab parser, please set correct 'loose_mecab_rc_file'.
52+
}
53+
54+
if ($mecab_charset != $mysql_charset) {
55+
UNINSTALL PLUGIN mecab;
56+
-- skip Test mecab charset mismatch.
57+
}
58+
59+
--enable_query_log
60+
61+
SHOW STATUS LIKE 'mecab_charset';
62+
63+
SET NAMES utf8;
64+
eval CREATE TABLE page (
65+
page_id int(8) unsigned NOT NULL AUTO_INCREMENT,
66+
page_title varchar(255) CHARACTER SET $mecab_charset NOT NULL DEFAULT '',
67+
PRIMARY KEY (page_id),
68+
FULLTEXT KEY page_title (page_title) WITH PARSER mecab
69+
) ENGINE=InnoDB;
70+
71+
INSERT INTO page VALUES
72+
(1,'レモナ'),
73+
(2,'SDレモナ'),
74+
(3,'レモナ'),
75+
(4,'データベース管理システム'),
76+
(5,'キム'),
77+
(6,'鉄道100電車'),
78+
(7,'100'),
79+
(8,'ラフィタ・カスティージョ'),
80+
(9,'ツル科_(Sibley)'),
81+
(10,'ツル'),
82+
(11,'鉄道');
83+
84+
SET GLOBAL innodb_ft_aux_table="test/page";
85+
SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE;
86+
SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE;
87+
88+
SHOW CREATE TABLE page;
89+
90+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("レモナ" IN BOOLEAN MODE);
91+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("キムショウカン" IN BOOLEAN MODE);
92+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("100" IN BOOLEAN MODE);
93+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("ツル*" IN BOOLEAN MODE);
94+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("ラフィタ・カスティージョ");
95+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("ラフィタ・カスティージョ" IN BOOLEAN MODE);
96+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("+ラフィタ・カスティージョ +データベース" IN BOOLEAN MODE);
97+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("+ラフィタ・カスティージョ +(データベース)" IN BOOLEAN MODE);
98+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("-ラフィタ・カスティージョ +(データベース)" IN BOOLEAN MODE);
99+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("ラフィタ・カスティージョ 鉄道" IN BOOLEAN MODE);
100+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("<ラフィタ・カスティージョ >鉄道" IN BOOLEAN MODE);
101+
SELECT * FROM page WHERE MATCH(page_title) AGAINST('鉄道');
102+
SELECT * FROM page WHERE MATCH(page_title) AGAINST('鉄道' WITH QUERY EXPANSION);
103+
104+
eval ALTER TABLE page add column c1 varchar(100) CHARACTER SET $mecab_charset DEFAULT 'レモナ';
105+
ALTER TABLE page ADD FULLTEXT INDEX idx1 (page_title,c1) WITH PARSER mecab;
106+
SELECT * FROM page WHERE MATCH(page_title,c1) AGAINST('レモナ');
107+
SELECT * FROM page WHERE MATCH(page_title,c1) AGAINST('-レモナ' IN BOOLEAN MODE);
108+
109+
DROP TABLE page;
110+
111+
# Test charset mismatch.
112+
CREATE TABLE page (
113+
page_id int(8) unsigned NOT NULL AUTO_INCREMENT,
114+
page_title varchar(255) CHARACTER SET eucjpms NOT NULL DEFAULT '',
115+
PRIMARY KEY (page_id)
116+
)ENGINE=InnoDB;
117+
118+
INSERT INTO page VALUES (1,'レモナ');
119+
120+
# We don't return charset mismatch error because of parallel fts build.
121+
CREATE FULLTEXT INDEX ft_idx ON page(page_title) WITH PARSER mecab;
122+
123+
SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE;
124+
SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE;
125+
126+
--error ER_ERROR_ON_WRITE
127+
INSERT INTO page VALUES (2,'レモナ');
128+
129+
--error ER_ERROR_ON_WRITE
130+
SELECT * FROM page WHERE MATCH(page_title) AGAINST("レモナ");
131+
132+
DROP TABLE page;
133+
134+
SET GLOBAL innodb_ft_aux_table=default;
135+
136+
UNINSTALL PLUGIN mecab;

0 commit comments

Comments
 (0)