Skip to content

Commit 1aaf68d

Browse files
Andrzej Jarzabekzmur
authored andcommitted
Bug#37039409 Backport Bug#36210202 FTS_DOC_ID comparison incorrect for large values
Problem: The method of comparing doc ids in FTS is incorrect - to determine which id is greater, first 64-bit unsigned subtraction is performed on them, and then the result is cast to 32-bit signed integer. This leads to incorrect results when the subtraction result is not in range of 32-bit signed values. This leads to further incorrect behavior in FTS including ranking score calculation. Fix: Subtraction "hack" replaced with simple comparison operators. Patch based on contribution from Alibaba's Shaohua Wang. Thank you! Change-Id: I5149d4de5de3c7e6732c1905fe54437ef7eb22e7
1 parent 5484c73 commit 1aaf68d

File tree

7 files changed

+113
-105
lines changed

7 files changed

+113
-105
lines changed

storage/innobase/fts/fts0fts.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2438,7 +2438,8 @@ fts_trx_table_create(
24382438
ftt->table = table;
24392439
ftt->fts_trx = fts_trx;
24402440

2441-
ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
2441+
ftt->rows = rbt_create(
2442+
sizeof(fts_trx_row_t), fts_doc_id_field_cmp<fts_trx_row_t>);
24422443

24432444
return(ftt);
24442445
}
@@ -2462,7 +2463,8 @@ fts_trx_table_clone(
24622463
ftt->table = ftt_src->table;
24632464
ftt->fts_trx = ftt_src->fts_trx;
24642465

2465-
ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
2466+
ftt->rows = rbt_create(
2467+
sizeof(fts_trx_row_t), fts_doc_id_field_cmp<fts_trx_row_t>);
24662468

24672469
/* Copy the rb tree values to the new savepoint. */
24682470
rbt_merge_uniq(ftt->rows, ftt_src->rows);
@@ -4072,7 +4074,7 @@ fts_sync_add_deleted_cache(
40724074

40734075
ut_a(ib_vector_size(doc_ids) > 0);
40744076

4075-
ib_vector_sort(doc_ids, fts_update_doc_id_cmp);
4077+
ib_vector_sort(doc_ids, fts_doc_id_field_cmp<fts_update_t>);
40764078

40774079
info = pars_info_create();
40784080

storage/innobase/fts/fts0opt.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*****************************************************************************
22
3-
Copyright (c) 2007, 2023, Oracle and/or its affiliates.
3+
Copyright (c) 2007, 2024, Oracle and/or its affiliates.
44
55
This program is free software; you can redistribute it and/or modify
66
it under the terms of the GNU General Public License, version 2.0,
@@ -1028,7 +1028,7 @@ fts_table_fetch_doc_ids(
10281028
if (error == DB_SUCCESS) {
10291029
fts_sql_commit(trx);
10301030

1031-
ib_vector_sort(doc_ids->doc_ids, fts_update_doc_id_cmp);
1031+
ib_vector_sort(doc_ids->doc_ids, fts_doc_id_field_cmp<fts_update_t>);
10321032
} else {
10331033
fts_sql_rollback(trx);
10341034
}

storage/innobase/fts/fts0que.cc

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*****************************************************************************
22
3-
Copyright (c) 2007, 2023, Oracle and/or its affiliates.
3+
Copyright (c) 2007, 2024, Oracle and/or its affiliates.
44
55
This program is free software; you can redistribute it and/or modify
66
it under the terms of the GNU General Public License, version 2.0,
@@ -293,6 +293,9 @@ struct fts_word_freq_t {
293293
double idf; /*!< Inverse document frequency */
294294
};
295295

296+
static ib_rbt_compare fts_ranking_doc_id_cmp =
297+
fts_doc_id_field_cmp<fts_ranking_t>;
298+
296299
/********************************************************************
297300
Callback function to fetch the rows in an FTS INDEX record.
298301
@return always TRUE */
@@ -392,25 +395,7 @@ fts_query_terms_in_document(
392395
fts_query_t* query, /*!< in: FTS query state */
393396
doc_id_t doc_id, /*!< in: the word to check */
394397
ulint* total); /*!< out: total words in document */
395-
#endif
396398

397-
/********************************************************************
398-
Compare two fts_doc_freq_t doc_ids.
399-
@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
400-
UNIV_INLINE
401-
int
402-
fts_freq_doc_id_cmp(
403-
/*================*/
404-
const void* p1, /*!< in: id1 */
405-
const void* p2) /*!< in: id2 */
406-
{
407-
const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
408-
const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
409-
410-
return((int) (fq1->doc_id - fq2->doc_id));
411-
}
412-
413-
#if 0
414399
/*******************************************************************//**
415400
Print the table used for calculating LCS. */
416401
static
@@ -683,7 +668,7 @@ fts_query_add_word_freq(
683668
word_freq.doc_count = 0;
684669

685670
word_freq.doc_freqs = rbt_create(
686-
sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
671+
sizeof(fts_doc_freq_t), fts_doc_id_field_cmp<fts_doc_freq_t>);
687672

688673
parent.last = rbt_add_node(
689674
query->word_freqs, &parent, &word_freq);
@@ -4072,7 +4057,7 @@ fts_query(
40724057
DEBUG_SYNC_C("fts_deleted_doc_ids_append");
40734058

40744059
/* Sort the vector so that we can do a binary search over the ids. */
4075-
ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
4060+
ib_vector_sort(query.deleted->doc_ids, fts_doc_id_field_cmp<fts_doc_freq_t>);
40764061

40774062
/* Convert the query string to lower case before parsing. We own
40784063
the ut_malloc'ed result and so remember to free it before return. */

storage/innobase/include/fts0types.h

Lines changed: 1 addition & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*****************************************************************************
22
3-
Copyright (c) 2007, 2023, Oracle and/or its affiliates.
3+
Copyright (c) 2007, 2024, Oracle and/or its affiliates.
44
55
This program is free software; you can redistribute it and/or modify
66
it under the terms of the GNU General Public License, version 2.0,
@@ -304,45 +304,6 @@ struct fts_token_t {
304304
/** It's defined in fts/fts0fts.c */
305305
extern const fts_index_selector_t fts_index_selector[];
306306

307-
/******************************************************************//**
308-
Compare two fts_trx_row_t instances doc_ids. */
309-
UNIV_INLINE
310-
int
311-
fts_trx_row_doc_id_cmp(
312-
/*===================*/
313-
/*!< out:
314-
< 0 if n1 < n2,
315-
0 if n1 == n2,
316-
> 0 if n1 > n2 */
317-
const void* p1, /*!< in: id1 */
318-
const void* p2); /*!< in: id2 */
319-
320-
/******************************************************************//**
321-
Compare two fts_ranking_t instances doc_ids. */
322-
UNIV_INLINE
323-
int
324-
fts_ranking_doc_id_cmp(
325-
/*===================*/
326-
/*!< out:
327-
< 0 if n1 < n2,
328-
0 if n1 == n2,
329-
> 0 if n1 > n2 */
330-
const void* p1, /*!< in: id1 */
331-
const void* p2); /*!< in: id2 */
332-
333-
/******************************************************************//**
334-
Compare two fts_update_t instances doc_ids. */
335-
UNIV_INLINE
336-
int
337-
fts_update_doc_id_cmp(
338-
/*==================*/
339-
/*!< out:
340-
< 0 if n1 < n2,
341-
0 if n1 == n2,
342-
> 0 if n1 > n2 */
343-
const void* p1, /*!< in: id1 */
344-
const void* p2); /*!< in: id2 */
345-
346307
/******************************************************************//**
347308
Decode and return the integer that was encoded using our VLC scheme.*/
348309
UNIV_INLINE

storage/innobase/include/fts0types.ic

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*****************************************************************************
22

3-
Copyright (c) 2007, 2023, Oracle and/or its affiliates.
3+
Copyright (c) 2007, 2024, Oracle and/or its affiliates.
44

55
This program is free software; you can redistribute it and/or modify
66
it under the terms of the GNU General Public License, version 2.0,
@@ -58,51 +58,35 @@ fts_string_dup(
5858
}
5959

6060
/******************************************************************//**
61-
Compare two fts_trx_row_t doc_ids.
62-
@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
61+
Compare two doc_ids.
62+
@param[in] id1 1st doc_id to compare
63+
@param[in] id2 2nd doc_id to compare
64+
@return < 0 if id1 < id2, 0 if id1 == id2, > 0 if id1 > id2 */
6365
UNIV_INLINE
6466
int
65-
fts_trx_row_doc_id_cmp(
66-
/*===================*/
67-
const void* p1, /*!< in: id1 */
68-
const void* p2) /*!< in: id2 */
67+
fts_doc_id_cmp(doc_id_t id1, doc_id_t id2)
6968
{
70-
const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1;
71-
const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2;
72-
73-
return((int)(tr1->doc_id - tr2->doc_id));
69+
if (id1 < id2) {
70+
return -1;
71+
} else if (id1 > id2) {
72+
return 1;
73+
} else {
74+
return 0;
75+
}
7476
}
7577

7678
/******************************************************************//**
77-
Compare two fts_ranking_t doc_ids.
78-
@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
79-
UNIV_INLINE
80-
int
81-
fts_ranking_doc_id_cmp(
82-
/*===================*/
83-
const void* p1, /*!< in: id1 */
84-
const void* p2) /*!< in: id2 */
85-
{
86-
const fts_ranking_t* rk1 = (const fts_ranking_t*) p1;
87-
const fts_ranking_t* rk2 = (const fts_ranking_t*) p2;
88-
89-
return((int)(rk1->doc_id - rk2->doc_id));
90-
}
91-
92-
/******************************************************************//**
93-
Compare two fts_update_t doc_ids.
94-
@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
95-
UNIV_INLINE
96-
int
97-
fts_update_doc_id_cmp(
98-
/*==================*/
99-
const void* p1, /*!< in: id1 */
100-
const void* p2) /*!< in: id2 */
79+
Compare doc_ids of 2 objects.
80+
@param[in] p1 Pointer to first instance o1 of T
81+
@param[in] p2 Pointer to second instance o2 of T
82+
@return sign(o1->doc_id - o2->doc_id) */
83+
template <typename T>
84+
int fts_doc_id_field_cmp(const void *p1, const void *p2)
10185
{
102-
const fts_update_t* up1 = (const fts_update_t*) p1;
103-
const fts_update_t* up2 = (const fts_update_t*) p2;
86+
const T *o1 = static_cast<const T *>(p1);
87+
const T *o2 = static_cast<const T *>(p2);
10488

105-
return((int)(up1->doc_id - up2->doc_id));
89+
return fts_doc_id_cmp(o1->doc_id, o2->doc_id);
10690
}
10791

10892
/******************************************************************//**

unittest/gunit/innodb/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2013, 2023, Oracle and/or its affiliates.
1+
# Copyright (c) 2013, 2024, Oracle and/or its affiliates.
22
#
33
# This program is free software; you can redistribute it and/or modify
44
# it under the terms of the GNU General Public License, version 2.0,
@@ -39,6 +39,7 @@ SET(TESTS
3939
ut0crc32
4040
ut0mem
4141
ut0new
42+
ut0rbt
4243
)
4344

4445
IF (MERGE_UNITTESTS)

unittest/gunit/innodb/ut0rbt-t.cc

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/* Copyright (c) 2024, Oracle and/or its affiliates.
2+
3+
This program is free software; you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License, version 2.0,
5+
as published by the Free Software Foundation.
6+
7+
This program is designed to work with certain software (including
8+
but not limited to OpenSSL) that is licensed under separate terms,
9+
as designated in a particular file or component or in included license
10+
documentation. The authors of MySQL hereby grant you an additional
11+
permission to link the program and your derivative works with the
12+
separately licensed software that they have either included with
13+
the program or referenced in the documentation.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU General Public License, version 2.0, for more details.
19+
20+
You should have received a copy of the GNU General Public License
21+
along with this program; if not, write to the Free Software
22+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23+
24+
#include <gtest/gtest.h>
25+
26+
#include "fts0fts.h"
27+
#include "fts0types.h"
28+
#include "univ.i"
29+
#include "ut0rbt.h"
30+
31+
namespace innodb_ut0rbt_unittest {
32+
33+
/* Doc id array for testing with values exceeding 32-bit integer limit */
34+
const doc_id_t doc_ids[] = {
35+
17574, 89783, 94755, 97537, 101358, 101361,
36+
102587, 103571, 104018, 106821, 108647, 109352,
37+
109379, 110325, 122868, 210682130, 231275441, 234172769,
38+
366236849, 526467159, 1675241735, 1675243405, 1947751899, 1949940363,
39+
2033691953, 2148227299, 2256289791, 2294223591, 2367501260, 2792700091,
40+
2792701220, 2817121627, 2820680352, 2821165664, 3253312130, 3404918378,
41+
3532599429, 3538712078, 3539373037, 3546479309, 3566641838, 3580209634,
42+
3580871267, 3693930556, 3693932734, 3693932983, 3781949558, 3839877411,
43+
3930968983, 4146309172, 4524715523, 4524715525, 4534911119, 4597818456};
44+
45+
const doc_id_t search_doc_id = 1675241735;
46+
47+
namespace {
48+
struct dummy {
49+
doc_id_t doc_id;
50+
};
51+
} // namespace
52+
53+
TEST(ut0rbt, fts_doc_id_cmp) {
54+
ib_rbt_t *doc_id_rbt = rbt_create(sizeof(dummy), fts_doc_id_field_cmp<dummy>);
55+
56+
/* Insert doc ids into rbtree. */
57+
for (unsigned i = 0; i < sizeof(doc_ids)/sizeof(doc_ids[0]); i++)
58+
{
59+
const doc_id_t& doc_id = doc_ids[i];
60+
ib_rbt_bound_t parent;
61+
dummy obj;
62+
obj.doc_id = doc_id;
63+
64+
if (rbt_search(doc_id_rbt, &parent, &obj.doc_id) != 0) {
65+
rbt_add_node(doc_id_rbt, &parent, &obj);
66+
}
67+
}
68+
69+
/* Check if doc id exists in rbtree */
70+
ib_rbt_bound_t parent;
71+
EXPECT_EQ(rbt_search(doc_id_rbt, &parent, &search_doc_id), 0);
72+
73+
rbt_free(doc_id_rbt);
74+
}
75+
} // namespace innodb_ut0rbt_unittest

0 commit comments

Comments
 (0)