Skip to content

Commit daa02f8

Browse files
Andrzej Jarzabekzmur
authored and committed
Bug#36879147 [InnoDB] FULLTEXT index limits FTS_DOC_ID to max unsigned 32-bit value
Backport to 5.7.49. Issue: FTS_DOC_ID is a 64-bit field and can have values of 2^32 and higher. However, the current implementation only supports the 32-bit value range. This limitation takes the form of: - Assertions - Use of the unsigned long type, which resolves to 32 bits on some platforms - A VLC (variable-length coding) implementation supporting only up to 35 bits Fix: Support 64-bit doc IDs: - Remove assertions - Replace use of unsigned long for doc ID deltas with uint64_t - Extend VLC functions to support the full unsigned 64-bit range Change-Id: Ifb56b33c5ec75e578391612eb371c41fc6aeef31
1 parent b96d4be commit daa02f8

File tree

8 files changed

+450
-92
lines changed

8 files changed

+450
-92
lines changed

storage/innobase/fts/fts0fts.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,7 +1280,6 @@ fts_cache_node_add_positions(
12801280
ulint enc_len;
12811281
ulint last_pos;
12821282
byte* ptr_start;
1283-
ulint doc_id_delta;
12841283

12851284
#ifdef UNIV_DEBUG
12861285
if (cache) {
@@ -1291,7 +1290,7 @@ fts_cache_node_add_positions(
12911290
ut_ad(doc_id >= node->last_doc_id);
12921291

12931292
/* Calculate the space required to store the ilist. */
1294-
doc_id_delta = (ulint)(doc_id - node->last_doc_id);
1293+
const uint64_t doc_id_delta = doc_id - node->last_doc_id;
12951294
enc_len = fts_get_encoded_len(doc_id_delta);
12961295

12971296
last_pos = 0;

storage/innobase/fts/fts0opt.cc

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,7 +1153,7 @@ fts_optimize_encode_node(
11531153
/* Calculate the space required to store the ilist. */
11541154
ut_ad(doc_id > node->last_doc_id);
11551155
doc_id_delta = doc_id - node->last_doc_id;
1156-
enc_len = fts_get_encoded_len(static_cast<ulint>(doc_id_delta));
1156+
enc_len = fts_get_encoded_len(doc_id_delta);
11571157

11581158
/* Calculate the size of the encoded pos array. */
11591159
while (*src) {
@@ -1197,9 +1197,8 @@ fts_optimize_encode_node(
11971197
src = enc->src_ilist_ptr;
11981198
dst = node->ilist + node->ilist_size;
11991199

1200-
/* Encode the doc id. Cast to ulint, the delta should be small and
1201-
therefore no loss of precision. */
1202-
dst += fts_encode_int((ulint) doc_id_delta, dst);
1200+
/* Encode the doc id. */
1201+
dst += fts_encode_int(doc_id_delta, dst);
12031202

12041203
/* Copy the encoded pos array. */
12051204
memcpy(dst, src, pos_enc_len);
@@ -1243,10 +1242,9 @@ fts_optimize_node(
12431242
while (copied < src_node->ilist_size
12441243
&& dst_node->ilist_size < FTS_ILIST_MAX_SIZE) {
12451244

1246-
doc_id_t delta;
12471245
doc_id_t del_doc_id = FTS_NULL_DOC_ID;
12481246

1249-
delta = fts_decode_vlc(&enc->src_ilist_ptr);
1247+
doc_id_t delta = fts_decode_vlc(&enc->src_ilist_ptr);
12501248

12511249
test_again:
12521250
/* Check whether the doc id is in the delete list, if

storage/innobase/fts/fts0que.cc

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ Completed 2011/7/10 Sunny and Jimmy Yang
5252
#endif
5353

5454
#include <iomanip>
55+
#include <limits>
5556
#include <vector>
5657

5758
#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
@@ -3154,14 +3155,17 @@ fts_query_find_doc_id(
31543155
ulint freq = 0;
31553156
ulint min_pos = 0;
31563157
ulint last_pos = 0;
3157-
ulint pos = fts_decode_vlc(&ptr);
3158+
const uint64_t delta = fts_decode_vlc(&ptr);
31583159

31593160
/* Add the delta. */
3160-
doc_id += pos;
3161+
doc_id += delta;
31613162

31623163
while (*ptr) {
31633164
++freq;
3164-
last_pos += fts_decode_vlc(&ptr);
3165+
const uint64_t pos_delta = fts_decode_vlc(&ptr);
3166+
ut_ad(uint64_t(last_pos) + pos_delta <=
3167+
std::numeric_limits<ulint>::max());
3168+
last_pos += static_cast<ulint>(pos_delta);
31653169

31663170
/* Only if min_pos is not set and the current
31673171
term exists in a position greater than the
@@ -3234,15 +3238,15 @@ fts_query_filter_doc_ids(
32343238
fts_doc_freq_t* doc_freq;
32353239
fts_match_t* match = NULL;
32363240
ulint last_pos = 0;
3237-
ulint pos = fts_decode_vlc(&ptr);
3241+
const uint64_t delta = fts_decode_vlc(&ptr);
32383242

32393243
/* Some sanity checks. */
32403244
if (doc_id == 0) {
3241-
ut_a(pos == node->first_doc_id);
3245+
ut_a(delta == node->first_doc_id);
32423246
}
32433247

32443248
/* Add the delta. */
3245-
doc_id += pos;
3249+
doc_id += delta;
32463250

32473251
if (calc_doc_count) {
32483252
word_freq->doc_count++;
@@ -3272,7 +3276,10 @@ fts_query_filter_doc_ids(
32723276

32733277
/* Unpack the positions within the document. */
32743278
while (*ptr) {
3275-
last_pos += fts_decode_vlc(&ptr);
3279+
const uint64_t decoded_pos = fts_decode_vlc(&ptr);
3280+
ut_ad(uint64_t(last_pos) + decoded_pos
3281+
<= std::numeric_limits<ulint>::max());
3282+
last_pos += static_cast<ulint>(decoded_pos);
32763283

32773284
/* Collect the matching word positions, for phrase
32783285
matching later. */

storage/innobase/handler/i_s.cc

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*****************************************************************************
22
3-
Copyright (c) 2007, 2023, Oracle and/or its affiliates.
3+
Copyright (c) 2007, 2024, Oracle and/or its affiliates.
44
55
This program is free software; you can redistribute it and/or modify
66
it under the terms of the GNU General Public License, version 2.0,
@@ -3377,13 +3377,15 @@ i_s_fts_index_cache_fill_one_index(
33773377
ptr = node->ilist;
33783378

33793379
while (decoded < node->ilist_size) {
3380-
ulint pos = fts_decode_vlc(&ptr);
3380+
const uint64_t delta = fts_decode_vlc(&ptr);
33813381

3382-
doc_id += pos;
3382+
doc_id += delta;
33833383

33843384
/* Get position info */
33853385
while (*ptr) {
3386-
pos = fts_decode_vlc(&ptr);
3386+
const uint64_t decoded_pos = fts_decode_vlc(&ptr);
3387+
ut_ad(decoded_pos <= std::numeric_limits<ulint>::max());
3388+
const ulint pos = static_cast<ulint>(decoded_pos);
33873389

33883390
OK(field_store_string(
33893391
fields[I_S_FTS_WORD],
@@ -3758,13 +3760,15 @@ i_s_fts_index_table_fill_one_fetch(
37583760
ptr = node->ilist;
37593761

37603762
while (decoded < node->ilist_size) {
3761-
ulint pos = fts_decode_vlc(&ptr);
3763+
const uint64_t delta = fts_decode_vlc(&ptr);
37623764

3763-
doc_id += pos;
3765+
doc_id += delta;
37643766

37653767
/* Get position info */
37663768
while (*ptr) {
3767-
pos = fts_decode_vlc(&ptr);
3769+
const uint64_t decoded_pos = fts_decode_vlc(&ptr);
3770+
ut_ad(decoded_pos <= std::numeric_limits<ulint>::max());
3771+
const ulint pos = static_cast<ulint>(decoded_pos);
37683772

37693773
OK(field_store_string(
37703774
fields[I_S_FTS_WORD],

storage/innobase/include/fts0types.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Created 2007-03-27 Sunny Bains
3535
#ifndef INNOBASE_FTS0TYPES_H
3636
#define INNOBASE_FTS0TYPES_H
3737

38+
#include <stdint.h>
3839
#include "univ.i"
3940
#include "fts0fts.h"
4041
#include "fut0fut.h"
@@ -307,7 +308,7 @@ extern const fts_index_selector_t fts_index_selector[];
307308
/******************************************************************//**
308309
Decode and return the integer that was encoded using our VLC scheme.*/
309310
UNIV_INLINE
310-
ulint
311+
uint64_t
311312
fts_decode_vlc(
312313
/*===========*/
313314
/*!< out: value decoded */
@@ -331,22 +332,22 @@ fts_string_dup(
331332
/******************************************************************//**
332333
Return length of val if it were encoded using our VLC scheme. */
333334
UNIV_INLINE
334-
ulint
335+
unsigned int
335336
fts_get_encoded_len(
336337
/*================*/
337338
/*!< out: length of value
338339
encoded, in bytes */
339-
ulint val); /*!< in: value to encode */
340+
uint64_t val); /*!< in: value to encode */
340341

341342
/******************************************************************//**
342343
Encode an integer using our VLC scheme and return the length in bytes. */
343344
UNIV_INLINE
344-
ulint
345+
unsigned int
345346
fts_encode_int(
346347
/*===========*/
347348
/*!< out: length of value
348349
encoded, in bytes */
349-
ulint val, /*!< in: value to encode */
350+
uint64_t val, /*!< in: value to encode */
350351
byte* buf); /*!< in: buffer, must have
351352
enough space */
352353

storage/innobase/include/fts0vlc.ic

Lines changed: 61 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*****************************************************************************
22

3-
Copyright (c) 2007, 2023, Oracle and/or its affiliates.
3+
Copyright (c) 2007, 2024, Oracle and/or its affiliates.
44

55
This program is free software; you can redistribute it and/or modify
66
it under the terms of the GNU General Public License, version 2.0,
@@ -35,101 +35,97 @@ Created 2007-03-27 Sunny Bains
3535
#ifndef INNOBASE_FTS0VLC_IC
3636
#define INNOBASE_FTS0VLC_IC
3737

38+
#include <stddef.h>
39+
#include <stdint.h>
3840
#include "fts0types.h"
3941

4042
/******************************************************************//**
4143
Return length of val if it were encoded using our VLC scheme.
42-
FIXME: We will need to be able encode 8 bytes value
4344
@return length of value encoded, in bytes */
4445
UNIV_INLINE
45-
ulint
46+
unsigned int
4647
fts_get_encoded_len(
4748
/*================*/
48-
ulint val) /* in: value to encode */
49+
uint64_t val) /* in: value to encode */
4950
{
50-
if (val <= 127) {
51-
return(1);
52-
} else if (val <= 16383) {
53-
return(2);
54-
} else if (val <= 2097151) {
55-
return(3);
56-
} else if (val <= 268435455) {
57-
return(4);
58-
} else {
59-
/* Possibly we should care that on 64-bit machines ulint can
60-
contain values that we can't encode in 5 bytes, but
61-
fts_encode_int doesn't handle them either so it doesn't much
62-
matter. */
63-
64-
return(5);
65-
}
51+
unsigned int length = 1;
52+
for (;;)
53+
{
54+
val >>= 7;
55+
if (val != 0)
56+
{
57+
++length;
58+
}
59+
else
60+
{
61+
break;
62+
}
63+
}
64+
return length;
6665
}
6766

6867
/******************************************************************//**
6968
Encode an integer using our VLC scheme and return the length in bytes.
7069
@return length of value encoded, in bytes */
7170
UNIV_INLINE
72-
ulint
71+
unsigned int
7372
fts_encode_int(
7473
/*===========*/
75-
ulint val, /* in: value to encode */
74+
uint64_t val, /* in: value to encode */
7675
byte* buf) /* in: buffer, must have enough space */
7776
{
78-
ulint len;
79-
80-
if (val <= 127) {
81-
*buf = (byte) val;
82-
83-
len = 1;
84-
} else if (val <= 16383) {
85-
*buf++ = (byte)(val >> 7);
86-
*buf = (byte)(val & 0x7F);
87-
88-
len = 2;
89-
} else if (val <= 2097151) {
90-
*buf++ = (byte)(val >> 14);
91-
*buf++ = (byte)((val >> 7) & 0x7F);
92-
*buf = (byte)(val & 0x7F);
93-
94-
len = 3;
95-
} else if (val <= 268435455) {
96-
*buf++ = (byte)(val >> 21);
97-
*buf++ = (byte)((val >> 14) & 0x7F);
98-
*buf++ = (byte)((val >> 7) & 0x7F);
99-
*buf = (byte)(val & 0x7F);
100-
101-
len = 4;
102-
} else {
103-
/* Best to keep the limitations of the 32/64 bit versions
104-
identical, at least for the time being. */
105-
ut_ad(val <= 4294967295u);
106-
107-
*buf++ = (byte)(val >> 28);
108-
*buf++ = (byte)((val >> 21) & 0x7F);
109-
*buf++ = (byte)((val >> 14) & 0x7F);
110-
*buf++ = (byte)((val >> 7) & 0x7F);
111-
*buf = (byte)(val & 0x7F);
112-
113-
len = 5;
77+
const unsigned int max_length = 10;
78+
/* skip leading zeros */
79+
unsigned int count = max_length - 1;
80+
while (count > 0)
81+
{
82+
/* We split the value into 7 bit batches); so val >= 2^63 need 10 bytes,
83+
2^63 > val >= 2^56 needs 9 bytes, 2^56 > val >= 2^49 needs 8 bytes etc.
84+
*/
85+
if (val >= uint64_t(1) << (count * 7))
86+
{
87+
break;
88+
}
89+
--count;
11490
}
11591

116-
/* High-bit on means "last byte in the encoded integer". */
117-
*buf |= 0x80;
118-
119-
return(len);
92+
unsigned int length = count + 1;
93+
94+
byte *bufptr= buf;
95+
96+
for (;;)
97+
{
98+
*bufptr = (byte)((val >> (7 * count)) & 0x7f);
99+
if (count == 0)
100+
{
101+
/* High-bit on means "last byte in the encoded integer". */
102+
*bufptr |= 0x80;
103+
break;
104+
}
105+
else
106+
{
107+
--count;
108+
++bufptr;
109+
}
110+
}
111+
112+
ut_ad(length <= max_length);
113+
ut_a(bufptr - buf == ptrdiff_t(length) - 1);
114+
115+
return length;
120116
}
121117

122118
/******************************************************************//**
123119
Decode and return the integer that was encoded using our VLC scheme.
124120
@return value decoded */
125121
UNIV_INLINE
126-
ulint
122+
uint64_t
127123
fts_decode_vlc(
128124
/*===========*/
129125
byte** ptr) /* in: ptr to decode from, this ptr is
130126
incremented by the number of bytes decoded */
131127
{
132-
ulint val = 0;
128+
uint64_t val = 0;
133129

134130
for (;;) {
135131
byte b = **ptr;
@@ -145,7 +141,7 @@ fts_decode_vlc(
145141
}
146142
}
147143

148-
return(val);
144+
return val;
149145
}
150146

151147
#endif

unittest/gunit/innodb/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ INCLUDE(${CMAKE_SOURCE_DIR}/storage/innobase/innodb.cmake)
3434

3535
SET(TESTS
3636
#example
37+
fts0vlc
3738
ha_innodb
3839
mem0mem
3940
ut0crc32

0 commit comments

Comments
 (0)