Skip to content

Commit 1edacd8

Browse files
yoshinorimGunnar Kudrjavets
authored andcommitted
Fixing prefix bloom filter (mysql#109)
Summary: This diff fixes two bugs that returned missing rows with prefix bloom filter. 1. Range scan with equal predicates (i.e. WHERE id1=1 AND id2>=1). In this case, equal condition length should be (index_id + id1) and prefix bloom filter should be skipped if the length is shorter than prefix bloom length. Without this diff, equal condition length was (index_id + id1 + id2), which was incorrect. 2. IN clause with different condition length (i.e. WHERE name IN ('AAAAAAA', 'A'). In this case, ha_rocksdb::setup_index_scan() is called multiple times, and equal condition length is different. So whether to use prefix bloom filter should be decided one by one. @update-submodule: rocksdb Test Plan: bloomfilter2 Reviewers: jkedgar, maykov, hermanlee4, spetrunia Reviewed By: spetrunia Differential Revision: https://reviews.facebook.net/D46851 Differential Revision: https://reviews.facebook.net/D47085 Differential Revision: https://reviews.facebook.net/D47169 Differential Revision: https://reviews.facebook.net/D50181
1 parent 3a7ce25 commit 1edacd8

File tree

4 files changed

+169
-12
lines changed

4 files changed

+169
-12
lines changed

storage/rocksdb/ha_rocksdb.cc

Lines changed: 149 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#endif
2020

2121
#include <mysql/plugin.h>
22+
#include <my_global.h>
23+
#include <mysql/psi/mysql_table.h>
2224
#include "ha_rocksdb.h"
2325
#include "ha_rocksdb_proto.h"
2426
#include "rdb_perf_context.h"
@@ -2773,6 +2775,8 @@ int ha_rocksdb::open(const char *name, int mode, uint test_if_locked)
27732775
MYF(0))) ||
27742776
!(sec_key_packed_tuple_old= (uchar*)my_malloc(max_packed_sec_key_len,
27752777
MYF(0))) ||
2778+
!(end_key_packed_tuple= (uchar*)my_malloc(max_packed_sec_key_len,
2779+
MYF(0))) ||
27762780
!((sec_key_tails= (uchar*)my_malloc(tails_size, MYF(0)))) ||
27772781
!((sec_key_tails_old= (uchar*)my_malloc(tails_size, MYF(0)))) ||
27782782
!((pack_buffer=(uchar*)my_malloc(max_packed_sec_key_len, MYF(0)))))
@@ -2815,6 +2819,10 @@ int ha_rocksdb::close(void)
28152819
my_free(sec_key_packed_tuple_old);
28162820
sec_key_packed_tuple_old= NULL;
28172821

2822+
if (end_key_packed_tuple)
2823+
my_free(end_key_packed_tuple);
2824+
end_key_packed_tuple= NULL;
2825+
28182826
if (sec_key_tails)
28192827
my_free(sec_key_tails);
28202828
sec_key_tails= NULL;
@@ -3289,13 +3297,80 @@ int ha_rocksdb::secondary_index_read(int keyno, uchar *buf)
32893297

32903298

32913299
/*
3292-
See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
3293-
index navigation commands are converted into RocksDB lookup commands.
3300+
ha_rocksdb::read_range_first overrides handler::read_range_first.
3301+
The only difference from handler::read_range_first is that
3302+
ha_rocksdb::read_range_first passes end_key to
3303+
ha_rocksdb::index_read_map_impl function.
32943304
*/
3305+
int ha_rocksdb::read_range_first(const key_range *start_key,
3306+
const key_range *end_key,
3307+
bool eq_range_arg, bool sorted)
3308+
{
3309+
int result;
3310+
DBUG_ENTER("ha_rocksdb::read_range_first");
3311+
3312+
eq_range= eq_range_arg;
3313+
set_end_range(end_key, RANGE_SCAN_ASC);
3314+
3315+
range_key_part= table->key_info[active_index].key_part;
3316+
3317+
if (!start_key) // Read first record
3318+
result= ha_index_first(table->record[0]);
3319+
else
3320+
{
3321+
bool is_eq_cond_range_scan= end_key&&(start_key->flag!=HA_READ_KEY_EXACT);
3322+
MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0,
3323+
{ result= index_read_map_impl(table->record[0],
3324+
start_key->key,
3325+
start_key->keypart_map,
3326+
start_key->flag,
3327+
is_eq_cond_range_scan, end_key); })
3328+
}
3329+
if (result)
3330+
DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND)
3331+
? HA_ERR_END_OF_FILE
3332+
: result);
3333+
3334+
if (compare_key(end_range) <= 0)
3335+
{
3336+
DBUG_RETURN(0);
3337+
}
3338+
else
3339+
{
3340+
/*
3341+
The last read row does not fall in the range. So request
3342+
storage engine to release row lock if possible.
3343+
*/
3344+
unlock_row();
3345+
DBUG_RETURN(HA_ERR_END_OF_FILE);
3346+
}
3347+
}
3348+
32953349

32963350
int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
32973351
key_part_map keypart_map,
32983352
enum ha_rkey_function find_flag)
3353+
{
3354+
return index_read_map_impl(buf, key, keypart_map, find_flag, false, NULL);
3355+
}
3356+
3357+
/*
3358+
See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
3359+
index navigation commands are converted into RocksDB lookup commands.
3360+
3361+
This function takes end_key as an argument, and it is set on range scan.
3362+
MyRocks needs to decide whether prefix bloom filter can be used or not.
3363+
To decide to use prefix bloom filter or not, calculating equal condition length
3364+
is needed. On equal lookups (find_flag == HA_READ_KEY_EXACT), equal
3365+
condition length is the same as rocksdb::Slice.size() of the start key.
3366+
On range scan, equal condition length is MIN(start_key, end_key) of the
3367+
rocksdb::Slice expression.
3368+
*/
3369+
int ha_rocksdb::index_read_map_impl(uchar *buf, const uchar *key,
3370+
key_part_map keypart_map,
3371+
enum ha_rkey_function find_flag,
3372+
const bool is_eq_cond_range_scan,
3373+
const key_range *end_key)
32993374
{
33003375
int rc= 0;
33013376
DBUG_ENTER("ha_rocksdb::index_read_map");
@@ -3330,10 +3405,19 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
33303405
DBUG_RETURN(rc);
33313406
}
33323407

3408+
uint end_key_packed_size= 0;
3409+
if (is_eq_cond_range_scan && end_key)
3410+
{
3411+
end_key_packed_size= kd->pack_index_tuple(table, pack_buffer,
3412+
end_key_packed_tuple, end_key->key,
3413+
end_key->keypart_map);
3414+
}
3415+
33333416
/*
33343417
Unique secondary index performs lookups without the extended key fields
33353418
*/
3336-
uint packed_size; if (active_index != table->s->primary_key &&
3419+
uint packed_size;
3420+
if (active_index != table->s->primary_key &&
33373421
table->key_info[active_index].flags & HA_NOSAME &&
33383422
find_flag == HA_READ_KEY_EXACT && using_full_key)
33393423
{
@@ -3346,9 +3430,11 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
33463430
using_full_key= false;
33473431
}
33483432
else
3433+
{
33493434
packed_size= kd->pack_index_tuple(table, pack_buffer,
33503435
sec_key_packed_tuple, key,
33513436
keypart_map);
3437+
}
33523438
if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
33533439
(find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST))
33543440
{
@@ -3379,6 +3465,34 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
33793465

33803466
rocksdb::Slice slice((char*)sec_key_packed_tuple, packed_size);
33813467

3468+
uint eq_cond_len= 0;
3469+
if (find_flag == HA_READ_KEY_EXACT)
3470+
{
3471+
eq_cond_len= slice.size();
3472+
}
3473+
else if (is_eq_cond_range_scan && end_key_packed_size > 0)
3474+
{
3475+
/*
3476+
Calculating length of the equal conditions here. 4 byte index id is included.
3477+
Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
3478+
WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
3479+
WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
3480+
Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
3481+
WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
3482+
*/
3483+
rocksdb::Slice end_slice((char*)end_key_packed_tuple, end_key_packed_size);
3484+
eq_cond_len= slice.difference_offset(end_slice);
3485+
}
3486+
else
3487+
{
3488+
/*
3489+
On range scan without any end key condition, there is no
3490+
eq cond, and eq cond length is the same as index_id size (4 bytes).
3491+
Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
3492+
WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
3493+
*/
3494+
eq_cond_len= RDBSE_KEYDEF::INDEX_NUMBER_SIZE;
3495+
}
33823496

33833497
bool use_all_keys= false;
33843498
if (find_flag == HA_READ_KEY_EXACT &&
@@ -3389,7 +3503,8 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
33893503
This will open the iterator and position it at a record that's equal or
33903504
greater than the lookup tuple.
33913505
*/
3392-
setup_index_scan(kd, &slice, use_all_keys, is_ascending(kd, find_flag));
3506+
setup_index_scan(kd, &slice, use_all_keys, is_ascending(kd, find_flag),
3507+
eq_cond_len);
33933508
bool move_forward= true;
33943509

33953510
switch (find_flag) {
@@ -4603,11 +4718,37 @@ int ha_rocksdb::update_write_row(const uchar *old_data,
46034718

46044719
void ha_rocksdb::setup_index_scan(RDBSE_KEYDEF *keydef,
46054720
rocksdb::Slice *slice,
4606-
bool use_all_keys,
4607-
bool is_ascending)
4721+
const bool use_all_keys,
4722+
const bool is_ascending,
4723+
const uint eq_cond_len)
46084724
{
4725+
DBUG_ASSERT(slice->size() >= eq_cond_len);
46094726
Rdb_transaction *trx= get_or_create_trx(table->in_use);
46104727

4728+
bool skip_bloom= true;
4729+
4730+
rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
4731+
if (can_use_bloom_filter(keydef, eq_cond, use_all_keys, is_ascending))
4732+
skip_bloom= false;
4733+
4734+
/*
4735+
In some cases, setup_index_scan() is called multiple times from
4736+
the same query but bloom filter can not always be used.
4737+
Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY (id1, id2).
4738+
select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000', '100');
4739+
In this case, setup_index_scan() is called twice, the first time is for
4740+
(id1, id2)=(100, '00000000000000000000') and the second time is for (100, '100').
4741+
If prefix bloom filter length is 24 bytes, prefix bloom filter can be used for the
4742+
first condition but not for the second condition.
4743+
If bloom filter condition is changed, currently it is necessary to destroy and
4744+
re-create Iterator.
4745+
*/
4746+
if (scan_it && scan_it->skip_bloom != skip_bloom)
4747+
{
4748+
delete scan_it;
4749+
scan_it= NULL;
4750+
}
4751+
46114752
/*
46124753
SQL layer can call rnd_init() multiple times in a row.
46134754
In that case, re-use the iterator, but re-position it at the table start.
@@ -4619,13 +4760,12 @@ void ha_rocksdb::setup_index_scan(RDBSE_KEYDEF *keydef,
46194760
options.fill_cache= false;
46204761
if (!lock_rows)
46214762
options.snapshot= trx->snapshot;
4622-
if (!can_use_bloom_filter(keydef, *slice, use_all_keys, is_ascending))
4623-
options.total_order_seek= true;
4763+
options.total_order_seek= skip_bloom;
46244764
rocksdb::Iterator* rocksdb_it= rdb->NewIterator(options, keydef->get_cf());
46254765
scan_it= new Apply_changes_iter;
46264766
scan_it->init(keydef->is_reverse_cf, &trx->changes, rocksdb_it);
4767+
scan_it->skip_bloom= skip_bloom;
46274768
}
4628-
46294769
/*
46304770
Seek() will "Position at the first key in the source that at or past target".
46314771
The operation cannot fail.

storage/rocksdb/ha_rocksdb.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,12 @@ class ha_rocksdb: public handler
161161
*/
162162
uchar *sec_key_packed_tuple;
163163

164+
/*
165+
Temporary buffers for storing end key part of the Key/Value pair.
166+
This is used for range scan only.
167+
*/
168+
uchar *end_key_packed_tuple;
169+
164170
/*
165171
Same as above, but it stores the value part. It holds unpack info which
166172
is currently not used (always empty).
@@ -242,11 +248,12 @@ class ha_rocksdb: public handler
242248
int secondary_index_read(int keyno, uchar *buf);
243249
void setup_index_scan(RDBSE_KEYDEF *keydef, rocksdb::Slice *slice)
244250
{
245-
setup_index_scan(keydef, slice, false, false);
251+
setup_index_scan(keydef, slice, false, false, 0);
246252
}
247253
bool is_ascending(RDBSE_KEYDEF *keydef, enum ha_rkey_function find_flag);
248254
void setup_index_scan(RDBSE_KEYDEF *keydef, rocksdb::Slice *slice,
249-
bool use_all_keys, bool is_ascending);
255+
const bool use_all_keys, const bool is_ascending,
256+
const uint eq_cond_len);
250257
int get_row_by_rowid(uchar *buf, const char *pk_tuple, uint pk_tuple_size);
251258

252259
void update_auto_incr_val();
@@ -433,10 +440,19 @@ class ha_rocksdb: public handler
433440
int index_read_map(uchar * buf, const uchar * key,
434441
key_part_map keypart_map,
435442
enum ha_rkey_function find_flag);
443+
int index_read_map_impl(uchar * buf, const uchar * key,
444+
key_part_map keypart_map,
445+
enum ha_rkey_function find_flag,
446+
const bool is_eq_cond_range_scan,
447+
const key_range *end_key);
436448

437449
int index_read_last_map(uchar * buf, const uchar * key,
438450
key_part_map keypart_map);
439451

452+
int read_range_first(const key_range *start_key,
453+
const key_range *end_key,
454+
bool eq_range, bool sorted);
455+
440456
virtual double scan_time() { return (double) (stats.records+stats.deleted) / 20.0+10; }
441457
virtual double read_time(uint, uint, ha_rows rows)
442458
{ return (double) rows / 20.0+1; }

storage/rocksdb/rdb_applyiter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class Apply_changes_iter
5353
bool Valid() { return valid; }
5454
rocksdb::Slice key();
5555
rocksdb::Slice value();
56+
bool skip_bloom;
5657
private:
5758
void advance(int direction);
5859

storage/rocksdb/rdb_datadic.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ uint RDBSE_KEYDEF::pack_index_tuple(TABLE *tbl, uchar *pack_buffer,
396396
{
397397
/* We were given a record in KeyTupleFormat. First, save it to record */
398398
uint key_len= calculate_key_len(tbl, keyno, key_tuple, keypart_map);
399-
key_restore(tbl->record[0], (uchar*)key_tuple, &tbl->key_info[keyno],
399+
key_restore(tbl->record[0], (uchar*)key_tuple, &tbl->key_info[keyno],
400400
key_len);
401401

402402
uint n_used_parts= my_count_bits(keypart_map);

0 commit comments

Comments
 (0)