19
19
#endif
20
20
21
21
#include < mysql/plugin.h>
22
+ #include < my_global.h>
23
+ #include < mysql/psi/mysql_table.h>
22
24
#include " ha_rocksdb.h"
23
25
#include " ha_rocksdb_proto.h"
24
26
#include " rdb_perf_context.h"
@@ -2773,6 +2775,8 @@ int ha_rocksdb::open(const char *name, int mode, uint test_if_locked)
2773
2775
MYF (0 ))) ||
2774
2776
!(sec_key_packed_tuple_old= (uchar*)my_malloc (max_packed_sec_key_len,
2775
2777
MYF (0 ))) ||
2778
+ !(end_key_packed_tuple= (uchar*)my_malloc (max_packed_sec_key_len,
2779
+ MYF (0 ))) ||
2776
2780
!((sec_key_tails= (uchar*)my_malloc (tails_size, MYF (0 )))) ||
2777
2781
!((sec_key_tails_old= (uchar*)my_malloc (tails_size, MYF (0 )))) ||
2778
2782
!((pack_buffer=(uchar*)my_malloc (max_packed_sec_key_len, MYF (0 )))))
@@ -2815,6 +2819,10 @@ int ha_rocksdb::close(void)
2815
2819
my_free (sec_key_packed_tuple_old);
2816
2820
sec_key_packed_tuple_old= NULL ;
2817
2821
2822
+ if (end_key_packed_tuple)
2823
+ my_free (end_key_packed_tuple);
2824
+ end_key_packed_tuple= NULL ;
2825
+
2818
2826
if (sec_key_tails)
2819
2827
my_free (sec_key_tails);
2820
2828
sec_key_tails= NULL ;
@@ -3289,13 +3297,80 @@ int ha_rocksdb::secondary_index_read(int keyno, uchar *buf)
3289
3297
3290
3298
3291
3299
/*
3292
- See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
3293
- index navigation commands are converted into RocksDB lookup commands.
3300
+ ha_rocksdb::read_range_first overrides handler::read_range_first.
3301
+ The only difference from handler::read_range_first is that
3302
+ ha_rocksdb::read_range_first passes end_key to
3303
+ ha_rocksdb::index_read_map_impl function.
3294
3304
*/
3305
+ int ha_rocksdb::read_range_first (const key_range *start_key,
3306
+ const key_range *end_key,
3307
+ bool eq_range_arg, bool sorted)
3308
+ {
3309
+ int result;
3310
+ DBUG_ENTER (" ha_rocksdb::read_range_first" );
3311
+
3312
+ eq_range= eq_range_arg;
3313
+ set_end_range (end_key, RANGE_SCAN_ASC);
3314
+
3315
+ range_key_part= table->key_info [active_index].key_part ;
3316
+
3317
+ if (!start_key) // Read first record
3318
+ result= ha_index_first (table->record [0 ]);
3319
+ else
3320
+ {
3321
+ bool is_eq_cond_range_scan= end_key&&(start_key->flag !=HA_READ_KEY_EXACT);
3322
+ MYSQL_TABLE_IO_WAIT (m_psi, PSI_TABLE_FETCH_ROW, active_index, 0 ,
3323
+ { result= index_read_map_impl (table->record [0 ],
3324
+ start_key->key ,
3325
+ start_key->keypart_map ,
3326
+ start_key->flag ,
3327
+ is_eq_cond_range_scan, end_key); })
3328
+ }
3329
+ if (result)
3330
+ DBUG_RETURN ((result == HA_ERR_KEY_NOT_FOUND)
3331
+ ? HA_ERR_END_OF_FILE
3332
+ : result);
3333
+
3334
+ if (compare_key (end_range) <= 0 )
3335
+ {
3336
+ DBUG_RETURN (0 );
3337
+ }
3338
+ else
3339
+ {
3340
+ /*
3341
+ The last read row does not fall in the range. So request
3342
+ storage engine to release row lock if possible.
3343
+ */
3344
+ unlock_row ();
3345
+ DBUG_RETURN (HA_ERR_END_OF_FILE);
3346
+ }
3347
+ }
3348
+
3295
3349
3296
3350
int ha_rocksdb::index_read_map (uchar *buf, const uchar *key,
3297
3351
key_part_map keypart_map,
3298
3352
enum ha_rkey_function find_flag)
3353
+ {
3354
+ return index_read_map_impl (buf, key, keypart_map, find_flag, false , NULL );
3355
+ }
3356
+
3357
+ /*
3358
+ See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
3359
+ index navigation commands are converted into RocksDB lookup commands.
3360
+
3361
+ This function takes end_key as an argument, which is set on range scans.
3362
+ MyRocks needs to decide whether the prefix bloom filter can be used or not.
3363
+ To make that decision, the length of the equal condition must be calculated.
3364
+ On equal lookups (find_flag == HA_READ_KEY_EXACT), the equal
3365
+ condition length is the same as rocksdb::Slice.size() of the start key.
3366
+ On range scans, the equal condition length is the length of the common prefix
3367
+ of the packed start key and end key (rocksdb::Slice::difference_offset).
3368
+ */
3369
+ int ha_rocksdb::index_read_map_impl (uchar *buf, const uchar *key,
3370
+ key_part_map keypart_map,
3371
+ enum ha_rkey_function find_flag,
3372
+ const bool is_eq_cond_range_scan,
3373
+ const key_range *end_key)
3299
3374
{
3300
3375
int rc= 0 ;
3301
3376
DBUG_ENTER (" ha_rocksdb::index_read_map" );
@@ -3330,10 +3405,19 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
3330
3405
DBUG_RETURN (rc);
3331
3406
}
3332
3407
3408
+ uint end_key_packed_size= 0 ;
3409
+ if (is_eq_cond_range_scan && end_key)
3410
+ {
3411
+ end_key_packed_size= kd->pack_index_tuple (table, pack_buffer,
3412
+ end_key_packed_tuple, end_key->key ,
3413
+ end_key->keypart_map );
3414
+ }
3415
+
3333
3416
/*
3334
3417
Unique secondary index performs lookups without the extended key fields
3335
3418
*/
3336
- uint packed_size; if (active_index != table->s ->primary_key &&
3419
+ uint packed_size;
3420
+ if (active_index != table->s ->primary_key &&
3337
3421
table->key_info [active_index].flags & HA_NOSAME &&
3338
3422
find_flag == HA_READ_KEY_EXACT && using_full_key)
3339
3423
{
@@ -3346,9 +3430,11 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
3346
3430
using_full_key= false ;
3347
3431
}
3348
3432
else
3433
+ {
3349
3434
packed_size= kd->pack_index_tuple (table, pack_buffer,
3350
3435
sec_key_packed_tuple, key,
3351
3436
keypart_map);
3437
+ }
3352
3438
if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
3353
3439
(find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST))
3354
3440
{
@@ -3379,6 +3465,34 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
3379
3465
3380
3466
rocksdb::Slice slice ((char *)sec_key_packed_tuple, packed_size);
3381
3467
3468
+ uint eq_cond_len= 0 ;
3469
+ if (find_flag == HA_READ_KEY_EXACT)
3470
+ {
3471
+ eq_cond_len= slice.size ();
3472
+ }
3473
+ else if (is_eq_cond_range_scan && end_key_packed_size > 0 )
3474
+ {
3475
+ /*
3476
+ Calculating length of the equal conditions here. 4 byte index id is included.
3477
+ Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
3478
+ WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
3479
+ WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
3480
+ Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
3481
+ WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
3482
+ */
3483
+ rocksdb::Slice end_slice ((char *)end_key_packed_tuple, end_key_packed_size);
3484
+ eq_cond_len= slice.difference_offset (end_slice);
3485
+ }
3486
+ else
3487
+ {
3488
+ /*
3489
+ On range scan without any end key condition, there is no
3490
+ eq cond, and eq cond length is the same as index_id size (4 bytes).
3491
+ Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
3492
+ WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
3493
+ */
3494
+ eq_cond_len= RDBSE_KEYDEF::INDEX_NUMBER_SIZE;
3495
+ }
3382
3496
3383
3497
bool use_all_keys= false ;
3384
3498
if (find_flag == HA_READ_KEY_EXACT &&
@@ -3389,7 +3503,8 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
3389
3503
This will open the iterator and position it at a record that's equal or
3390
3504
greater than the lookup tuple.
3391
3505
*/
3392
- setup_index_scan (kd, &slice, use_all_keys, is_ascending (kd, find_flag));
3506
+ setup_index_scan (kd, &slice, use_all_keys, is_ascending (kd, find_flag),
3507
+ eq_cond_len);
3393
3508
bool move_forward= true ;
3394
3509
3395
3510
switch (find_flag) {
@@ -4603,11 +4718,37 @@ int ha_rocksdb::update_write_row(const uchar *old_data,
4603
4718
4604
4719
void ha_rocksdb::setup_index_scan (RDBSE_KEYDEF *keydef,
4605
4720
rocksdb::Slice *slice,
4606
- bool use_all_keys,
4607
- bool is_ascending)
4721
+ const bool use_all_keys,
4722
+ const bool is_ascending,
4723
+ const uint eq_cond_len)
4608
4724
{
4725
+ DBUG_ASSERT (slice->size () >= eq_cond_len);
4609
4726
Rdb_transaction *trx= get_or_create_trx (table->in_use );
4610
4727
4728
+ bool skip_bloom= true ;
4729
+
4730
+ rocksdb::Slice eq_cond (slice->data (), eq_cond_len);
4731
+ if (can_use_bloom_filter (keydef, eq_cond, use_all_keys, is_ascending))
4732
+ skip_bloom= false ;
4733
+
4734
+ /*
4735
+ In some cases, setup_index_scan() is called multiple times from
4736
+ the same query but bloom filter can not always be used.
4737
+ Consider the following query, where id2 is VARCHAR(30) and the PRIMARY KEY is (id1, id2).
4738
+ select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000', '100');
4739
+ In this case, setup_index_scan() is called twice, the first time is for
4740
+ (id1, id2)=(100, '00000000000000000000') and the second time is for (100, '100').
4741
+ If prefix bloom filter length is 24 bytes, prefix bloom filter can be used for the
4742
+ first condition but not for the second condition.
4743
+ If the bloom filter decision changes between calls, the iterator currently has to be
4744
+ destroyed and re-created.
4745
+ */
4746
+ if (scan_it && scan_it->skip_bloom != skip_bloom)
4747
+ {
4748
+ delete scan_it;
4749
+ scan_it= NULL ;
4750
+ }
4751
+
4611
4752
/*
4612
4753
SQL layer can call rnd_init() multiple times in a row.
4613
4754
In that case, re-use the iterator, but re-position it at the table start.
@@ -4619,13 +4760,12 @@ void ha_rocksdb::setup_index_scan(RDBSE_KEYDEF *keydef,
4619
4760
options.fill_cache = false ;
4620
4761
if (!lock_rows)
4621
4762
options.snapshot = trx->snapshot ;
4622
- if (!can_use_bloom_filter (keydef, *slice, use_all_keys, is_ascending))
4623
- options.total_order_seek = true ;
4763
+ options.total_order_seek = skip_bloom;
4624
4764
rocksdb::Iterator* rocksdb_it= rdb->NewIterator (options, keydef->get_cf ());
4625
4765
scan_it= new Apply_changes_iter;
4626
4766
scan_it->init (keydef->is_reverse_cf , &trx->changes , rocksdb_it);
4767
+ scan_it->skip_bloom = skip_bloom;
4627
4768
}
4628
-
4629
4769
/*
4630
4770
Seek() will "Position at the first key in the source that at or past target".
4631
4771
The operation cannot fail.
0 commit comments