Skip to content

Commit 20fa302

Browse files
spetruniaGunnar Kudrjavets
authored andcommitted
Fix issue #14: Doubling space by primary key
Summary: SQL table records are stored as (pk_keyval -> record_val) key-value pairs. When PK column datatype allows to decode PK column value from its mem-comparable form in pk_keyval, we don't need to store the column in record_val. Test Plan: ./mtr t/rocks*test && ./mtr --suite=rocksdb && ./mtr --suite=rocksdb_rpl Reviewers: hermanlee4, jonahcohen, maykov, yoshinorim Reviewed By: yoshinorim Differential Revision: https://reviews.facebook.net/D33117
1 parent 135fb9f commit 20fa302

File tree

4 files changed

+69
-6
lines changed

4 files changed

+69
-6
lines changed

storage/rocksdb/ha_rocksdb.cc

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ class Primary_key_comparator : public rocksdb::Comparator
711711

712712
/* The following is not needed by RocksDB, but conceptually should be here: */
713713
static ulong get_hashnr(const char *key, size_t key_len);
714-
const char* Name() const { return "RocksDB_SE_v3.1"; }
714+
const char* Name() const { return "RocksDB_SE_v3.2"; }
715715

716716
//TODO: advanced funcs:
717717
// - FindShortestSeparator
@@ -728,7 +728,7 @@ class Reverse_comparator : public rocksdb::Comparator
728728
{
729729
return -bytewise_compare(a,b);
730730
}
731-
const char* Name() const { return "rev:RocksDB_SE_v3.1"; }
731+
const char* Name() const { return "rev:RocksDB_SE_v3.2"; }
732732
void FindShortestSeparator(std::string* start, const rocksdb::Slice& limit) const {}
733733
void FindShortSuccessor(std::string* key) const {}
734734
};
@@ -1556,6 +1556,10 @@ void ha_rocksdb::convert_record_to_storage_format(rocksdb::Slice *packed_rec)
15561556

15571557
for (uint i=0; i < table->s->fields; i++)
15581558
{
1559+
/* Don't pack decode-able PK key parts */
1560+
if (field_enc[i].skip)
1561+
continue;
1562+
15591563
Field *field= table->field[i];
15601564
if (field_enc[i].maybe_null())
15611565
{
@@ -1624,7 +1628,9 @@ int ha_rocksdb::convert_record_from_storage_format(rocksdb::Slice *slice,
16241628

16251629
/*
16261630
Unpack the record in this->retrieved_record from storage format into
1627-
buf (which can be table->record[0] or table->record[1])
1631+
buf (which can be table->record[0] or table->record[1]).
1632+
1633+
psergey-todo: this->last_rowkey has a part of the data.
16281634
16291635
If the table has blobs, the unpacked data in buf may keep pointers to the
16301636
data in this->retrieved_record.
@@ -1639,12 +1645,26 @@ int ha_rocksdb::convert_record_from_storage_format(uchar * buf)
16391645
Stream_reader reader(retrieved_record);
16401646
my_ptrdiff_t ptr_diff= buf - table->record[0];
16411647

1642-
const char *null_bytes;
1643-
if (!(null_bytes= reader.read(null_bytes_in_rec)))
1648+
/*
1649+
Decode PK fields from the key
1650+
*/
1651+
rocksdb::Slice rowkey_slice(last_rowkey.ptr(), last_rowkey.length());
1652+
1653+
DBUG_ASSERT(pk_descr->get_unpack_data_len() == 0);
1654+
rocksdb::Slice pk_unpack_info_slice;
1655+
if (pk_descr->unpack_record(table, buf, &rowkey_slice, &pk_unpack_info_slice))
1656+
return HA_ERR_INTERNAL_ERROR;
1657+
1658+
/* Other fields are decoded from the value */
1659+
const char * UNINIT_VAR(null_bytes);
1660+
if (null_bytes_in_rec && !(null_bytes= reader.read(null_bytes_in_rec)))
16441661
return HA_ERR_INTERNAL_ERROR;
16451662

16461663
for (uint i=0; i < table->s->fields; i++)
16471664
{
1665+
if (field_enc[i].skip)
1666+
continue;
1667+
16481668
Field *field= table->field[i];
16491669

16501670
int isNull = field_enc[i].maybe_null() &&
@@ -1765,6 +1785,29 @@ void ha_rocksdb::setup_field_converters()
17651785
for (i= 0; i < table->s->fields; i++)
17661786
{
17671787
Field *field= table->field[i];
1788+
field_enc[i].skip= false;
1789+
1790+
/*
1791+
Check if this field is
1792+
- a part of primary key, and
1793+
- it can be decoded back from its key image.
1794+
If both hold, we don't need to store this field in the value part of
1795+
RocksDB's key-value pair.
1796+
*/
1797+
if (field->part_of_key.is_set(table->s->primary_key))
1798+
{
1799+
KEY *pk_info= &table->key_info[table->s->primary_key];
1800+
for (uint kp= 0; kp < pk_info->user_defined_key_parts; kp++)
1801+
{
1802+
if (field->field_index + 1 == pk_info->key_part[kp].fieldnr)
1803+
{
1804+
if (pk_descr->can_unpack(kp))
1805+
field_enc[i].skip= true; /* Don't store */
1806+
break;
1807+
}
1808+
}
1809+
}
1810+
17681811
field_enc[i].field_type= field->real_type();
17691812

17701813
if (field->real_maybe_null())

storage/rocksdb/ha_rocksdb.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ class ha_rocksdb: public handler
192192
*/
193193
typedef struct st_field_encoder
194194
{
195+
/* skip=true means this is decodeable part of PK and so not stored */
196+
bool skip;
197+
195198
uint null_offset;
196199
uchar null_mask; // 0 means the field cannot be null
197200

storage/rocksdb/rdb_datadic.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,12 @@ int RDBSE_KEYDEF::unpack_record(TABLE *table, uchar *buf,
485485
return 0;
486486
}
487487

488+
489+
bool RDBSE_KEYDEF::can_unpack(uint kp) const
490+
{
491+
return (pack_info[kp].unpack_func != NULL);
492+
}
493+
488494
///////////////////////////////////////////////////////////////////////////////////////////
489495
// Field_pack_info
490496
///////////////////////////////////////////////////////////////////////////////////////////

storage/rocksdb/rdb_datadic.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,11 @@ class Stream_reader
4545
public:
4646
explicit Stream_reader(const std::string &str)
4747
{
48-
ptr= &str.at(0);
4948
len= str.length();
49+
if (len)
50+
ptr= &str.at(0);
51+
else
52+
ptr= NULL;
5053
}
5154

5255
explicit Stream_reader(const rocksdb::Slice *slice)
@@ -247,6 +250,14 @@ class RDBSE_KEYDEF
247250

248251
rocksdb::ColumnFamilyHandle *get_cf() { return cf_handle; }
249252

253+
/* Check if keypart #kp can be unpacked from index tuple */
254+
bool can_unpack(uint kp) const;
255+
256+
/*
257+
Current code assumes that unpack_data occupies fixed length regardless of
258+
the value that is stored.
259+
*/
260+
bool get_unpack_data_len() { return unpack_data_len; }
250261
private:
251262

252263
/* Global number of this index (used as prefix in StorageFormat) */

0 commit comments

Comments
 (0)