Skip to content

Commit f36dff7

Browse files
authored
Efficiently encode multi-valued dimensions (#105271)
Detects and efficiently encodes cyclic ordinals, as proposed by @jpountz. This is beneficial for encoding multi-valued dimensions, such as host.ip. This is a follow-up to #99747.
1 parent 263ea5e commit f36dff7

File tree

3 files changed

+118
-26
lines changed

3 files changed

+118
-26
lines changed

server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat;
2424
import org.elasticsearch.index.mapper.DateFieldMapper;
2525
import org.elasticsearch.index.mapper.IdFieldMapper;
26+
import org.elasticsearch.index.mapper.IpFieldMapper;
2627
import org.elasticsearch.index.mapper.KeywordFieldMapper;
2728
import org.elasticsearch.index.mapper.Mapper;
2829
import org.elasticsearch.index.mapper.MapperService;
@@ -125,6 +126,9 @@ boolean useTSDBDocValuesFormat(final String field) {
125126
if (mappingLookup.getMapper(field) instanceof TimeSeriesIdFieldMapper) {
126127
return true;
127128
}
129+
if (mappingLookup.getMapper(field) instanceof IpFieldMapper) {
130+
return true;
131+
}
128132
}
129133
return false;
130134
}

server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesEncoder.java

Lines changed: 66 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -182,35 +182,57 @@ void encode(long[] in, DataOutput out) throws IOException {
182182
* Optimizes for encoding sorted fields where we expect a block to mostly either be the same value
183183
* or to make a transition from one value to a second one.
184184
* <p>
185-
* Encodes blocks in the following format:
185+
* The header is a vlong where the number of trailing ones defines the encoding strategy:
186186
* <ul>
187-
* <li>byte 0: 1/2 bits header+6/7 bits data</li>
188-
* <li>byte 1..n: data</li>
189-
* </ul>
190-
* The header (first 1 or 2 bits) describes how the data is encoded:
191-
* <ul>
192-
* <li>?0 block has a single value (vlong), 2nd bit already contains data</li>
193-
* <li>
194-
* 01 block has two runs, data contains value 1 (vlong), run-length (vint) of value 1,
195-
* and delta from first to second value (zlong)
196-
* </li>
197-
* <li>11 block is bit-packed</li>
187+
* <li>0: single run</li>
188+
* <li>1: two runs</li>
189+
* <li>2: bit-packed</li>
190+
* <li>3: cycle</li>
198191
* </ul>
199192
*/
200193
void encodeOrdinals(long[] in, DataOutput out, int bitsPerOrd) throws IOException {
201194
assert in.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;
202195
int numRuns = 1;
196+
long firstValue = in[0];
197+
long previousValue = firstValue;
198+
boolean cyclic = false;
199+
int cycleLength = 0;
203200
for (int i = 1; i < in.length; ++i) {
204-
if (in[i - 1] != in[i]) {
201+
long currentValue = in[i];
202+
if (previousValue != currentValue) {
205203
numRuns++;
206204
}
205+
if (currentValue == firstValue && cycleLength != -1) {
206+
if (cycleLength == 0) {
207+
// first candidate cycle detected
208+
cycleLength = i;
209+
} else if (cycleLength == 1 || i % cycleLength != 0) {
210+
// if the first two values are the same, this isn't a cycle (it might be a run, though)
211+
// this also isn't a cycle if the index of the next occurrence of the first value
212+
// isn't a multiple of the candidate cycle length
213+
// we can stop looking for cycles now
214+
cycleLength = -1;
215+
}
216+
}
217+
previousValue = currentValue;
218+
}
219+
// if the cycle is too long, bit-packing may be more space efficient
220+
int maxCycleLength = in.length / 4;
221+
if (numRuns > 2 && cycleLength > 1 && cycleLength <= maxCycleLength) {
222+
cyclic = true;
223+
for (int i = cycleLength; i < in.length; ++i) {
224+
if (in[i] != in[i - cycleLength]) {
225+
cyclic = false;
226+
break;
227+
}
228+
}
207229
}
208230
if (numRuns == 1 && bitsPerOrd < 63) {
209231
long value = in[0];
210-
// set first bit to 0 to indicate the block has a single run
232+
// unset first bit (0 trailing ones) to indicate the block has a single run
211233
out.writeVLong(value << 1);
212234
} else if (numRuns == 2 && bitsPerOrd < 62) {
213-
// set first two bits to 01 to indicate the block has two runs
235+
// set 1 trailing bit to indicate the block has two runs
214236
out.writeVLong((in[0] << 2) | 0b01);
215237
int firstRunLen = in.length;
216238
for (int i = 1; i < in.length; ++i) {
@@ -221,8 +243,15 @@ void encodeOrdinals(long[] in, DataOutput out, int bitsPerOrd) throws IOExceptio
221243
}
222244
out.writeVInt(firstRunLen);
223245
out.writeZLong(in[in.length - 1] - in[0]);
246+
} else if (cyclic) {
247+
// set 3 trailing bits to indicate the block cycles through the same values
248+
long headerAndCycleLength = ((long) cycleLength << 4) | 0b0111;
249+
out.writeVLong(headerAndCycleLength);
250+
for (int i = 0; i < cycleLength; i++) {
251+
out.writeVLong(in[i]);
252+
}
224253
} else {
225-
// set first two bits to 11 to indicate the block is bit-packed
254+
// set 2 trailing bits to indicate the block is bit-packed
226255
out.writeVLong(0b11);
227256
forUtil.encode(in, bitsPerOrd, out);
228257
}
@@ -232,20 +261,32 @@ void decodeOrdinals(DataInput in, long[] out, int bitsPerOrd) throws IOException
232261
assert out.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE : out.length;
233262

234263
long v1 = in.readVLong();
235-
int header = (int) (v1 & 0b11L);
236-
if (header == 0b00 || header == 0b10) {
237-
// first bit is zero -> single run
238-
Arrays.fill(out, v1 >>> 1);
239-
} else if (header == 0b01) {
240-
// first two bits are 01 -> two runs
241-
v1 = v1 >>> 2;
264+
int encoding = Long.numberOfTrailingZeros(~v1);
265+
v1 >>>= encoding + 1;
266+
if (encoding == 0) {
267+
// single run
268+
Arrays.fill(out, v1);
269+
} else if (encoding == 1) {
270+
// two runs
242271
int runLen = in.readVInt();
243272
long v2 = v1 + in.readZLong();
244273
Arrays.fill(out, 0, runLen, v1);
245274
Arrays.fill(out, runLen, out.length, v2);
246-
} else {
247-
// first two bits are 11 -> bit-packed
275+
} else if (encoding == 2) {
276+
// bit-packed
248277
forUtil.decode(bitsPerOrd, in, out);
278+
} else if (encoding == 3) {
279+
// cycle encoding
280+
int cycleLength = (int) v1;
281+
for (int i = 0; i < cycleLength; i++) {
282+
out[i] = in.readVLong();
283+
}
284+
int length = cycleLength;
285+
while (length < out.length) {
286+
int copyLength = Math.min(length, out.length - length);
287+
System.arraycopy(out, 0, out, length, copyLength);
288+
length += copyLength;
289+
}
249290
}
250291
}
251292

server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesEncoderTests.java

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,12 +260,59 @@ public void testEncodeOrdinalsNoRepetitions() throws IOException {
260260
doTestOrdinals(arr, 113);
261261
}
262262

263+
public void testEncodeOrdinalsBitPack3Bits() throws IOException {
264+
long[] arr = new long[blockSize];
265+
Arrays.fill(arr, 4);
266+
for (int i = 0; i < 4; i++) {
267+
arr[i] = i;
268+
}
269+
doTestOrdinals(arr, 49);
270+
}
271+
272+
public void testEncodeOrdinalsCycle2() throws IOException {
273+
long[] arr = new long[blockSize];
274+
Arrays.setAll(arr, i -> i % 2);
275+
doTestOrdinals(arr, 3);
276+
}
277+
278+
public void testEncodeOrdinalsCycle3() throws IOException {
279+
long[] arr = new long[blockSize];
280+
Arrays.setAll(arr, i -> i % 3);
281+
doTestOrdinals(arr, 4);
282+
}
283+
284+
public void testEncodeOrdinalsLongCycle() throws IOException {
285+
long[] arr = new long[blockSize];
286+
Arrays.setAll(arr, i -> i % 32);
287+
doTestOrdinals(arr, 34);
288+
}
289+
290+
public void testEncodeOrdinalsCycleTooLong() throws IOException {
291+
long[] arr = new long[blockSize];
292+
Arrays.setAll(arr, i -> i % 33);
293+
// the cycle is too long and the values are bit-packed
294+
doTestOrdinals(arr, 97);
295+
}
296+
297+
public void testEncodeOrdinalsAlmostCycle() throws IOException {
298+
long[] arr = new long[blockSize];
299+
Arrays.setAll(arr, i -> i % 3);
300+
arr[arr.length - 1] = 4;
301+
doTestOrdinals(arr, 49);
302+
}
303+
304+
public void testEncodeOrdinalsDifferentCycles() throws IOException {
305+
long[] arr = new long[blockSize];
306+
Arrays.setAll(arr, i -> i > 64 ? i % 4 : i % 3);
307+
doTestOrdinals(arr, 33);
308+
}
309+
263310
private void doTestOrdinals(long[] arr, long expectedNumBytes) throws IOException {
264311
long maxOrd = 0;
265312
for (long ord : arr) {
266313
maxOrd = Math.max(maxOrd, ord);
267314
}
268-
final int bitsPerOrd = PackedInts.bitsRequired(maxOrd - 1);
315+
final int bitsPerOrd = PackedInts.bitsRequired(maxOrd);
269316
final long[] expected = arr.clone();
270317
try (Directory dir = newDirectory()) {
271318
try (IndexOutput out = dir.createOutput("tests.bin", IOContext.DEFAULT)) {

0 commit comments

Comments
 (0)