@@ -182,35 +182,57 @@ void encode(long[] in, DataOutput out) throws IOException {
182
182
* Optimizes for encoding sorted fields where we expect a block to mostly either be the same value
183
183
* or to make a transition from one value to a second one.
184
184
* <p>
185
- * Encodes blocks in the following format :
185
+ * The header is a vlong whose number of trailing one bits selects the encoding strategy:
186
186
* <ul>
187
- * <li>byte 0: 1/2 bits header+6/7 bits data</li>
188
- * <li>byte 1..n: data</li>
189
- * </ul>
190
- * The header (first 1 or 2 bits) describes how the data is encoded:
191
- * <ul>
192
- * <li>?0 block has a single value (vlong), 2nd bit already contains data</li>
193
- * <li>
194
- * 01 block has two runs, data contains value 1 (vlong), run-length (vint) of value 1,
195
- * and delta from first to second value (zlong)
196
- * </li>
197
- * <li>11 block is bit-packed</li>
187
+ * <li>0: single run</li>
188
+ * <li>1: two runs</li>
189
+ * <li>2: bit-packed</li>
190
+ * <li>3: cycle (the first {@code cycleLength} values repeat across the whole block)</li>
198
191
* </ul>
199
192
*/
200
193
void encodeOrdinals (long [] in , DataOutput out , int bitsPerOrd ) throws IOException {
201
194
assert in .length == ES87TSDBDocValuesFormat .NUMERIC_BLOCK_SIZE ;
202
195
int numRuns = 1 ;
196
+ long firstValue = in [0 ];
197
+ long previousValue = firstValue ;
198
+ boolean cyclic = false ;
199
+ int cycleLength = 0 ;
203
200
for (int i = 1 ; i < in .length ; ++i ) {
204
- if (in [i - 1 ] != in [i ]) {
201
+ long currentValue = in [i ];
202
+ if (previousValue != currentValue ) {
205
203
numRuns ++;
206
204
}
205
+ if (currentValue == firstValue && cycleLength != -1 ) {
206
+ if (cycleLength == 0 ) {
207
+ // first candidate cycle detected
208
+ cycleLength = i ;
209
+ } else if (cycleLength == 1 || i % cycleLength != 0 ) {
210
+ // if the first two values are the same this isn't a cycle (though it might be a run);
211
+ // this also isn't a cycle if the index of the next occurrence of the first value
212
+ // isn't a multiple of the candidate cycle length
213
+ // we can stop looking for cycles now
214
+ cycleLength = -1 ;
215
+ }
216
+ }
217
+ previousValue = currentValue ;
218
+ }
219
+ // if the cycle is too long, bit-packing may be more space efficient
220
+ int maxCycleLength = in .length / 4 ;
221
+ if (numRuns > 2 && cycleLength > 1 && cycleLength <= maxCycleLength ) {
222
+ cyclic = true ;
223
+ for (int i = cycleLength ; i < in .length ; ++i ) {
224
+ if (in [i ] != in [i - cycleLength ]) {
225
+ cyclic = false ;
226
+ break ;
227
+ }
228
+ }
207
229
}
208
230
if (numRuns == 1 && bitsPerOrd < 63 ) {
209
231
long value = in [0 ];
210
- // set first bit to 0 to indicate the block has a single run
232
+ // leave the lowest bit unset (0 trailing ones) to indicate the block has a single run
211
233
out .writeVLong (value << 1 );
212
234
} else if (numRuns == 2 && bitsPerOrd < 62 ) {
213
- // set first two bits to 01 to indicate the block has two runs
235
+ // set 1 trailing one bit (header 0b01) to indicate the block has two runs
214
236
out .writeVLong ((in [0 ] << 2 ) | 0b01);
215
237
int firstRunLen = in .length ;
216
238
for (int i = 1 ; i < in .length ; ++i ) {
@@ -221,8 +243,15 @@ void encodeOrdinals(long[] in, DataOutput out, int bitsPerOrd) throws IOExceptio
221
243
}
222
244
out .writeVInt (firstRunLen );
223
245
out .writeZLong (in [in .length - 1 ] - in [0 ]);
246
+ } else if (cyclic ) {
247
+ // set 3 trailing one bits (header 0b0111) to indicate the block cycles through the same values
248
+ long headerAndCycleLength = ((long ) cycleLength << 4 ) | 0b0111;
249
+ out .writeVLong (headerAndCycleLength );
250
+ for (int i = 0 ; i < cycleLength ; i ++) {
251
+ out .writeVLong (in [i ]);
252
+ }
224
253
} else {
225
- // set first two bits to 11 to indicate the block is bit-packed
254
+ // set 2 trailing one bits (header 0b11) to indicate the block is bit-packed
226
255
out .writeVLong (0b11);
227
256
forUtil .encode (in , bitsPerOrd , out );
228
257
}
@@ -232,20 +261,32 @@ void decodeOrdinals(DataInput in, long[] out, int bitsPerOrd) throws IOException
232
261
assert out .length == ES87TSDBDocValuesFormat .NUMERIC_BLOCK_SIZE : out .length ;
233
262
234
263
long v1 = in .readVLong ();
235
- int header = ( int ) ( v1 & 0b11L );
236
- if ( header == 0b00 || header == 0b10) {
237
- // first bit is zero -> single run
238
- Arrays . fill ( out , v1 >>> 1 );
239
- } else if ( header == 0b01) {
240
- // first two bits are 01 -> two runs
241
- v1 = v1 >>> 2 ;
264
+ int encoding = Long . numberOfTrailingZeros (~ v1 );
265
+ v1 >>>= encoding + 1 ;
266
+ if ( encoding == 0 ) {
267
+ // single run
268
+ Arrays . fill ( out , v1 );
269
+ } else if ( encoding == 1 ) {
270
+ // two runs
242
271
int runLen = in .readVInt ();
243
272
long v2 = v1 + in .readZLong ();
244
273
Arrays .fill (out , 0 , runLen , v1 );
245
274
Arrays .fill (out , runLen , out .length , v2 );
246
- } else {
247
- // first two bits are 11 -> bit-packed
275
+ } else if ( encoding == 2 ) {
276
+ // bit-packed
248
277
forUtil .decode (bitsPerOrd , in , out );
278
+ } else if (encoding == 3 ) {
279
+ // cycle encoding
280
+ int cycleLength = (int ) v1 ;
281
+ for (int i = 0 ; i < cycleLength ; i ++) {
282
+ out [i ] = in .readVLong ();
283
+ }
284
+ int length = cycleLength ;
285
+ while (length < out .length ) {
286
+ int copyLength = Math .min (length , out .length - length );
287
+ System .arraycopy (out , 0 , out , length , copyLength );
288
+ length += copyLength ;
289
+ }
249
290
}
250
291
}
251
292
0 commit comments