feat(NODE-6537): add support for binary vectors

nbbeeken · nbbeeken · commit adbf3e58ad81 · 2024-11-14T09:26:15.000-05:00
diff --git a/src/binary.ts b/src/binary.ts
@@ -4,6 +4,7 @@ import { BSONError } from './error';
 import { BSON_BINARY_SUBTYPE_UUID_NEW } from './constants';
 import { ByteUtils } from './utils/byte_utils';
 import { BSONValue } from './bson_value';
+import { NumberUtils } from './utils/number_utils';
 
 /** @public */
 export type BinarySequence = Uint8Array | number[];
@@ -22,6 +23,15 @@ export interface BinaryExtended {
   };
 }
 
+/**
+ * Creates a copy of the Uint8Array bytes.
+ *
+ * Always goes through `Uint8Array.prototype.slice`, regardless of any `slice`
+ * defined on the concrete type of `bytes`, so the result is backed by its own buffer.
+ *
+ * @param bytes - the bytes to copy from
+ * @param start - index of the first byte to include
+ * @param end - index one past the last byte to include
+ * @returns a new Uint8Array containing `bytes[start..end)`
+ */
+const copy = (bytes: Uint8Array, start: number, end: number): Uint8Array =>
+  Uint8Array.prototype.slice.call(bytes, start, end);
+
 /**
  * A class representation of the BSON Binary type.
  * @public
@@ -58,9 +68,18 @@ export class Binary extends BSONValue {
   static readonly SUBTYPE_COLUMN = 7;
   /** Sensitive BSON type */
   static readonly SUBTYPE_SENSITIVE = 8;
+  /** Vector BSON type: the Binary sub_type (9) that marks the bytes as a Binary Vector. */
+  static readonly SUBTYPE_VECTOR = 9;
   /** User BSON type */
   static readonly SUBTYPE_USER_DEFINED = 128;
 
+  /**
+   * d_type of a Binary Vector (subtype: 9).
+   *
+   * The d_type is the first byte of a vector's bytes and identifies the element
+   * encoding. The values are fixed by the BSON Binary Vector specification:
+   * signed 8-bit integers are 0x03, 32-bit floats are 0x27 and packed bits are 0x10.
+   */
+  static readonly VECTOR_TYPE = Object.freeze({
+    Int8: 0x03,
+    Float32: 0x27,
+    PackedBit: 0x10
+  } as const);
+
   buffer!: Uint8Array;
   sub_type!: number;
   position!: number;
@@ -272,6 +291,158 @@ export class Binary extends BSONValue {
     const subTypeArg = inspect(this.sub_type, options);
     return `Binary.createFromBase64(${base64Arg}, ${subTypeArg})`;
   }
+
+  /**
+   * Reads this Binary as an Int8 Vector.
+   *
+   * @returns a new Int8Array over a copy of the element bytes, i.e. everything
+   * after the two byte header (d_type, padding) up to `position`
+   * @throws BSONError if sub_type is not Vector or the d_type byte is not Int8
+   */
+  public toInt8Array(): Int8Array {
+    const isVector = this.sub_type === Binary.SUBTYPE_VECTOR;
+    if (!isVector) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+
+    // A missing d_type byte (empty buffer) is treated as 0.
+    const dType = this.buffer[0] ?? 0;
+    if (dType !== Binary.VECTOR_TYPE.Int8) {
+      throw new BSONError('Binary d_type field is not Int8');
+    }
+
+    const elementBytes = copy(this.buffer, 2, this.position);
+    return new Int8Array(elementBytes.buffer);
+  }
+
+  /**
+   * If this Binary represents a Float32 Vector,
+   * returns a copy of the bytes in a new Float32Array.
+   *
+   * The element bytes are everything after the two byte header (d_type, padding)
+   * up to `position`. They are stored little endian, so on a big endian host each
+   * 4-byte group of the copy is reversed before it is viewed as floats.
+   *
+   * @returns a new Float32Array backed by its own copy of the element bytes
+   * @throws BSONError if sub_type is not Vector or the d_type byte is not Float32
+   * @throws RangeError from the Float32Array constructor if the element byte count
+   * is not a multiple of 4
+   */
+  public toFloat32Array(): Float32Array {
+    if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+
+    if ((this.buffer[0] ?? 0) !== Binary.VECTOR_TYPE.Float32) {
+      throw new BSONError('Binary d_type field is not Float32');
+    }
+
+    const bytes = copy(this.buffer, 2, this.position);
+    if (NumberUtils.isBigEndian) {
+      // Reverse every complete 4-byte group in place: [b0, b1, b2, b3] -> [b3, b2, b1, b0].
+      for (let i = 0; i + 3 < bytes.length; i += 4) {
+        const b0 = bytes[i];
+        const b1 = bytes[i + 1];
+        bytes[i] = bytes[i + 3];
+        bytes[i + 1] = bytes[i + 2];
+        bytes[i + 2] = b1;
+        bytes[i + 3] = b0;
+      }
+    }
+    return new Float32Array(bytes.buffer);
+  }
+
+  /**
+   * Reads this Binary as a packed bit Vector without unpacking it.
+   *
+   * Use `toBits` to get the unpacked bits.
+   *
+   * @returns a copy of the packed bytes, i.e. everything after the two byte
+   * header (d_type, padding) up to `position`
+   * @throws BSONError if sub_type is not Vector or the d_type byte is not packed bit
+   */
+  public toPackedBits(): Uint8Array {
+    const isVector = this.sub_type === Binary.SUBTYPE_VECTOR;
+    if (!isVector) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+
+    // A missing d_type byte (empty buffer) is treated as 0.
+    const dType = this.buffer[0] ?? 0;
+    if (dType !== Binary.VECTOR_TYPE.PackedBit) {
+      throw new BSONError('Binary d_type field is not packed bit');
+    }
+
+    const packed = copy(this.buffer, 2, this.position);
+    return packed;
+  }
+
+  /**
+   * If this Binary represents a Packed bit Vector,
+   * returns a copy of the bits unpacked into a new Int8Array.
+   *
+   * Each packed byte yields eight entries, most significant bit first. The padding
+   * byte (second header byte) is the number of trailing bits of the final byte
+   * that are not part of the vector; those bits are not included in the result.
+   *
+   * @returns a new Int8Array holding one 0 or 1 per bit of the vector
+   * @throws BSONError if sub_type is not Vector or the d_type byte is not packed bit
+   */
+  public toBits(): Int8Array {
+    if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+
+    if ((this.buffer[0] ?? 0) !== Binary.VECTOR_TYPE.PackedBit) {
+      throw new BSONError('Binary d_type field is not packed bit');
+    }
+
+    const byteCount = this.length() - 2;
+    const bitCount = byteCount * 8 - this.buffer[1];
+    const bits = new Int8Array(bitCount);
+
+    for (let bitOffset = 0; bitOffset < bits.length; bitOffset++) {
+      // Bit n lives in packed byte floor(n / 8); the +2 skips the header bytes.
+      const byte = this.buffer[Math.floor(bitOffset / 8) + 2];
+      // Bits are stored most significant first within each byte.
+      const shift = 7 - (bitOffset % 8);
+      bits[bitOffset] = (byte >> shift) & 1;
+    }
+    return bits;
+  }
+
+  /**
+   * Constructs a Binary representing an Int8 Vector.
+   *
+   * The result is a two byte header (d_type Int8, padding 0) followed by a copy
+   * of the array's bytes.
+   *
+   * @param array - The Int8Array whose bytes are copied into the new Binary
+   * @returns a new Binary with sub_type Vector
+   */
+  public static fromInt8Array(array: Int8Array): Binary {
+    const headerLength = 2;
+    const bytes = ByteUtils.allocate(headerLength + array.byteLength);
+
+    bytes[0] = Binary.VECTOR_TYPE.Int8;
+    bytes[1] = 0;
+
+    // View exactly the bytes covered by `array`, which may be a window into a larger buffer.
+    bytes.set(new Uint8Array(array.buffer, array.byteOffset, array.byteLength), headerLength);
+
+    return new this(bytes, this.SUBTYPE_VECTOR);
+  }
+
+  /**
+   * Constructs a Binary representing a Float32 Vector.
+   *
+   * The result is a two byte header (d_type Float32, padding 0) followed by a copy
+   * of the array's bytes in little endian order. The input array is never modified.
+   *
+   * @param array - The Float32Array whose elements are copied into the new Binary
+   * @returns a new Binary with sub_type Vector
+   */
+  public static fromFloat32Array(array: Float32Array): Binary {
+    const buffer = ByteUtils.allocate(array.byteLength + 2);
+    buffer[0] = Binary.VECTOR_TYPE.Float32;
+    buffer[1] = 0;
+
+    // View exactly the bytes covered by `array`, which may be a window into a larger buffer.
+    const floatBytes = new Uint8Array(array.buffer, array.byteOffset, array.byteLength);
+    buffer.set(floatBytes, 2);
+
+    if (NumberUtils.isBigEndian) {
+      // Reverse every 4-byte element in the copy (never in the caller's array);
+      // elements start after the two header bytes.
+      for (let i = 2; i + 3 < buffer.length; i += 4) {
+        const b0 = buffer[i];
+        const b1 = buffer[i + 1];
+        buffer[i] = buffer[i + 3];
+        buffer[i + 1] = buffer[i + 2];
+        buffer[i + 2] = b1;
+        buffer[i + 3] = b0;
+      }
+    }
+    return new this(buffer, this.SUBTYPE_VECTOR);
+  }
+
+  /**
+   * Constructs a Binary representing a packed bit Vector from already packed bytes.
+   *
+   * Use `fromBits` to pack an array of 1s and 0s.
+   *
+   * @param array - The packed bytes, copied after the two byte header
+   * @param padding - The value written to the padding header byte, defaults to 0
+   * @returns a new Binary with sub_type Vector
+   */
+  public static fromPackedBits(array: Uint8Array, padding = 0): Binary {
+    const headerLength = 2;
+    const bytes = ByteUtils.allocate(headerLength + array.byteLength);
+
+    bytes[0] = Binary.VECTOR_TYPE.PackedBit;
+    bytes[1] = padding;
+    bytes.set(array, headerLength);
+
+    return new this(bytes, this.SUBTYPE_VECTOR);
+  }
+
+  /**
+   * Constructs a Binary representing a Packed Bit Vector.
+   *
+   * Bits are packed eight per byte, most significant bit first. Any truthy entry
+   * counts as 1. The padding header byte records how many bits of the final byte
+   * are unused.
+   *
+   * @param bits - The array of 1s and 0s to pack into the Binary instance
+   * @returns a new Binary with sub_type Vector
+   */
+  public static fromBits(bits: ArrayLike<number>): Binary {
+    const bitTotal = bits.length;
+    const packed = new Uint8Array(Math.ceil(bitTotal / 8) + 2);
+
+    packed[0] = Binary.VECTOR_TYPE.PackedBit;
+
+    const usedInLastByte = bitTotal % 8;
+    if (usedInLastByte === 0) {
+      packed[1] = 0;
+    } else {
+      packed[1] = 8 - usedInLastByte;
+    }
+
+    for (let index = 0; index < bitTotal; index++) {
+      // Bytes start zeroed, so only set bits need to be written.
+      if (!bits[index]) continue;
+
+      // 0x80 >> k selects bit (7 - k), i.e. most significant bit first.
+      packed[2 + Math.floor(index / 8)] |= 0x80 >> (index % 8);
+    }
+
+    return new this(packed, Binary.SUBTYPE_VECTOR);
+  }
 }
 
 /** @public */
diff --git a/src/utils/number_utils.ts b/src/utils/number_utils.ts
@@ -13,6 +13,8 @@ const isBigEndian = FLOAT_BYTES[7] === 0;
  * A collection of functions that get or set various numeric types and bit widths from a Uint8Array.
  */
 export type NumberUtils = {
+  /** Is true if the current system is big endian. */
+  isBigEndian: boolean;
   /**
    * Parses a signed int32 at offset. Throws a `RangeError` if value is negative.
    */
@@ -35,6 +37,8 @@ export type NumberUtils = {
  * @public
  */
 export const NumberUtils: NumberUtils = {
+  isBigEndian,
+
   getNonnegativeInt32LE(source: Uint8Array, offset: number): number {
     if (source[offset + 3] > 127) {
       throw new RangeError(`Size cannot be negative at offset: ${offset}`);
diff --git a/test/node/binary.test.ts b/test/node/binary.test.ts