Skip to content

Commit cdb779b

Browse files
authored
feat(NODE-5861): optimize parsing basic latin strings (#642)
1 parent 44bec19 commit cdb779b

File tree

12 files changed

+275
-45
lines changed

12 files changed

+275
-45
lines changed

rollup.config.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ const tsConfig = {
1313
module: 'esnext',
1414
moduleResolution: 'node',
1515
removeComments: true,
16-
lib: ['es2021'],
16+
lib: ['es2021', 'ES2022.Error'],
1717
importHelpers: false,
1818
noEmitHelpers: false,
1919
noEmitOnError: true,

src/binary.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,8 @@ export class Binary extends BSONValue {
191191
if (encoding === 'hex') return ByteUtils.toHex(this.buffer);
192192
if (encoding === 'base64') return ByteUtils.toBase64(this.buffer);
193193
if (encoding === 'utf8' || encoding === 'utf-8')
194-
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
195-
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
194+
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
195+
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
196196
}
197197

198198
/** @internal */

src/error.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { BSON_MAJOR_VERSION } from './constants';
44
* @public
55
* @category Error
66
*
7-
* `BSONError` objects are thrown when BSON ecounters an error.
7+
* `BSONError` objects are thrown when BSON encounters an error.
88
*
99
* This is the parent class for all the other errors thrown by this library.
1010
*/
@@ -23,8 +23,8 @@ export class BSONError extends Error {
2323
return 'BSONError';
2424
}
2525

26-
constructor(message: string) {
27-
super(message);
26+
constructor(message: string, options?: { cause?: unknown }) {
27+
super(message, options);
2828
}
2929

3030
/**

src/parser/deserializer.ts

Lines changed: 10 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ function deserializeObject(
236236
if (i >= buffer.byteLength) throw new BSONError('Bad BSON Document: illegal CString');
237237

238238
// Represents the key
239-
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i);
239+
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i, false);
240240

241241
// shouldValidateKey is true if the key should be validated, false otherwise
242242
let shouldValidateKey = true;
@@ -266,7 +266,7 @@ function deserializeObject(
266266
) {
267267
throw new BSONError('bad string length in bson');
268268
}
269-
value = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
269+
value = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
270270
index = index + stringSize;
271271
} else if (elementType === constants.BSON_DATA_OID) {
272272
const oid = ByteUtils.allocate(12);
@@ -476,7 +476,7 @@ function deserializeObject(
476476
// If we are at the end of the buffer there is a problem with the document
477477
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
478478
// Return the C string
479-
const source = ByteUtils.toUTF8(buffer, index, i);
479+
const source = ByteUtils.toUTF8(buffer, index, i, false);
480480
// Create the regexp
481481
index = i + 1;
482482

@@ -489,7 +489,7 @@ function deserializeObject(
489489
// If we are at the end of the buffer there is a problem with the document
490490
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
491491
// Return the C string
492-
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
492+
const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
493493
index = i + 1;
494494

495495
// For each option add the corresponding one for javascript
@@ -521,7 +521,7 @@ function deserializeObject(
521521
// If we are at the end of the buffer there is a problem with the document
522522
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
523523
// Return the C string
524-
const source = ByteUtils.toUTF8(buffer, index, i);
524+
const source = ByteUtils.toUTF8(buffer, index, i, false);
525525
index = i + 1;
526526

527527
// Get the start search index
@@ -533,7 +533,7 @@ function deserializeObject(
533533
// If we are at the end of the buffer there is a problem with the document
534534
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
535535
// Return the C string
536-
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
536+
const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
537537
index = i + 1;
538538

539539
// Set the object
@@ -551,7 +551,7 @@ function deserializeObject(
551551
) {
552552
throw new BSONError('bad string length in bson');
553553
}
554-
const symbol = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
554+
const symbol = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
555555
value = promoteValues ? symbol : new BSONSymbol(symbol);
556556
index = index + stringSize;
557557
} else if (elementType === constants.BSON_DATA_TIMESTAMP) {
@@ -587,7 +587,7 @@ function deserializeObject(
587587
) {
588588
throw new BSONError('bad string length in bson');
589589
}
590-
const functionString = getValidatedString(
590+
const functionString = ByteUtils.toUTF8(
591591
buffer,
592592
index,
593593
index + stringSize - 1,
@@ -626,7 +626,7 @@ function deserializeObject(
626626
}
627627

628628
// Javascript function
629-
const functionString = getValidatedString(
629+
const functionString = ByteUtils.toUTF8(
630630
buffer,
631631
index,
632632
index + stringSize - 1,
@@ -678,7 +678,7 @@ function deserializeObject(
678678
throw new BSONError('Invalid UTF-8 string in BSON document');
679679
}
680680
}
681-
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1);
681+
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
682682
// Update parse index position
683683
index = index + stringSize;
684684

@@ -728,24 +728,3 @@ function deserializeObject(
728728

729729
return object;
730730
}
731-
732-
function getValidatedString(
733-
buffer: Uint8Array,
734-
start: number,
735-
end: number,
736-
shouldValidateUtf8: boolean
737-
) {
738-
const value = ByteUtils.toUTF8(buffer, start, end);
739-
// if utf8 validation is on, do the check
740-
if (shouldValidateUtf8) {
741-
for (let i = 0; i < value.length; i++) {
742-
if (value.charCodeAt(i) === 0xfffd) {
743-
if (!validateUtf8(buffer, start, end)) {
744-
throw new BSONError('Invalid UTF-8 string in BSON document');
745-
}
746-
break;
747-
}
748-
}
749-
}
750-
return value;
751-
}

src/utils/byte_utils.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ export type ByteUtils = {
2525
toHex: (buffer: Uint8Array) => string;
2626
/** Create a Uint8Array containing utf8 code units from a string */
2727
fromUTF8: (text: string) => Uint8Array;
28-
/** Create a string from utf8 code units */
29-
toUTF8: (buffer: Uint8Array, start: number, end: number) => string;
28+
/** Create a string from utf8 code units. fatal=true will throw an error if the UTF-8 bytes are invalid; fatal=false will insert replacement characters instead. */
29+
toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string;
3030
/** Get the utf8 code unit count from a string if it were to be transformed to utf8 */
3131
utf8ByteLength: (input: string) => number;
3232
/** Encode UTF8 bytes generated from `source` string into `destination` at byteOffset. Returns the number of bytes encoded. */

src/utils/latin.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/**
2+
* This function is an optimization for small basic latin strings.
3+
* @internal
4+
* @remarks
5+
* ### Important characteristics:
6+
* - If the uint8array is empty or the distance between start and end is 0 this function returns an empty string
7+
* - If the byteLength of the string is 1, 2, or 3 we invoke String.fromCharCode and manually offset into the buffer
8+
* - If the byteLength of the string is less than or equal to 20 an array of bytes is built and `String.fromCharCode` is called with those bytes spread as arguments
9+
* - If any byte is greater than 127 (outside the basic latin range) this function returns null
10+
*
11+
* @param uint8array - A sequence of bytes that may contain basic latin characters
12+
* @param start - The start index from which to search the uint8array
13+
* @param end - The index to stop searching the uint8array
14+
* @returns string if all bytes are within the basic latin range, otherwise null
15+
*/
16+
export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null {
17+
if (uint8array.length === 0) {
18+
return '';
19+
}
20+
21+
const stringByteLength = end - start;
22+
if (stringByteLength === 0) {
23+
return '';
24+
}
25+
26+
if (stringByteLength > 20) {
27+
return null;
28+
}
29+
30+
if (stringByteLength === 1 && uint8array[start] < 128) {
31+
return String.fromCharCode(uint8array[start]);
32+
}
33+
34+
if (stringByteLength === 2 && uint8array[start] < 128 && uint8array[start + 1] < 128) {
35+
return String.fromCharCode(uint8array[start]) + String.fromCharCode(uint8array[start + 1]);
36+
}
37+
38+
if (
39+
stringByteLength === 3 &&
40+
uint8array[start] < 128 &&
41+
uint8array[start + 1] < 128 &&
42+
uint8array[start + 2] < 128
43+
) {
44+
return (
45+
String.fromCharCode(uint8array[start]) +
46+
String.fromCharCode(uint8array[start + 1]) +
47+
String.fromCharCode(uint8array[start + 2])
48+
);
49+
}
50+
51+
const latinBytes = [];
52+
for (let i = start; i < end; i++) {
53+
const byte = uint8array[i];
54+
if (byte > 127) {
55+
return null;
56+
}
57+
latinBytes.push(byte);
58+
}
59+
60+
return String.fromCharCode(...latinBytes);
61+
}

src/utils/node_byte_utils.ts

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { BSONError } from '../error';
2+
import { validateUtf8 } from '../validate_utf8';
3+
import { tryLatin } from './latin';
24

35
type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
46
type NodeJsBuffer = ArrayBufferView &
@@ -125,8 +127,25 @@ export const nodeJsByteUtils = {
125127
return Buffer.from(text, 'utf8');
126128
},
127129

128-
toUTF8(buffer: Uint8Array, start: number, end: number): string {
129-
return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
130+
toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
131+
const basicLatin = end - start <= 20 ? tryLatin(buffer, start, end) : null;
132+
if (basicLatin != null) {
133+
return basicLatin;
134+
}
135+
136+
const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
137+
if (fatal) {
138+
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
139+
for (let i = 0; i < string.length; i++) {
140+
if (string.charCodeAt(i) === 0xfffd) {
141+
if (!validateUtf8(buffer, start, end)) {
142+
throw new BSONError('Invalid UTF-8 string in BSON document');
143+
}
144+
break;
145+
}
146+
}
147+
}
148+
return string;
130149
},
131150

132151
utf8ByteLength(input: string): number {

src/utils/web_byte_utils.ts

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { BSONError } from '../error';
2+
import { tryLatin } from './latin';
23

34
type TextDecoder = {
45
readonly encoding: string;
@@ -172,8 +173,20 @@ export const webByteUtils = {
172173
return new TextEncoder().encode(text);
173174
},
174175

175-
toUTF8(uint8array: Uint8Array, start: number, end: number): string {
176-
return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end));
176+
toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string {
177+
const basicLatin = end - start <= 20 ? tryLatin(uint8array, start, end) : null;
178+
if (basicLatin != null) {
179+
return basicLatin;
180+
}
181+
182+
if (fatal) {
183+
try {
184+
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
185+
} catch (cause) {
186+
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
187+
}
188+
}
189+
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
177190
},
178191

179192
utf8ByteLength(input: string): number {

test/node/byte_utils.test.ts

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,19 +400,34 @@ const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [
400400
const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
401401
{
402402
name: 'should create utf8 string from buffer input',
403-
inputs: [Buffer.from('abc\u{1f913}', 'utf8')],
403+
inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7, false],
404404
expectation({ output, error }) {
405405
expect(error).to.be.null;
406406
expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8').toString('utf8'));
407407
}
408408
},
409409
{
410410
name: 'should return empty string for empty buffer input',
411-
inputs: [Buffer.alloc(0)],
411+
inputs: [Buffer.alloc(0), 0, 1, false],
412412
expectation({ output, error }) {
413413
expect(error).to.be.null;
414414
expect(output).to.be.a('string').with.lengthOf(0);
415415
}
416+
},
417+
{
418+
name: 'should throw an error if fatal is set and string is invalid',
419+
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
420+
expectation({ error }) {
421+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
422+
}
423+
},
424+
{
425+
name: 'should insert replacement character fatal is false and string is invalid',
426+
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
427+
expectation({ error, output }) {
428+
expect(error).to.not.exist;
429+
expect(output).to.equal('abc\uFFFD');
430+
}
416431
}
417432
];
418433
const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
@@ -596,6 +611,29 @@ describe('ByteUtils', () => {
596611
});
597612
});
598613

614+
describe('toUTF8 basic latin optimization', () => {
615+
afterEach(() => {
616+
sinon.restore();
617+
});
618+
619+
context('Given a basic latin string', () => {
620+
it('should not invoke Buffer.toString', () => {
621+
const buffer = Buffer.from('abcdef', 'utf8');
622+
const spy = sinon.spy(buffer, 'toString');
623+
nodeJsByteUtils.toUTF8(buffer, 0, 6, false);
624+
expect(spy).to.not.have.been.called;
625+
});
626+
627+
it('should not invoke TextDecoder.decode', () => {
628+
const utf8Bytes = Buffer.from('abcdef', 'utf8');
629+
const buffer = new Uint8Array(utf8Bytes.buffer, utf8Bytes.byteOffset, utf8Bytes.byteLength);
630+
const spy = sinon.spy(TextDecoder.prototype, 'decode');
631+
webByteUtils.toUTF8(buffer, 0, 6, false);
632+
expect(spy).to.not.have.been.called;
633+
});
634+
});
635+
});
636+
599637
describe('randomBytes fallback case when crypto is not present', () => {
600638
describe('web', function () {
601639
let bsonWithNoCryptoCtx;

test/node/release.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ const REQUIRED_FILES = [
4646
'src/utils/byte_utils.ts',
4747
'src/utils/node_byte_utils.ts',
4848
'src/utils/web_byte_utils.ts',
49+
'src/utils/latin.ts',
4950
'src/validate_utf8.ts',
5051
'vendor/base64/base64.js',
5152
'vendor/base64/package.json',

0 commit comments

Comments
 (0)