Skip to content

Commit 8d2ed1e

Browse files
fix(NODE-6123): toUtf8 validation insufficiently strict
1 parent c58d1e2 commit 8d2ed1e

File tree

6 files changed

+139
-57
lines changed

6 files changed

+139
-57
lines changed

etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export class RequireVendor {
1414
* @returns {{ code: string; map: import('magic-string').SourceMap }}
1515
*/
1616
transform(code, id) {
17-
if (!id.includes('web_byte_utils')) {
17+
if (!id.includes('validate_utf8')) {
1818
return;
1919
}
2020

src/parser/deserializer.ts

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
1616
import { Timestamp } from '../timestamp';
1717
import { ByteUtils } from '../utils/byte_utils';
1818
import { NumberUtils } from '../utils/number_utils';
19-
import { validateUtf8 } from '../validate_utf8';
2019

2120
/** @public */
2221
export interface DeserializeOptions {
@@ -604,12 +603,12 @@ function deserializeObject(
604603
)
605604
throw new BSONError('bad string length in bson');
606605
// Namespace
607-
if (validation != null && validation.utf8) {
608-
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
609-
throw new BSONError('Invalid UTF-8 string in BSON document');
610-
}
611-
}
612-
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
606+
const namespace = ByteUtils.toUTF8(
607+
buffer,
608+
index,
609+
index + stringSize - 1,
610+
validation != null && (validation.utf8 as boolean)
611+
);
613612
// Update parse index position
614613
index = index + stringSize;
615614

src/utils/node_byte_utils.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ export const nodeJsByteUtils = {
139139
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
140140
for (let i = 0; i < string.length; i++) {
141141
if (string.charCodeAt(i) === 0xfffd) {
142-
if (!validateUtf8(buffer, start, end)) {
142+
if (!validateUtf8(buffer, start, end, fatal)) {
143143
throw new BSONError('Invalid UTF-8 string in BSON document');
144144
}
145145
break;

src/utils/web_byte_utils.ts

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { BSONError } from '../error';
22
import { tryReadBasicLatin } from './latin';
3+
import { validateUtf8 } from '../validate_utf8';
34

45
type TextDecoder = {
56
readonly encoding: string;
@@ -179,14 +180,7 @@ export const webByteUtils = {
179180
return basicLatin;
180181
}
181182

182-
if (fatal) {
183-
try {
184-
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
185-
} catch (cause) {
186-
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
187-
}
188-
}
189-
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
183+
return validateUtf8(uint8array, start, end, fatal);
190184
},
191185

192186
utf8ByteLength(input: string): number {

src/validate_utf8.ts

Lines changed: 32 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,26 @@
1-
const FIRST_BIT = 0x80;
2-
const FIRST_TWO_BITS = 0xc0;
3-
const FIRST_THREE_BITS = 0xe0;
4-
const FIRST_FOUR_BITS = 0xf0;
5-
const FIRST_FIVE_BITS = 0xf8;
1+
import { BSONError } from './error';
62

7-
const TWO_BIT_CHAR = 0xc0;
8-
const THREE_BIT_CHAR = 0xe0;
9-
const FOUR_BIT_CHAR = 0xf0;
10-
const CONTINUING_CHAR = 0x80;
3+
type TextDecoder = {
4+
readonly encoding: string;
5+
readonly fatal: boolean;
6+
readonly ignoreBOM: boolean;
7+
decode(input?: Uint8Array): string;
8+
};
9+
type TextDecoderConstructor = {
10+
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
11+
};
12+
13+
type TextEncoder = {
14+
readonly encoding: string;
15+
encode(input?: string): Uint8Array;
16+
};
17+
type TextEncoderConstructor = {
18+
new (): TextEncoder;
19+
};
20+
21+
// Node byte utils global
22+
declare const TextDecoder: TextDecoderConstructor;
23+
declare const TextEncoder: TextEncoderConstructor;
1124

1225
/**
1326
* Determines if the passed in bytes are valid utf8
@@ -16,32 +29,17 @@ const CONTINUING_CHAR = 0x80;
1629
* @param end - The index to end validating
1730
*/
1831
export function validateUtf8(
19-
bytes: { [index: number]: number },
32+
buffer: Uint8Array,
2033
start: number,
21-
end: number
22-
): boolean {
23-
let continuation = 0;
24-
25-
for (let i = start; i < end; i += 1) {
26-
const byte = bytes[i];
27-
28-
if (continuation) {
29-
if ((byte & FIRST_TWO_BITS) !== CONTINUING_CHAR) {
30-
return false;
31-
}
32-
continuation -= 1;
33-
} else if (byte & FIRST_BIT) {
34-
if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
35-
continuation = 1;
36-
} else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
37-
continuation = 2;
38-
} else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
39-
continuation = 3;
40-
} else {
41-
return false;
42-
}
34+
end: number,
35+
fatal: boolean
36+
): string {
37+
if (fatal) {
38+
try {
39+
return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
40+
} catch (cause) {
41+
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
4342
}
4443
}
45-
46-
return !continuation;
44+
return new TextDecoder('utf8', { fatal }).decode(buffer.slice(start, end));
4745
}

test/node/byte_utils.test.ts

Lines changed: 97 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
88
import * as sinon from 'sinon';
99
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
1010
import * as crypto from 'node:crypto';
11+
import { BSONError } from '../../src/error';
1112

1213
type ByteUtilTest<K extends keyof ByteUtils> = {
1314
name: string;
@@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
399400
}
400401
}
401402
];
403+
402404
const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
403405
{
404406
name: 'should create utf8 string from buffer input',
@@ -417,21 +419,57 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
417419
}
418420
},
419421
{
420-
name: 'should throw an error if fatal is set and string is invalid',
422+
name: 'should insert replacement character fatal is false and string is invalid',
423+
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
424+
expectation({ error, output }) {
425+
expect(error).to.not.exist;
426+
expect(output).to.equal('abc\uFFFD');
427+
}
428+
},
429+
{
430+
name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point',
421431
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
422432
expectation({ error }) {
423433
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
424434
}
425435
},
426436
{
427-
name: 'should insert replacement character fatal is false and string is invalid',
428-
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
429-
expectation({ error, output }) {
430-
expect(error).to.not.exist;
431-
expect(output).to.equal('abc\uFFFD');
437+
name: 'throw an error if fatal is set and string contains overlong encoding',
438+
inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
439+
expectation({ error }) {
440+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
441+
}
442+
},
443+
{
444+
name: 'throw an error if fatal is set and string contains invalid bytes',
445+
inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
446+
expectation({ error }) {
447+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
448+
}
449+
},
450+
{
451+
name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
452+
inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
453+
expectation({ error }) {
454+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
455+
}
456+
},
457+
{
458+
name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character',
459+
inputs: [Buffer.from('c000', 'hex'), 0, 2, true],
460+
expectation({ error }) {
461+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
462+
}
463+
},
464+
{
465+
name: 'throw an error if fatal is set and string ends before the end of the character',
466+
inputs: [Buffer.from('c0', 'hex'), 0, 1, true],
467+
expectation({ error }) {
468+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
432469
}
433470
}
434471
];
472+
435473
const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
436474
{
437475
name: 'should return zero for empty string',
@@ -493,6 +531,51 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
493531
}
494532
];
495533

534+
// extra error cases copied from Web platform specs
535+
const toUTF8ErrorCaseTests = [
536+
{ input: [0xff], name: 'invalid code' },
537+
{ input: [0xc0], name: 'ends early' },
538+
{ input: [0xe0], name: 'ends early 2' },
539+
{ input: [0xc0, 0x00], name: 'invalid trail' },
540+
{ input: [0xc0, 0xc0], name: 'invalid trail 2' },
541+
{ input: [0xe0, 0x00], name: 'invalid trail 3' },
542+
{ input: [0xe0, 0xc0], name: 'invalid trail 4' },
543+
{ input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
544+
{ input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
545+
{ input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
546+
{ input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
547+
548+
// Overlong encodings
549+
{ input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
550+
{ input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
551+
{ input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
552+
{ input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
553+
{ input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },
554+
555+
{ input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
556+
{ input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
557+
{ input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
558+
{ input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
559+
{ input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 6 bytes' },
560+
561+
{ input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
562+
{ input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
563+
{ input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
564+
{ input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 6 bytes' },
565+
566+
{ input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
567+
{ input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
568+
{ input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 6 bytes' },
569+
570+
{ input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
571+
{ input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 6 bytes' },
572+
573+
// UTf-16 surrogates encoded as code points in UTf-8
574+
{ input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
575+
{ input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
576+
{ input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
577+
];
578+
496579
const utils = new Map([
497580
['nodeJsByteUtils', nodeJsByteUtils],
498581
['webByteUtils', webByteUtils]
@@ -798,6 +881,14 @@ describe('ByteUtils', () => {
798881
test.expectation({ web: byteUtilsName === 'webByteUtils', output, error });
799882
});
800883
}
884+
if (utility === 'toUTF8')
885+
for (const test of toUTF8ErrorCaseTests) {
886+
it(`throws error when fatal is set and provided ${test.name} as input`, () => {
887+
expect(() =>
888+
byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true)
889+
).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i);
890+
});
891+
}
801892
});
802893
}
803894
}

0 commit comments

Comments
 (0)