Skip to content

Commit fb0e246

Browse files
committed
Add better explanations for combining characters and surrogate pairs
1 parent 5713c4d commit fb0e246

File tree

1 file changed

+28
-1
lines changed

1 file changed

+28
-1
lines changed

packages/firestore/test/integration/api/query.test.ts

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2201,16 +2201,43 @@ apiDescribe('Queries', persistence => {
22012201
// to send an existence filter.
22022202
// eslint-disable-next-line no-restricted-properties
22032203
(USE_EMULATOR ? it.skip : it)(
2204-
'bloom filter should correctly encode special unicode characters',
2204+
'bloom filter should correctly encode complex Unicode characters',
22052205
async () => {
2206+
// Firestore does not do any Unicode normalization on the document IDs.
2207+
// Therefore, two document IDs that are canonically equivalent (i.e. they
2208+
// visually appear identical) but are represented by a different sequence
2209+
// of Unicode code points are treated as distinct document IDs.
22062210
const testDocIds = [
22072211
'DocumentToDelete',
2212+
// The next two strings both end with "e" with an accent: the first uses
2213+
// the dedicated Unicode code point for this character, while the second
2214+
// uses the standard lowercase "e" followed by the accent combining
2215+
// character.
22082216
'LowercaseEWithAcuteAccent_\u00E9',
22092217
'LowercaseEWithAcuteAccent_\u0065\u0301',
2218+
// The next two strings both end with an "e" with two different accents
2219+
// applied via combining characters U+0301 and U+0327. The combining
2220+
// characters are specified in a different order and Firestore treats
2221+
// these document IDs as unique, despite the order of the combining
2222+
// characters being irrelevant.
22102223
'LowercaseEWithMultipleAccents_\u0065\u0301\u0327',
22112224
'LowercaseEWithMultipleAccents_\u0065\u0327\u0301',
2225+
// The next string contains a character outside the BMP (the "basic
2226+
// multilingual plane"); that is, its code point is greater than 0xFFFF.
2227+
// In UTF-16 (which JavaScript uses to store Unicode strings) this
2228+
// requires a surrogate pair, two 16-bit code units, to represent this
2229+
// character. Make sure that its presence is correctly tested in the
2230+
// bloom filter, which uses UTF-8 encoding.
22122231
'Smiley_\u{1F600}'
22132232
];
2233+
2234+
// Verify assumptions about the equivalence of strings in `testDocIds`.
2235+
expect(testDocIds[1].normalize()).equals(testDocIds[2].normalize());
2236+
expect(testDocIds[3].normalize()).equals(testDocIds[4].normalize());
2237+
expect(testDocIds[5]).equals('Smiley_\uD83D\uDE00');
2238+
2239+
// Create the mapping from document ID to document data for the document
2240+
// IDs specified in `testDocIds`.
22142241
const testDocs = testDocIds.reduce((map, docId) => {
22152242
map[docId] = { foo: 42 };
22162243
return map;

0 commit comments

Comments
 (0)