Skip to content

Commit 7adc175

Browse files
authored
Adjust isIdentifierText to skip multiple characters when a code point is multiple chars long (microsoft#32720)
* Adjust isIdentifierText to skip multiple characters when a code point is multiple chars long * Add a few examples with mixed unicode characters * for posterity, add some unicode cursive script characters * Test some more planes more explicitly
1 parent 624d1ca commit 7adc175

File tree

5 files changed

+278
-10
lines changed

5 files changed

+278
-10
lines changed

src/compiler/scanner.ts

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -832,12 +832,13 @@ namespace ts {
832832

833833
/* @internal */
834834
export function isIdentifierText(name: string, languageVersion: ScriptTarget | undefined): boolean {
835-
if (!isIdentifierStart(name.charCodeAt(0), languageVersion)) {
835+
let ch = codePointAt(name, 0);
836+
if (!isIdentifierStart(ch, languageVersion)) {
836837
return false;
837838
}
838839

839-
for (let i = 1; i < name.length; i++) {
840-
if (!isIdentifierPart(name.charCodeAt(i), languageVersion)) {
840+
for (let i = charSize(ch); i < name.length; i += charSize(ch)) {
841+
if (!isIdentifierPart(ch = codePointAt(name, i), languageVersion)) {
841842
return false;
842843
}
843844
}
@@ -1870,13 +1871,6 @@ namespace ts {
18701871
}
18711872
}
18721873

1873-
function charSize(ch: number) {
1874-
if (ch > 0x10000) {
1875-
return 2;
1876-
}
1877-
return 1;
1878-
}
1879-
18801874
function reScanGreaterToken(): SyntaxKind {
18811875
if (token === SyntaxKind.GreaterThanToken) {
18821876
if (text.charCodeAt(pos) === CharacterCodes.greaterThan) {
@@ -2238,4 +2232,12 @@ namespace ts {
22382232
}
22392233
return first;
22402234
};
2235+
2236+
/* @internal */
2237+
/**
 * Returns the number of UTF-16 code units (string index positions) that the
 * code point `ch` occupies, so callers can advance a string index by one
 * whole code point at a time.
 *
 * @param ch A Unicode code point (not a lone UTF-16 code unit).
 * @returns 2 when `ch` needs a surrogate pair (U+10000 and above), otherwise 1.
 */
function charSize(ch: number) {
    // The comparison is inclusive: U+10000 is itself the first code point
    // outside the BMP and is encoded as the surrogate pair D800 DC00.
    if (ch >= 0x10000) {
        return 2;
    }
    return 1;
}
22412243
}

tests/baselines/reference/extendedUnicodePlaneIdentifiers.js

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,64 @@
22
const 𝑚 = 4;
33
const 𝑀 = 5;
44
console.log(𝑀 + 𝑚); // 9
5+
6+
// lower 8 bits look like 'a'
7+
const ၡ = 6;
8+
console.log(ၡ ** ၡ);
9+
10+
// lower 8 bits aren't a valid unicode character
11+
const ဒ = 7;
12+
console.log(ဒ ** ဒ);
13+
14+
// a mix, for good measure
15+
const ဒၡ𝑀 = 7;
16+
console.log(ဒၡ𝑀 ** ဒၡ𝑀);
17+
18+
const ၡ𝑀ဒ = 7;
19+
console.log(ၡ𝑀ဒ ** ၡ𝑀ဒ);
20+
21+
const 𝑀ဒၡ = 7;
22+
console.log(𝑀ဒၡ ** 𝑀ဒၡ);
23+
24+
const 𝓱𝓮𝓵𝓵𝓸 = "𝔀𝓸𝓻𝓵𝓭";
25+
26+
const Ɐⱱ = "ok"; // BMP
27+
28+
const 𓀸𓀹𓀺 = "ok"; // SMP
29+
30+
const 𡚭𡚮𡚯 = "ok"; // SIP
31+
32+
const 𡚭𓀺ⱱ𝓮 = "ok";
33+
34+
const 𓀺ⱱ𝓮𡚭 = "ok";
35+
36+
const ⱱ𝓮𡚭𓀺 = "ok";
37+
38+
const 𝓮𡚭𓀺ⱱ = "ok";
539

640

741
//// [extendedUnicodePlaneIdentifiers.js]
842
const 𝑚 = 4;
943
const 𝑀 = 5;
1044
console.log(𝑀 + 𝑚); // 9
45+
// lower 8 bits look like 'a'
46+
const ၡ = 6;
47+
console.log(ၡ ** ၡ);
48+
// lower 8 bits aren't a valid unicode character
49+
const ဒ = 7;
50+
console.log(ဒ ** ဒ);
51+
// a mix, for good measure
52+
const ဒၡ𝑀 = 7;
53+
console.log(ဒၡ𝑀 ** ဒၡ𝑀);
54+
const ၡ𝑀ဒ = 7;
55+
console.log(ၡ𝑀ဒ ** ၡ𝑀ဒ);
56+
const 𝑀ဒၡ = 7;
57+
console.log(𝑀ဒၡ ** 𝑀ဒၡ);
58+
const 𝓱𝓮𝓵𝓵𝓸 = "𝔀𝓸𝓻𝓵𝓭";
59+
const Ɐⱱ = "ok"; // BMP
60+
const 𓀸𓀹𓀺 = "ok"; // SMP
61+
const 𡚭𡚮𡚯 = "ok"; // SIP
62+
const 𡚭𓀺ⱱ𝓮 = "ok";
63+
const 𓀺ⱱ𝓮𡚭 = "ok";
64+
const ⱱ𝓮𡚭𓀺 = "ok";
65+
const 𝓮𡚭𓀺ⱱ = "ok";

tests/baselines/reference/extendedUnicodePlaneIdentifiers.symbols

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,80 @@ console.log(𝑀 + 𝑚); // 9
1212
>𝑀 : Symbol(𝑀, Decl(extendedUnicodePlaneIdentifiers.ts, 1, 5))
1313
>𝑚 : Symbol(𝑚, Decl(extendedUnicodePlaneIdentifiers.ts, 0, 5))
1414

15+
// lower 8 bits look like 'a'
16+
const ၡ = 6;
17+
>ၡ : Symbol(ၡ, Decl(extendedUnicodePlaneIdentifiers.ts, 5, 5))
18+
19+
console.log(ၡ ** ၡ);
20+
>console.log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
21+
>console : Symbol(console, Decl(lib.dom.d.ts, --, --))
22+
>log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
23+
>ၡ : Symbol(ၡ, Decl(extendedUnicodePlaneIdentifiers.ts, 5, 5))
24+
>ၡ : Symbol(ၡ, Decl(extendedUnicodePlaneIdentifiers.ts, 5, 5))
25+
26+
// lower 8 bits aren't a valid unicode character
27+
const ဒ = 7;
28+
>ဒ : Symbol(ဒ, Decl(extendedUnicodePlaneIdentifiers.ts, 9, 5))
29+
30+
console.log(ဒ ** ဒ);
31+
>console.log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
32+
>console : Symbol(console, Decl(lib.dom.d.ts, --, --))
33+
>log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
34+
>ဒ : Symbol(ဒ, Decl(extendedUnicodePlaneIdentifiers.ts, 9, 5))
35+
>ဒ : Symbol(ဒ, Decl(extendedUnicodePlaneIdentifiers.ts, 9, 5))
36+
37+
// a mix, for good measure
38+
const ဒၡ𝑀 = 7;
39+
>ဒၡ𝑀 : Symbol(ဒၡ𝑀, Decl(extendedUnicodePlaneIdentifiers.ts, 13, 5))
40+
41+
console.log(ဒၡ𝑀 ** ဒၡ𝑀);
42+
>console.log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
43+
>console : Symbol(console, Decl(lib.dom.d.ts, --, --))
44+
>log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
45+
>ဒၡ𝑀 : Symbol(ဒၡ𝑀, Decl(extendedUnicodePlaneIdentifiers.ts, 13, 5))
46+
>ဒၡ𝑀 : Symbol(ဒၡ𝑀, Decl(extendedUnicodePlaneIdentifiers.ts, 13, 5))
47+
48+
const ၡ𝑀ဒ = 7;
49+
>ၡ𝑀ဒ : Symbol(ၡ𝑀ဒ, Decl(extendedUnicodePlaneIdentifiers.ts, 16, 5))
50+
51+
console.log(ၡ𝑀ဒ ** ၡ𝑀ဒ);
52+
>console.log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
53+
>console : Symbol(console, Decl(lib.dom.d.ts, --, --))
54+
>log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
55+
>ၡ𝑀ဒ : Symbol(ၡ𝑀ဒ, Decl(extendedUnicodePlaneIdentifiers.ts, 16, 5))
56+
>ၡ𝑀ဒ : Symbol(ၡ𝑀ဒ, Decl(extendedUnicodePlaneIdentifiers.ts, 16, 5))
57+
58+
const 𝑀ဒၡ = 7;
59+
>𝑀ဒၡ : Symbol(𝑀ဒၡ, Decl(extendedUnicodePlaneIdentifiers.ts, 19, 5))
60+
61+
console.log(𝑀ဒၡ ** 𝑀ဒၡ);
62+
>console.log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
63+
>console : Symbol(console, Decl(lib.dom.d.ts, --, --))
64+
>log : Symbol(Console.log, Decl(lib.dom.d.ts, --, --))
65+
>𝑀ဒၡ : Symbol(𝑀ဒၡ, Decl(extendedUnicodePlaneIdentifiers.ts, 19, 5))
66+
>𝑀ဒၡ : Symbol(𝑀ဒၡ, Decl(extendedUnicodePlaneIdentifiers.ts, 19, 5))
67+
68+
const 𝓱𝓮𝓵𝓵𝓸 = "𝔀𝓸𝓻𝓵𝓭";
69+
>𝓱𝓮𝓵𝓵𝓸 : Symbol(𝓱𝓮𝓵𝓵𝓸, Decl(extendedUnicodePlaneIdentifiers.ts, 22, 5))
70+
71+
const Ɐⱱ = "ok"; // BMP
72+
>Ɐⱱ : Symbol(Ɐⱱ, Decl(extendedUnicodePlaneIdentifiers.ts, 24, 5))
73+
74+
const 𓀸𓀹𓀺 = "ok"; // SMP
75+
>𓀸𓀹𓀺 : Symbol(𓀸𓀹𓀺, Decl(extendedUnicodePlaneIdentifiers.ts, 26, 5))
76+
77+
const 𡚭𡚮𡚯 = "ok"; // SIP
78+
>𡚭𡚮𡚯 : Symbol(𡚭𡚮𡚯, Decl(extendedUnicodePlaneIdentifiers.ts, 28, 5))
79+
80+
const 𡚭𓀺ⱱ𝓮 = "ok";
81+
>𡚭𓀺ⱱ𝓮 : Symbol(𡚭𓀺ⱱ𝓮, Decl(extendedUnicodePlaneIdentifiers.ts, 30, 5))
82+
83+
const 𓀺ⱱ𝓮𡚭 = "ok";
84+
>𓀺ⱱ𝓮𡚭 : Symbol(𓀺ⱱ𝓮𡚭, Decl(extendedUnicodePlaneIdentifiers.ts, 32, 5))
85+
86+
const ⱱ𝓮𡚭𓀺 = "ok";
87+
>ⱱ𝓮𡚭𓀺 : Symbol(ⱱ𝓮𡚭𓀺, Decl(extendedUnicodePlaneIdentifiers.ts, 34, 5))
88+
89+
const 𝓮𡚭𓀺ⱱ = "ok";
90+
>𝓮𡚭𓀺ⱱ : Symbol(𝓮𡚭𓀺ⱱ, Decl(extendedUnicodePlaneIdentifiers.ts, 36, 5))
91+

tests/baselines/reference/extendedUnicodePlaneIdentifiers.types

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,103 @@ console.log(𝑀 + 𝑚); // 9
1616
>𝑀 : 5
1717
>𝑚 : 4
1818

19+
// lower 8 bits look like 'a'
20+
const ၡ = 6;
21+
>ၡ : 6
22+
>6 : 6
23+
24+
console.log(ၡ ** ၡ);
25+
>console.log(ၡ ** ၡ) : void
26+
>console.log : (message?: any, ...optionalParams: any[]) => void
27+
>console : Console
28+
>log : (message?: any, ...optionalParams: any[]) => void
29+
>ၡ ** ၡ : number
30+
>ၡ : 6
31+
>ၡ : 6
32+
33+
// lower 8 bits aren't a valid unicode character
34+
const ဒ = 7;
35+
>ဒ : 7
36+
>7 : 7
37+
38+
console.log(ဒ ** ဒ);
39+
>console.log(ဒ ** ဒ) : void
40+
>console.log : (message?: any, ...optionalParams: any[]) => void
41+
>console : Console
42+
>log : (message?: any, ...optionalParams: any[]) => void
43+
>ဒ ** ဒ : number
44+
>ဒ : 7
45+
>ဒ : 7
46+
47+
// a mix, for good measure
48+
const ဒၡ𝑀 = 7;
49+
>ဒၡ𝑀 : 7
50+
>7 : 7
51+
52+
console.log(ဒၡ𝑀 ** ဒၡ𝑀);
53+
>console.log(ဒၡ𝑀 ** ဒၡ𝑀) : void
54+
>console.log : (message?: any, ...optionalParams: any[]) => void
55+
>console : Console
56+
>log : (message?: any, ...optionalParams: any[]) => void
57+
>ဒၡ𝑀 ** ဒၡ𝑀 : number
58+
>ဒၡ𝑀 : 7
59+
>ဒၡ𝑀 : 7
60+
61+
const ၡ𝑀ဒ = 7;
62+
>ၡ𝑀ဒ : 7
63+
>7 : 7
64+
65+
console.log(ၡ𝑀ဒ ** ၡ𝑀ဒ);
66+
>console.log(ၡ𝑀ဒ ** ၡ𝑀ဒ) : void
67+
>console.log : (message?: any, ...optionalParams: any[]) => void
68+
>console : Console
69+
>log : (message?: any, ...optionalParams: any[]) => void
70+
>ၡ𝑀ဒ ** ၡ𝑀ဒ : number
71+
>ၡ𝑀ဒ : 7
72+
>ၡ𝑀ဒ : 7
73+
74+
const 𝑀ဒၡ = 7;
75+
>𝑀ဒၡ : 7
76+
>7 : 7
77+
78+
console.log(𝑀ဒၡ ** 𝑀ဒၡ);
79+
>console.log(𝑀ဒၡ ** 𝑀ဒၡ) : void
80+
>console.log : (message?: any, ...optionalParams: any[]) => void
81+
>console : Console
82+
>log : (message?: any, ...optionalParams: any[]) => void
83+
>𝑀ဒၡ ** 𝑀ဒၡ : number
84+
>𝑀ဒၡ : 7
85+
>𝑀ဒၡ : 7
86+
87+
const 𝓱𝓮𝓵𝓵𝓸 = "𝔀𝓸𝓻𝓵𝓭";
88+
>𝓱𝓮𝓵𝓵𝓸 : "𝔀𝓸𝓻𝓵𝓭"
89+
>"𝔀𝓸𝓻𝓵𝓭" : "𝔀𝓸𝓻𝓵𝓭"
90+
91+
const Ɐⱱ = "ok"; // BMP
92+
>Ɐⱱ : "ok"
93+
>"ok" : "ok"
94+
95+
const 𓀸𓀹𓀺 = "ok"; // SMP
96+
>𓀸𓀹𓀺 : "ok"
97+
>"ok" : "ok"
98+
99+
const 𡚭𡚮𡚯 = "ok"; // SIP
100+
>𡚭𡚮𡚯 : "ok"
101+
>"ok" : "ok"
102+
103+
const 𡚭𓀺ⱱ𝓮 = "ok";
104+
>𡚭𓀺ⱱ𝓮 : "ok"
105+
>"ok" : "ok"
106+
107+
const 𓀺ⱱ𝓮𡚭 = "ok";
108+
>𓀺ⱱ𝓮𡚭 : "ok"
109+
>"ok" : "ok"
110+
111+
const ⱱ𝓮𡚭𓀺 = "ok";
112+
>ⱱ𝓮𡚭𓀺 : "ok"
113+
>"ok" : "ok"
114+
115+
const 𝓮𡚭𓀺ⱱ = "ok";
116+
>𝓮𡚭𓀺ⱱ : "ok"
117+
>"ok" : "ok"
118+

tests/cases/compiler/extendedUnicodePlaneIdentifiers.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,37 @@
22
const 𝑚 = 4;
33
const 𝑀 = 5;
44
console.log(𝑀 + 𝑚); // 9
5+
6+
// lower 8 bits look like 'a'
7+
const ၡ = 6;
8+
console.log(ၡ ** ၡ);
9+
10+
// lower 8 bits aren't a valid unicode character
11+
const ဒ = 7;
12+
console.log(ဒ ** ဒ);
13+
14+
// a mix, for good measure
15+
const ဒၡ𝑀 = 7;
16+
console.log(ဒၡ𝑀 ** ဒၡ𝑀);
17+
18+
const ၡ𝑀ဒ = 7;
19+
console.log(ၡ𝑀ဒ ** ၡ𝑀ဒ);
20+
21+
const 𝑀ဒၡ = 7;
22+
console.log(𝑀ဒၡ ** 𝑀ဒၡ);
23+
24+
const 𝓱𝓮𝓵𝓵𝓸 = "𝔀𝓸𝓻𝓵𝓭";
25+
26+
const Ɐⱱ = "ok"; // BMP
27+
28+
const 𓀸𓀹𓀺 = "ok"; // SMP
29+
30+
const 𡚭𡚮𡚯 = "ok"; // SIP
31+
32+
const 𡚭𓀺ⱱ𝓮 = "ok";
33+
34+
const 𓀺ⱱ𝓮𡚭 = "ok";
35+
36+
const ⱱ𝓮𡚭𓀺 = "ok";
37+
38+
const 𝓮𡚭𓀺ⱱ = "ok";

0 commit comments

Comments
 (0)