Skip to content

Commit 76820ed

Browse files
committed
Mangling: support for special encoding of ASCII characters which may not appear in symbol names.
Such characters (like ‘.’) can be punycode-encoded just like non-ASCII Unicode characters.
1 parent 97f1fac commit 76820ed

File tree

3 files changed

+33
-6
lines changed

3 files changed

+33
-6
lines changed

include/swift/Basic/Punycode.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
// - Encoding digits are represented using [a-zA-J] instead of [a-z0-9], because
1919
// symbol names are case-sensitive, and Swift mangled identifiers cannot begin
2020
// with a digit.
21+
// - Optionally, non-symbol ASCII characters (characters except [$_a-zA-Z0-9])
22+
// are mapped to the code range 0xD800 - 0xD880 and are also encoded like
23+
// non-ASCII unicode characters.
2124
//
2225
//===----------------------------------------------------------------------===//
2326

@@ -45,7 +48,13 @@ bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
4548
bool decodePunycode(StringRef InputPunycode,
4649
std::vector<uint32_t> &OutCodePoints);
4750

48-
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
51+
/// Encodes a UTF-8 string into Punycode.
52+
///
53+
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
54+
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
55+
/// Returns false if \p InputUTF8 contains surrogate code points.
56+
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode,
57+
bool mapNonSymbolChars = false);
4958

5059
bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);
5160

lib/Basic/Punycode.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ static int digit_index(char value) {
4747
}
4848

4949
static bool isValidUnicodeScalar(uint32_t S) {
50-
return (S < 0xD800) || (S >= 0xE000 && S <= 0x1FFFFF);
50+
// Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
51+
// ASCII characters.
52+
return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
5153
}
5254

5355
// Section 6.1: Bias adaptation function
@@ -200,6 +202,8 @@ static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
200202
OutUTF8.clear();
201203
return false;
202204
}
205+
if (S >= 0xD800 && S < 0xD880)
206+
S -= 0xD800;
203207

204208
unsigned Bytes = 0;
205209
if (S < 0x80)

lib/Basic/PunycodeUTF8.cpp

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,32 @@ static bool isContinuationByte(uint8_t unit) {
1919
return (unit & 0xC0) == 0x80;
2020
}
2121

22+
static bool isValidSymbolChar(char ch) {
23+
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
24+
(ch >= '0' && ch <= '9') || ch == '_' || ch == '$';
25+
}
26+
2227
/// Reencode well-formed UTF-8 as UTF-32.
2328
///
2429
/// This entry point is only called from compiler-internal entry points, so does
2530
/// only minimal validation. In particular, it does *not* check for overlong
2631
/// encodings.
32+
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
33+
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
34+
/// Returns false if \p InputUTF8 contains surrogate code points.
2735
static bool convertUTF8toUTF32(StringRef InputUTF8,
28-
std::vector<uint32_t> &OutUTF32) {
36+
std::vector<uint32_t> &OutUTF32,
37+
bool mapNonSymbolChars) {
2938
auto ptr = InputUTF8.begin();
3039
auto end = InputUTF8.end();
3140
while (ptr < end) {
3241
uint8_t first = *ptr++;
3342
if (first < 0x80) {
34-
OutUTF32.push_back(first);
43+
if (isValidSymbolChar(first) || !mapNonSymbolChars) {
44+
OutUTF32.push_back(first);
45+
} else {
46+
OutUTF32.push_back((uint32_t)first + 0xD800);
47+
}
3548
} else if (first < 0xC0) {
3649
// Invalid continuation byte.
3750
return false;
@@ -75,11 +88,12 @@ static bool convertUTF8toUTF32(StringRef InputUTF8,
7588
}
7689

7790
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
78-
std::string &OutPunycode) {
91+
std::string &OutPunycode,
92+
bool mapNonSymbolChars) {
7993
std::vector<uint32_t> InputCodePoints;
8094
InputCodePoints.reserve(InputUTF8.size());
8195

82-
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints))
96+
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
8397
return false;
8498

8599
return encodePunycode(InputCodePoints, OutPunycode);

0 commit comments

Comments
 (0)