Skip to content

Commit ebc7830

Browse files
[libc] Change ctype to be encoding independent
The previous implementation of the ctype functions assumed ASCII. This patch changes them to a switch/case implementation that looks odd, but is actually easier for the compiler to understand and optimize.
1 parent b3b6141 commit ebc7830

File tree

1 file changed

+275
-23
lines changed

1 file changed

+275
-23
lines changed

libc/src/__support/ctype_utils.h

Lines changed: 275 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,44 +15,296 @@
1515
namespace LIBC_NAMESPACE_DECL {
1616
namespace internal {
1717

18-
// ------------------------------------------------------
19-
// Rationale: Since these classification functions are
20-
// called in other functions, we will avoid the overhead
21-
// of a function call by inlining them.
22-
// ------------------------------------------------------
18+
// -----------------------------------------------------------------------------
19+
// ****************** WARNING ******************
20+
// ****************** DO NOT TRY TO OPTIMIZE THESE FUNCTIONS! ******************
21+
// -----------------------------------------------------------------------------
22+
// This switch/case form is easier for the compiler to understand, and is
23+
// optimized into a form that is almost always the same as or better than
24+
// versions written by hand (see https://godbolt.org/z/qvrebqvvr). Also this
25+
// form makes these functions encoding independent. If you want to rewrite these
26+
// functions, make sure you have benchmarks to show your new solution is faster,
27+
// as well as a way to support non-ASCII character encodings.
2328

24-
LIBC_INLINE static constexpr bool isalpha(unsigned ch) {
25-
return (ch | 32) - 'a' < 26;
29+
// Returns true if ch is one of the lowercase letters 'a' through 'z', and
// false for every other value. Each letter has its own case label, so the
// check never relies on the letters having consecutive character codes.
LIBC_INLINE static constexpr bool islower(int ch) {
30+
switch (ch) {
31+
case 'a':
32+
case 'b':
33+
case 'c':
34+
case 'd':
35+
case 'e':
36+
case 'f':
37+
case 'g':
38+
case 'h':
39+
case 'i':
40+
case 'j':
41+
case 'k':
42+
case 'l':
43+
case 'm':
44+
case 'n':
45+
case 'o':
46+
case 'p':
47+
case 'q':
48+
case 'r':
49+
case 's':
50+
case 't':
51+
case 'u':
52+
case 'v':
53+
case 'w':
54+
case 'x':
55+
case 'y':
56+
case 'z':
57+
return true;
58+
default:
59+
return false;
60+
}
2661
}
2762

28-
LIBC_INLINE static constexpr bool isdigit(unsigned ch) {
29-
return (ch - '0') < 10;
63+
// Returns true if ch is one of the uppercase letters 'A' through 'Z', and
// false for every other value. Each letter has its own case label, so the
// check never relies on the letters having consecutive character codes.
LIBC_INLINE static constexpr bool isupper(int ch) {
64+
switch (ch) {
65+
case 'A':
66+
case 'B':
67+
case 'C':
68+
case 'D':
69+
case 'E':
70+
case 'F':
71+
case 'G':
72+
case 'H':
73+
case 'I':
74+
case 'J':
75+
case 'K':
76+
case 'L':
77+
case 'M':
78+
case 'N':
79+
case 'O':
80+
case 'P':
81+
case 'Q':
82+
case 'R':
83+
case 'S':
84+
case 'T':
85+
case 'U':
86+
case 'V':
87+
case 'W':
88+
case 'X':
89+
case 'Y':
90+
case 'Z':
91+
return true;
92+
default:
93+
return false;
94+
}
3095
}
3196

32-
LIBC_INLINE static constexpr bool isalnum(unsigned ch) {
33-
return isalpha(ch) || isdigit(ch);
97+
// Returns true if ch is one of the decimal digit characters '0' through '9',
// and false for every other value. The digits are listed one per case label,
// in the same style as the letter classification functions in this file.
LIBC_INLINE static constexpr bool isdigit(int ch) {
98+
switch (ch) {
99+
case '0':
100+
case '1':
101+
case '2':
102+
case '3':
103+
case '4':
104+
case '5':
105+
case '6':
106+
case '7':
107+
case '8':
108+
case '9':
109+
return true;
110+
default:
111+
return false;
112+
}
34113
}
35114

36-
LIBC_INLINE static constexpr bool isgraph(unsigned ch) {
37-
return 0x20 < ch && ch < 0x7f;
115+
// Maps each uppercase letter 'A' through 'Z' to the matching lowercase letter.
// Any other value (including letters that are already lowercase) is returned
// unchanged. The mapping is spelled out pair by pair instead of adding a fixed
// offset, so it does not depend on the distance between the two cases.
LIBC_INLINE static constexpr int tolower(int ch) {
116+
switch (ch) {
117+
case 'A':
118+
return 'a';
119+
case 'B':
120+
return 'b';
121+
case 'C':
122+
return 'c';
123+
case 'D':
124+
return 'd';
125+
case 'E':
126+
return 'e';
127+
case 'F':
128+
return 'f';
129+
case 'G':
130+
return 'g';
131+
case 'H':
132+
return 'h';
133+
case 'I':
134+
return 'i';
135+
case 'J':
136+
return 'j';
137+
case 'K':
138+
return 'k';
139+
case 'L':
140+
return 'l';
141+
case 'M':
142+
return 'm';
143+
case 'N':
144+
return 'n';
145+
case 'O':
146+
return 'o';
147+
case 'P':
148+
return 'p';
149+
case 'Q':
150+
return 'q';
151+
case 'R':
152+
return 'r';
153+
case 'S':
154+
return 's';
155+
case 'T':
156+
return 't';
157+
case 'U':
158+
return 'u';
159+
case 'V':
160+
return 'v';
161+
case 'W':
162+
return 'w';
163+
case 'X':
164+
return 'x';
165+
case 'Y':
166+
return 'y';
167+
case 'Z':
168+
return 'z';
169+
default:
170+
return ch;
171+
}
172+
}
173+
174+
// Returns true if ch is a letter in either case, false otherwise. The input is
// folded through tolower() before the switch, so only the 26 lowercase labels
// need to be listed to cover both 'a'-'z' and 'A'-'Z'.
LIBC_INLINE static constexpr bool isalpha(int ch) {
175+
switch (tolower(ch)) {
176+
case 'a':
177+
case 'b':
178+
case 'c':
179+
case 'd':
180+
case 'e':
181+
case 'f':
182+
case 'g':
183+
case 'h':
184+
case 'i':
185+
case 'j':
186+
case 'k':
187+
case 'l':
188+
case 'm':
189+
case 'n':
190+
case 'o':
191+
case 'p':
192+
case 'q':
193+
case 'r':
194+
case 's':
195+
case 't':
196+
case 'u':
197+
case 'v':
198+
case 'w':
199+
case 'x':
200+
case 'y':
201+
case 'z':
202+
return true;
203+
default:
204+
return false;
205+
}
38206
}
39207

40-
LIBC_INLINE static constexpr bool islower(unsigned ch) {
41-
return (ch - 'a') < 26;
208+
// Returns true if ch is a letter (per isalpha) or a decimal digit (per
// isdigit), false otherwise.
LIBC_INLINE static constexpr bool isalnum(int ch) {
209+
return isalpha(ch) || isdigit(ch);
42210
}
43211

44-
LIBC_INLINE static constexpr bool isupper(unsigned ch) {
45-
return (ch - 'A') < 26;
212+
// Converts a base-36 digit character to its numeric value: '0'-'9' yield 0-9
// and the letters yield 10-35. Letters are accepted in either case because ch
// is folded through tolower() before the switch.
//
// Any character that is not a digit or letter falls to the default label and
// returns 0, which is the same value returned for '0'. The return value alone
// therefore cannot tell an invalid character apart from the digit zero.
// NOTE(review): callers presumably validate the character (e.g. with
// isalnum) before calling this -- confirm against the call sites.
LIBC_INLINE static constexpr int b36_char_to_int(int ch) {
213+
switch (tolower(ch)) {
214+
case '0':
215+
return 0;
216+
case '1':
217+
return 1;
218+
case '2':
219+
return 2;
220+
case '3':
221+
return 3;
222+
case '4':
223+
return 4;
224+
case '5':
225+
return 5;
226+
case '6':
227+
return 6;
228+
case '7':
229+
return 7;
230+
case '8':
231+
return 8;
232+
case '9':
233+
return 9;
234+
case 'a':
235+
return 10;
236+
case 'b':
237+
return 11;
238+
case 'c':
239+
return 12;
240+
case 'd':
241+
return 13;
242+
case 'e':
243+
return 14;
244+
case 'f':
245+
return 15;
246+
case 'g':
247+
return 16;
248+
case 'h':
249+
return 17;
250+
case 'i':
251+
return 18;
252+
case 'j':
253+
return 19;
254+
case 'k':
255+
return 20;
256+
case 'l':
257+
return 21;
258+
case 'm':
259+
return 22;
260+
case 'n':
261+
return 23;
262+
case 'o':
263+
return 24;
264+
case 'p':
265+
return 25;
266+
case 'q':
267+
return 26;
268+
case 'r':
269+
return 27;
270+
case 's':
271+
return 28;
272+
case 't':
273+
return 29;
274+
case 'u':
275+
return 30;
276+
case 'v':
277+
return 31;
278+
case 'w':
279+
return 32;
280+
case 'x':
281+
return 33;
282+
case 'y':
283+
return 34;
284+
case 'z':
285+
return 35;
286+
default:
287+
return 0;
288+
}
46289
}
47290

48-
LIBC_INLINE static constexpr bool isspace(unsigned ch) {
49-
return ch == ' ' || (ch - '\t') < 5;
291+
// Returns true if ch is one of the six whitespace characters listed below:
// space, horizontal tab, newline, vertical tab, form feed, or carriage
// return. Every other value returns false. The characters are named by their
// escape sequences rather than by numeric code.
LIBC_INLINE static constexpr bool isspace(int ch) {
292+
switch (ch) {
293+
case ' ':
294+
case '\t':
295+
case '\n':
296+
case '\v':
297+
case '\f':
298+
case '\r':
299+
return true;
300+
default:
301+
return false;
302+
}
50303
}
51304

52-
LIBC_INLINE static constexpr int tolower(int ch) {
53-
if (isupper(ch))
54-
return ch + ('a' - 'A');
55-
return ch;
305+
// Not yet encoding independent: unlike the switch-based functions in this
// file, this compares ch against hard-coded numeric bounds instead of
// character literals. Returns true when ch lies strictly between 0x20 and
// 0x7f (the codes for space and DEL in ASCII), and false otherwise.
306+
LIBC_INLINE static constexpr bool isgraph(int ch) {
307+
return 0x20 < ch && ch < 0x7f;
56308
}
57309

58310
} // namespace internal

0 commit comments

Comments
 (0)