@@ -10,6 +10,7 @@ native "rust" mod rustrt {
10
10
fn str_from_vec ( vec[ mutable? u8 ] b ) -> str ;
11
11
fn str_from_cstr ( sbuf cstr) -> str ;
12
12
fn str_from_buf ( sbuf buf, uint len) -> str ;
13
+ fn str_push_byte ( str s, uint byte) -> str ;
13
14
fn refcount [ T ] ( str s) -> uint ;
14
15
}
15
16
@@ -65,15 +66,42 @@ fn hash(&str s) -> uint {
65
66
ret u;
66
67
}
67
68
69
+ // UTF-8 tags and ranges
//
// tag_* values are the marker bits OR'd into an encoded byte by
// push_utf8_bytes: tag_cont goes on every continuation byte, and
// tag_<n>_b goes on the first byte of an n-byte sequence.
// max_<n>_b values are exclusive upper bounds: push_utf8_bytes uses the
// n-byte form for the first n whose max_<n>_b exceeds the code point.
70
// Continuation-byte marker as a u8, compared against (byte & 0xc0).
+ const u8 tag_cont_u8 = 0x80_u8 ;
71
// Continuation-byte marker as a uint, for building bytes from a code point.
+ const uint tag_cont = 0x80_ u;
72
// Code points below this are written as a single byte.
+ const uint max_one_b = 0x80_ u;
73
+ const uint tag_two_b = 0xc0_ u;
74
+ const uint max_two_b = 0x800_ u;
75
+ const uint tag_three_b = 0xe0_ u;
76
+ const uint max_three_b = 0x10000_ u;
77
+ const uint tag_four_b = 0xf0_ u;
78
+ const uint max_four_b = 0x200000_ u;
79
+ const uint tag_five_b = 0xf8_ u;
80
+ const uint max_five_b = 0x4000000_ u;
81
// No max_six_b: six bytes is the fallback for everything not below max_five_b.
+ const uint tag_six_b = 0xfc_ u;
82
+
68
83
fn is_utf8 ( vec[ u8] v ) -> bool {
69
- fail; // FIXME
84
+ auto i = 0 u;
85
+ auto total = _vec. len [ u8] ( v) ;
86
+ while ( i < total) {
87
+ auto chsize = utf8_char_width ( v. ( i) ) ;
88
+ if ( chsize == 0 u) { ret false ; }
89
+ if ( i + chsize > total) { ret false ; }
90
+ i += 1 u;
91
+ while ( chsize > 1 u) {
92
+ if ( v. ( i) & 0xc0_u8 != tag_cont_u8) { ret false ; }
93
+ i += 1 u;
94
+ chsize -= 1 u;
95
+ }
96
+ }
97
+ ret true;
70
98
}
71
99
72
100
fn is_ascii ( str s) -> bool {
73
101
let uint i = byte_len ( s) ;
74
102
while ( i > 0 u) {
75
103
i -= 1 u;
76
- if ( ( s. ( i) & 0x80u8 ) != 0u8 ) {
104
+ if ( ( s. ( i) & 0x80_u8 ) != 0u8 ) {
77
105
ret false ;
78
106
}
79
107
}
@@ -134,6 +162,139 @@ unsafe fn str_from_buf(sbuf buf, uint len) -> str {
134
162
ret rustrt. str_from_buf ( buf, len) ;
135
163
}
136
164
165
+ fn push_utf8_bytes ( & mutable str s, char ch) {
166
+ auto code = ch as uint ;
167
+ if ( code < max_one_b) {
168
+ s = rustrt. str_push_byte ( s, code) ;
169
+ } else if ( code < max_two_b) {
170
+ s = rustrt. str_push_byte ( s, ( ( code >> 6 u) & 0x1f_ u) | tag_two_b) ;
171
+ s = rustrt. str_push_byte ( s, ( code & 0x3f_ u) | tag_cont) ;
172
+ } else if ( code < max_three_b) {
173
+ s = rustrt. str_push_byte ( s, ( ( code >> 12 u) & 0x0f_ u) | tag_three_b) ;
174
+ s = rustrt. str_push_byte ( s, ( ( code >> 6 u) & 0x3f_ u) | tag_cont) ;
175
+ s = rustrt. str_push_byte ( s, ( code & 0x3f_ u) | tag_cont) ;
176
+ } else if ( code < max_four_b) {
177
+ s = rustrt. str_push_byte ( s, ( ( code >> 18 u) & 0x07_ u) | tag_four_b) ;
178
+ s = rustrt. str_push_byte ( s, ( ( code >> 12 u) & 0x3f_ u) | tag_cont) ;
179
+ s = rustrt. str_push_byte ( s, ( ( code >> 6 u) & 0x3f_ u) | tag_cont) ;
180
+ s = rustrt. str_push_byte ( s, ( code & 0x3f_ u) | tag_cont) ;
181
+ } else if ( code < max_five_b) {
182
+ s = rustrt. str_push_byte ( s, ( ( code >> 24 u) & 0x03_ u) | tag_five_b) ;
183
+ s = rustrt. str_push_byte ( s, ( ( code >> 18 u) & 0x3f_ u) | tag_cont) ;
184
+ s = rustrt. str_push_byte ( s, ( ( code >> 12 u) & 0x3f_ u) | tag_cont) ;
185
+ s = rustrt. str_push_byte ( s, ( ( code >> 6 u) & 0x3f_ u) | tag_cont) ;
186
+ s = rustrt. str_push_byte ( s, ( code & 0x3f_ u) | tag_cont) ;
187
+ } else {
188
+ s = rustrt. str_push_byte ( s, ( ( code >> 30 u) & 0x01_ u) | tag_six_b) ;
189
+ s = rustrt. str_push_byte ( s, ( ( code >> 24 u) & 0x3f_ u) | tag_cont) ;
190
+ s = rustrt. str_push_byte ( s, ( ( code >> 18 u) & 0x3f_ u) | tag_cont) ;
191
+ s = rustrt. str_push_byte ( s, ( ( code >> 12 u) & 0x3f_ u) | tag_cont) ;
192
+ s = rustrt. str_push_byte ( s, ( ( code >> 6 u) & 0x3f_ u) | tag_cont) ;
193
+ s = rustrt. str_push_byte ( s, ( code & 0x3f_ u) | tag_cont) ;
194
+ }
195
+ }
196
+
197
+ fn from_char ( char ch) -> str {
198
+ auto buf = "" ;
199
+ push_utf8_bytes ( buf, ch) ;
200
+ ret buf;
201
+ }
202
+
203
+ fn from_chars ( vec[ char] chs ) -> str {
204
+ auto buf = "" ;
205
+ for ( char ch in chs) { push_utf8_bytes ( buf, ch) ; }
206
+ ret buf;
207
+ }
208
+
209
+ fn utf8_char_width ( u8 b) -> uint {
210
+ let uint byte = b as uint ;
211
+ if ( byte < 0x80_ u) { ret 1 u; }
212
+ if ( byte < 0xc0_ u) { ret 0 u; } // Not a valid start byte
213
+ if ( byte < 0xe0_ u) { ret 2 u; }
214
+ if ( byte < 0xf0_ u) { ret 3 u; }
215
+ if ( byte < 0xf8_ u) { ret 4 u; }
216
+ if ( byte < 0xfc_ u) { ret 5 u; }
217
+ ret 6 u;
218
+ }
219
+
220
+ fn char_range_at ( str s, uint i) -> tup ( char , uint ) {
221
+ auto b0 = s. ( i) ;
222
+ auto w = utf8_char_width ( b0) ;
223
+ check ( w != 0 u) ;
224
+ if ( w == 1 u) { ret tup ( b0 as char , i + 1 u) ; }
225
+ auto val = 0 u;
226
+ auto end = i + w;
227
+ i += 1 u;
228
+ while ( i < end) {
229
+ auto byte = s. ( i) ;
230
+ check ( byte & 0xc0_u8 == tag_cont_u8) ;
231
+ val <<= 6 u;
232
+ val += ( byte & 0x3f_u8 ) as uint ;
233
+ i += 1 u;
234
+ }
235
+ // Clunky way to get the right bits from the first byte. Uses two shifts,
236
+ // the first to clip off the marker bits at the left of the byte, and then
237
+ // a second (as uint) to get it to the right position.
238
+ val += ( ( b0 << ( ( w + 1 u) as u8 ) ) as uint ) << ( ( w - 1 u) * 6 u - w - 1 u) ;
239
+ ret tup( val as char , i) ;
240
+ }
241
+
242
+ fn char_at ( str s, uint i) -> char {
243
+ ret char_range_at ( s, i) . _0 ;
244
+ }
245
+
246
+ fn char_len ( str s) -> uint {
247
+ auto i = 0 u;
248
+ auto len = 0 u;
249
+ auto total = byte_len ( s) ;
250
+ while ( i < total) {
251
+ auto chsize = utf8_char_width ( s. ( i) ) ;
252
+ check ( chsize > 0 u) ;
253
+ len += 1 u;
254
+ i += chsize;
255
+ }
256
+ check ( i == total) ;
257
+ ret len;
258
+ }
259
+
260
+ fn to_chars ( str s) -> vec[ char ] {
261
+ let vec[ char] buf = vec ( ) ;
262
+ auto i = 0 u;
263
+ auto len = byte_len ( s) ;
264
+ while ( i < len) {
265
+ auto cur = char_range_at ( s, i) ;
266
+ _vec. push [ char] ( buf, cur. _0 ) ;
267
+ i = cur. _1 ;
268
+ }
269
+ ret buf;
270
+ }
271
+
272
+ fn push_char ( & mutable str s, char ch) {
273
+ s += from_char ( ch) ;
274
+ }
275
+
276
+ fn pop_char ( & mutable str s) -> char {
277
+ auto end = byte_len ( s) ;
278
+ while ( end > 0 u && s. ( end - 1 u) & 0xc0_u8 == tag_cont_u8) { end -= 1 u; }
279
+ check ( end > 0 u) ;
280
+ auto ch = char_at ( s, end - 1 u) ;
281
+ s = substr ( s, 0 u, end - 1 u) ;
282
+ ret ch;
283
+ }
284
+
285
+ fn shift_char ( & mutable str s) -> char {
286
+ auto r = char_range_at ( s, 0 u) ;
287
+ s = substr ( s, r. _1 , byte_len ( s) - r. _1 ) ;
288
+ ret r. _0 ;
289
+ }
290
+
291
// Prepends `ch` to `s`: `s` is reassigned to from_char(ch) followed by the
// previous contents of `s`.
+ fn unshift_char ( & mutable str s, char ch) {
292
+ // Workaround for rustboot order-of-evaluation issue -- if I put s
293
+ // directly after the +, the string ends up containing (only) the
294
+ // character, twice.
295
// Copy the old contents into a local first; do not fold `x` back into the
// expression below (see the note above).
+ auto x = s;
296
+ s = from_char ( ch) + x;
297
+ }
137
298
138
299
fn refcount ( str s) -> uint {
139
300
auto r = rustrt. refcount [ u8] ( s) ;
@@ -256,7 +417,7 @@ fn pop_byte(&mutable str s) -> u8 {
256
417
}
257
418
258
419
fn push_byte ( & mutable str s, u8 b) {
259
- s += unsafe_from_byte ( b ) ;
420
+ s = rustrt . str_push_byte ( s , b as uint ) ;
260
421
}
261
422
262
423
fn unshift_byte ( & mutable str s, u8 b) {
0 commit comments