Skip to content

Commit 350f480

Browse files
committed
Merge pull request #1534 from killerswan/string_work
Added string functions: split_func, split_char, lines, lines_any, words
2 parents 3466c9b + d8b0a19 commit 350f480

File tree

2 files changed

+189
-9
lines changed

2 files changed

+189
-9
lines changed

src/libcore/str.rs

Lines changed: 97 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ String manipulation.
77
export eq, lteq, hash, is_empty, is_not_empty, is_whitespace, byte_len,
88
byte_len_range, index,
99
rindex, find, starts_with, ends_with, substr, slice, split, splitn,
10-
split_str, concat, connect, to_lower, to_upper, replace, char_slice,
10+
split_str, split_func, split_char, lines, lines_any, words,
11+
concat, connect, to_lower, to_upper, replace, char_slice,
1112
trim_left, trim_right, trim, unshift_char, shift_char, pop_char,
1213
push_char, is_utf8, from_chars, to_chars, char_len, char_len_range,
1314
char_at, bytes, is_ascii, shift_byte, pop_byte,
@@ -252,7 +253,7 @@ fn from_chars(chs: [char]) -> str {
252253
/*
253254
Function: utf8_char_width
254255
255-
FIXME: What does this function do?
256+
Given a first byte, determine how many bytes are in this UTF-8 character
256257
*/
257258
pure fn utf8_char_width(b: u8) -> uint {
258259
let byte: uint = b as uint;
@@ -275,15 +276,27 @@ Pluck a character out of a string and return the index of the next character.
275276
This function can be used to iterate over the unicode characters of a string.
276277
277278
Example:
278-
279-
> let s = "Clam chowder, hot sauce, pork rinds";
280-
> let i = 0;
281-
> while i < len(s) {
282-
> let {ch, next} = char_range_at(s, i);
283-
> log(debug, ch);
284-
> i = next;
279+
> let s = "中华Việt Nam";
280+
> let i = 0u;
281+
> while i < str::byte_len(s) {
282+
> let {ch, next} = str::char_range_at(s, i);
283+
> std::io::println(#fmt("%u: %c",i,ch));
284+
> i = next;
285285
> }
286286
287+
Example output:
288+
289+
0: 中
290+
3: 华
291+
6: V
292+
7: i
293+
8: ệ
294+
11: t
295+
12:
296+
13: N
297+
14: a
298+
15: m
299+
287300
Parameters:
288301
289302
s - The string
@@ -721,6 +734,8 @@ Split a string at each occurrence of a given separator
721734
Returns:
722735
723736
A vector containing all the strings between each occurrence of the separator
737+
738+
FIXME: should be renamed to split_byte
724739
*/
725740
fn split(s: str, sep: u8) -> [str] {
726741
let v: [str] = [];
@@ -772,6 +787,9 @@ leading fields are suppressed, and empty trailing fields are preserved.
772787
Returns:
773788
774789
A vector containing all the strings between each occurrence of the separator.
790+
791+
FIXME: should behave like split and split_char:
792+
assert ["", "XXX", "YYY", ""] == split_str(".XXX.YYY.", ".");
775793
*/
776794
fn split_str(s: str, sep: str) -> [str] {
777795
assert byte_len(sep) > 0u;
@@ -799,6 +817,76 @@ fn split_str(s: str, sep: str) -> [str] {
799817
ret v;
800818
}
801819

820+
/*
821+
Function: split_func
822+
823+
Splits a string into substrings using a function
824+
(unicode safe)
825+
826+
FIXME: will be renamed to split.
827+
*/
828+
fn split_func(ss: str, sepfn: fn&(cc: char)->bool) -> [str] {
829+
let vv: [str] = [];
830+
let accum: str = "";
831+
let ends_with_sep: bool = false;
832+
833+
str::iter_chars(ss, {|cc| if sepfn(cc) {
834+
vv += [accum];
835+
accum = "";
836+
ends_with_sep = true;
837+
} else {
838+
str::push_char(accum, cc);
839+
ends_with_sep = false;
840+
}
841+
});
842+
843+
if char_len(accum) >= 0u || ends_with_sep {
844+
vv += [accum];
845+
}
846+
847+
ret vv;
848+
}
849+
850+
/*
851+
Function: split_char
852+
853+
Splits a string into a vector of the substrings separated by a given character
854+
*/
855+
fn split_char(ss: str, cc: char) -> [str] {
856+
split_func(ss, {|kk| kk == cc})
857+
}
858+
859+
/*
860+
Function: lines
861+
862+
Splits a string into a vector of the substrings
863+
separated by LF ('\n')
864+
*/
865+
fn lines(ss: str) -> [str] {
866+
split_func(ss, {|cc| cc == '\n'})
867+
}
868+
869+
/*
870+
Function: lines_any
871+
872+
Splits a string into a vector of the substrings
873+
separated by LF ('\n') and/or CR LF ('\r\n')
874+
*/
875+
fn lines_any(ss: str) -> [str] {
876+
vec::map(lines(ss), {|s| trim_right(s)})
877+
}
878+
879+
/*
880+
Function: words
881+
882+
Splits a string into a vector of the substrings
883+
separated by whitespace
884+
*/
885+
fn words(ss: str) -> [str] {
886+
ret vec::filter( split_func(ss, {|cc| char::is_whitespace(cc)}),
887+
{|w| 0u < str::char_len(w)});
888+
}
889+
802890
/*
803891
Function: concat
804892

src/test/stdtest/str.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,83 @@ fn test_split_str() {
8080
let v = str::split_str(s, sep);
8181
assert str::eq(v[i], k);
8282
}
83+
84+
//FIXME: should behave like split and split_char:
85+
//assert ["", "XXX", "YYY", ""] == str::split_str(".XXX.YYY.", ".");
86+
8387
t("abc::hello::there", "::", 0, "abc");
8488
t("abc::hello::there", "::", 1, "hello");
8589
t("abc::hello::there", "::", 2, "there");
8690
t("::hello::there", "::", 0, "hello");
8791
t("hello::there::", "::", 2, "");
8892
t("::hello::there::", "::", 2, "");
93+
t("ประเทศไทย中华Việt Nam", "中华", 0, "ประเทศไทย");
94+
t("ประเทศไทย中华Việt Nam", "中华", 1, "Việt Nam");
95+
}
96+
97+
#[test]
98+
fn test_split_func () {
99+
let data = "ประเทศไทย中华Việt Nam";
100+
assert ["ประเทศไทย中", "Việt Nam"]
101+
== str::split_func (data, {|cc| cc == '华'});
102+
103+
assert ["", "", "XXX", "YYY", ""]
104+
== str::split_func("zzXXXzYYYz", char::is_lowercase);
105+
106+
assert ["zz", "", "", "z", "", "", "z"]
107+
== str::split_func("zzXXXzYYYz", char::is_uppercase);
108+
109+
assert ["",""] == str::split_func("z", {|cc| cc == 'z'});
110+
assert [""] == str::split_func("", {|cc| cc == 'z'});
111+
assert ["ok"] == str::split_func("ok", {|cc| cc == 'z'});
112+
}
113+
114+
#[test]
115+
fn test_split_char () {
116+
let data = "ประเทศไทย中华Việt Nam";
117+
assert ["ประเทศไทย中", "Việt Nam"]
118+
== str::split_char(data, '华');
119+
120+
assert ["", "", "XXX", "YYY", ""]
121+
== str::split_char("zzXXXzYYYz", 'z');
122+
assert ["",""] == str::split_char("z", 'z');
123+
assert [""] == str::split_char("", 'z');
124+
assert ["ok"] == str::split_char("ok", 'z');
125+
}
126+
127+
#[test]
128+
fn test_lines () {
129+
let lf = "\nMary had a little lamb\nLittle lamb\n";
130+
let crlf = "\r\nMary had a little lamb\r\nLittle lamb\r\n";
131+
132+
assert ["", "Mary had a little lamb", "Little lamb", ""]
133+
== str::lines(lf);
134+
135+
assert ["", "Mary had a little lamb", "Little lamb", ""]
136+
== str::lines_any(lf);
137+
138+
assert ["\r", "Mary had a little lamb\r", "Little lamb\r", ""]
139+
== str::lines(crlf);
140+
141+
assert ["", "Mary had a little lamb", "Little lamb", ""]
142+
== str::lines_any(crlf);
143+
144+
assert [""] == str::lines ("");
145+
assert [""] == str::lines_any("");
146+
assert ["",""] == str::lines ("\n");
147+
assert ["",""] == str::lines_any("\n");
148+
assert ["banana"] == str::lines ("banana");
149+
assert ["banana"] == str::lines_any("banana");
150+
}
151+
152+
#[test]
153+
fn test_words () {
154+
let data = "\nMary had a little lamb\nLittle lamb\n";
155+
assert ["Mary","had","a","little","lamb","Little","lamb"]
156+
== str::words(data);
157+
158+
assert ["ok"] == str::words("ok");
159+
assert [] == str::words("");
89160
}
90161

91162
#[test]
@@ -215,6 +286,27 @@ fn test_char_slice() {
215286
assert (str::eq("bc", str::char_slice("abc", 1u, 3u)));
216287
assert (str::eq("", str::char_slice("abc", 1u, 1u)));
217288
assert (str::eq("\u65e5", str::char_slice("\u65e5\u672c", 0u, 1u)));
289+
290+
let data = "ประเทศไทย中华";
291+
assert (str::eq("ป", str::char_slice(data, 0u, 1u)));
292+
assert (str::eq("ร", str::char_slice(data, 1u, 2u)));
293+
assert (str::eq("华", str::char_slice(data, 10u, 11u)));
294+
assert (str::eq("", str::char_slice(data, 1u, 1u)));
295+
296+
fn a_million_letter_X() -> str {
297+
let i = 0;
298+
let rs = "";
299+
while i < 100000 { rs += "华华华华华华华华华华"; i += 1; }
300+
ret rs;
301+
}
302+
fn half_a_million_letter_X() -> str {
303+
let i = 0;
304+
let rs = "";
305+
while i < 100000 { rs += "华华华华华"; i += 1; }
306+
ret rs;
307+
}
308+
assert (str::eq(half_a_million_letter_X(),
309+
str::char_slice(a_million_letter_X(), 0u, 500000u)));
218310
}
219311

220312
#[test]

0 commit comments

Comments
 (0)