9
9
// except according to those terms.
10
10
11
11
use core:: cmp;
12
+ use core:: iter:: Filter ;
12
13
13
14
// All of the logic for forward iteration over sentences
14
15
mod fwd {
@@ -40,6 +41,7 @@ mod fwd {
40
41
StatePart :: Sot
41
42
] ) ;
42
43
44
+ #[ derive( Clone ) ]
43
45
pub struct SentenceBreaks < ' a > {
44
46
pub string : & ' a str ,
45
47
pos : usize ,
@@ -256,13 +258,32 @@ mod fwd {
256
258
257
259
}
258
260
261
+ /// An iterator over the substrings of a string which, after splitting the string on
262
+ /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
263
+ /// contain any characters with the
264
+ /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
265
+ /// property, or with
266
+ /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
267
+ #[ derive( Clone ) ]
268
+ pub struct UnicodeSentences < ' a > {
269
+ inner : Filter < USentenceBounds < ' a > , fn ( & & str ) -> bool > ,
270
+ }
271
+
259
272
/// External iterator for a string's
260
273
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
274
+ #[ derive( Clone ) ]
261
275
pub struct USentenceBounds < ' a > {
262
276
iter : fwd:: SentenceBreaks < ' a > ,
263
277
sentence_start : Option < usize >
264
278
}
265
279
280
+ /// External iterator for sentence boundaries and byte offsets.
281
+ #[ derive( Clone ) ]
282
+ pub struct USentenceBoundIndices < ' a > {
283
+ start_offset : usize ,
284
+ iter : USentenceBounds < ' a > ,
285
+ }
286
+
266
287
#[ inline]
267
288
pub fn new_sentence_bounds < ' a > ( source : & ' a str ) -> USentenceBounds < ' a > {
268
289
USentenceBounds {
@@ -271,6 +292,32 @@ pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
271
292
}
272
293
}
273
294
295
+ #[ inline]
296
+ pub fn new_sentence_bound_indices < ' a > ( source : & ' a str ) -> USentenceBoundIndices < ' a > {
297
+ USentenceBoundIndices {
298
+ start_offset : source. as_ptr ( ) as usize ,
299
+ iter : new_sentence_bounds ( source)
300
+ }
301
+ }
302
+
303
+ #[ inline]
304
+ pub fn new_unicode_sentences < ' b > ( s : & ' b str ) -> UnicodeSentences < ' b > {
305
+ use super :: UnicodeSegmentation ;
306
+ use tables:: util:: is_alphanumeric;
307
+
308
+ fn has_alphanumeric ( s : & & str ) -> bool { s. chars ( ) . any ( |c| is_alphanumeric ( c) ) }
309
+ let has_alphanumeric: fn ( & & str ) -> bool = has_alphanumeric; // coerce to fn pointer
310
+
311
+ UnicodeSentences { inner : s. split_sentence_bounds ( ) . filter ( has_alphanumeric) }
312
+ }
313
+
314
+ impl < ' a > Iterator for UnicodeSentences < ' a > {
315
+ type Item = & ' a str ;
316
+
317
+ #[ inline]
318
+ fn next ( & mut self ) -> Option < & ' a str > { self . inner . next ( ) }
319
+ }
320
+
274
321
impl < ' a > Iterator for USentenceBounds < ' a > {
275
322
type Item = & ' a str ;
276
323
@@ -300,3 +347,17 @@ impl<'a> Iterator for USentenceBounds<'a> {
300
347
}
301
348
}
302
349
}
350
+
351
+ impl < ' a > Iterator for USentenceBoundIndices < ' a > {
352
+ type Item = ( usize , & ' a str ) ;
353
+
354
+ #[ inline]
355
+ fn next ( & mut self ) -> Option < ( usize , & ' a str ) > {
356
+ self . iter . next ( ) . map ( |s| ( s. as_ptr ( ) as usize - self . start_offset , s) )
357
+ }
358
+
359
+ #[ inline]
360
+ fn size_hint ( & self ) -> ( usize , Option < usize > ) {
361
+ self . iter . size_hint ( )
362
+ }
363
+ }
0 commit comments