Skip to content

Commit 50058a5

Browse files
committed
Adds unicode_sentences and split_sentence_bound_indices
1 parent 7ac6f29 commit 50058a5

File tree

2 files changed

+87
-1
lines changed

2 files changed

+87
-1
lines changed

src/lib.rs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ pub use grapheme::{Graphemes, GraphemeIndices};
6767
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
6868
pub use tables::UNICODE_VERSION;
6969
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
70-
pub use sentence::{USentenceBounds};
70+
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
7171

7272
mod grapheme;
7373
mod tables;
@@ -181,7 +181,22 @@ pub trait UnicodeSegmentation {
181181
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
182182
///
183183
/// Only substrings that contain at least one alphanumeric character are returned, so their concatenation may omit parts of the original string.
184+
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
185+
186+
/// Returns an iterator over substrings of `self` separated on
187+
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
188+
///
189+
/// Every substring between consecutive boundaries is returned, including those
190+
/// that contain no alphanumeric characters, i.e. no character with the
191+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
192+
/// property or with
193+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
194+
/// The concatenation of the substrings returned by this function is just the original string.
184195
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
196+
197+
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
198+
/// and their offsets. See `split_sentence_bounds()` for more information.
199+
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
185200
}
186201

187202
impl UnicodeSegmentation for str {
@@ -210,8 +225,18 @@ impl UnicodeSegmentation for str {
210225
word::new_word_bound_indices(self)
211226
}
212227

228+
#[inline]
229+
fn unicode_sentences(&self) -> UnicodeSentences {
230+
sentence::new_unicode_sentences(self)
231+
}
232+
213233
#[inline]
214234
fn split_sentence_bounds(&self) -> USentenceBounds {
215235
sentence::new_sentence_bounds(self)
216236
}
237+
238+
#[inline]
239+
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
240+
sentence::new_sentence_bound_indices(self)
241+
}
217242
}

src/sentence.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
// except according to those terms.
1010

1111
use core::cmp;
12+
use core::iter::Filter;
1213

1314
// All of the logic for forward iteration over sentences
1415
mod fwd {
@@ -40,6 +41,7 @@ mod fwd {
4041
StatePart::Sot
4142
]);
4243

44+
#[derive(Clone)]
4345
pub struct SentenceBreaks<'a> {
4446
pub string: &'a str,
4547
pos: usize,
@@ -256,13 +258,32 @@ mod fwd {
256258

257259
}
258260

261+
/// An iterator over the substrings of a string which, after splitting the string on
262+
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
263+
/// contain any characters with the
264+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
265+
/// property, or with
266+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
267+
#[derive(Clone)]
268+
pub struct UnicodeSentences<'a> {
269+
inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
270+
}
271+
259272
/// External iterator for a string's
260273
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
274+
#[derive(Clone)]
261275
pub struct USentenceBounds<'a> {
262276
iter: fwd::SentenceBreaks<'a>,
263277
sentence_start: Option<usize>
264278
}
265279

280+
/// External iterator for sentence boundaries and byte offsets.
281+
#[derive(Clone)]
282+
pub struct USentenceBoundIndices<'a> {
283+
start_offset: usize,
284+
iter: USentenceBounds<'a>,
285+
}
286+
266287
#[inline]
267288
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
268289
USentenceBounds {
@@ -271,6 +292,32 @@ pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
271292
}
272293
}
273294

295+
#[inline]
296+
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
297+
USentenceBoundIndices {
298+
start_offset: source.as_ptr() as usize,
299+
iter: new_sentence_bounds(source)
300+
}
301+
}
302+
303+
#[inline]
304+
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
305+
use super::UnicodeSegmentation;
306+
use tables::util::is_alphanumeric;
307+
308+
fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
309+
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
310+
311+
UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) }
312+
}
313+
314+
impl<'a> Iterator for UnicodeSentences<'a> {
315+
type Item = &'a str;
316+
317+
#[inline]
318+
fn next(&mut self) -> Option<&'a str> { self.inner.next() }
319+
}
320+
274321
impl<'a> Iterator for USentenceBounds<'a> {
275322
type Item = &'a str;
276323

@@ -300,3 +347,17 @@ impl<'a> Iterator for USentenceBounds<'a> {
300347
}
301348
}
302349
}
350+
351+
impl<'a> Iterator for USentenceBoundIndices<'a> {
352+
type Item = (usize, &'a str);
353+
354+
#[inline]
355+
fn next(&mut self) -> Option<(usize, &'a str)> {
356+
self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
357+
}
358+
359+
#[inline]
360+
fn size_hint(&self) -> (usize, Option<usize>) {
361+
self.iter.size_hint()
362+
}
363+
}

0 commit comments

Comments
 (0)