Adds unicode_sentences and split_sentence_bound_indices

tomcumming · tomcumming · commit 50058a56157b · 2019-05-13T19:06:18.000+01:00
diff --git a/src/lib.rs b/src/lib.rs
@@ -67,7 +67,7 @@ pub use grapheme::{Graphemes, GraphemeIndices};
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use tables::UNICODE_VERSION;
 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
-pub use sentence::{USentenceBounds};
+pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
 
 mod grapheme;
 mod tables;
@@ -181,7 +181,22 @@ pub trait UnicodeSegmentation {
     /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
     ///
     /// The concatenation of the substrings returned by this function is just the original string.
+    fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
+
+    /// Returns an iterator over substrings of `self` separated on
+    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+    ///
+    /// Here, "sentences" are just those substrings which, after splitting on
+    /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
+    /// substring must contain at least one character with the
+    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+    /// property, or with
+    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
     fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
+
+    /// Returns an iterator over the substrings yielded by `split_sentence_bounds()`, each
+    /// paired with its byte offset from the start of `self`.
+    fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
 }
 
 impl UnicodeSegmentation for str {
@@ -210,8 +225,18 @@ impl UnicodeSegmentation for str {
         word::new_word_bound_indices(self)
     }
 
+    #[inline]
+    fn unicode_sentences(&self) -> UnicodeSentences {
+        sentence::new_unicode_sentences(self) // the filtering iterator is built in `sentence`
+    }
+
     #[inline]
     fn split_sentence_bounds(&self) -> USentenceBounds {
         sentence::new_sentence_bounds(self)
     }
+
+    #[inline]
+    fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
+        sentence::new_sentence_bound_indices(self) // records `self`'s start address for offsets
+    }
 }
diff --git a/src/sentence.rs b/src/sentence.rs
@@ -9,6 +9,7 @@
 // except according to those terms.
 
 use core::cmp;
+use core::iter::Filter;
 
 // All of the logic for forward iteration over sentences
 mod fwd {
@@ -40,6 +41,7 @@ mod fwd {
         StatePart::Sot
     ]);
 
+    #[derive(Clone)]
     pub struct SentenceBreaks<'a> {
         pub string: &'a str,
         pos: usize,
@@ -256,13 +258,32 @@ mod fwd {
 
 }
 
+/// An iterator over the sentences of a string: the substrings that remain after
+/// splitting on [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries)
+/// and discarding every piece that has no alphanumeric character, i.e. no character with the
+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) property or with
+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+/// Constructed by `new_unicode_sentences`, which wraps `USentenceBounds` in a `Filter`.
+#[derive(Clone)]
+pub struct UnicodeSentences<'a> {
+    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
+}
+
 /// External iterator for a string's
 /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+#[derive(Clone)]
 pub struct USentenceBounds<'a> {
     iter: fwd::SentenceBreaks<'a>,
     sentence_start: Option<usize>
 }
 
+/// External iterator yielding `(byte offset, sentence)` pairs for a string's sentence boundaries.
+#[derive(Clone)]
+pub struct USentenceBoundIndices<'a> {
+    start_offset: usize, // address of the source string's first byte, stored as an integer
+    iter: USentenceBounds<'a>, // underlying iterator over the sentence substrings
+}
+
 #[inline]
 pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
     USentenceBounds {
@@ -271,6 +292,32 @@ pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
     }
 }
 
+#[inline]
+pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
+    // Address of `source`'s first byte; `next()` subtracts it from each slice's address.
+    let origin = source.as_ptr() as usize;
+    let bounds = new_sentence_bounds(source);
+    USentenceBoundIndices { start_offset: origin, iter: bounds }
+}
+
+#[inline]
+pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
+    use tables::util::is_alphanumeric;
+    // A segment counts as a sentence only if at least one of its chars is alphanumeric.
+    fn keep_sentence(segment: &&str) -> bool {
+        segment.chars().any(|ch| is_alphanumeric(ch))
+    }
+    let predicate: fn(&&str) -> bool = keep_sentence; // fn item coerced to a fn pointer
+    UnicodeSentences { inner: new_sentence_bounds(s).filter(predicate) }
+}
+
+impl<'a> Iterator for UnicodeSentences<'a> {
+    type Item = &'a str;
+    // Every item comes straight from the filtered boundary iterator held in `inner`.
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> { Iterator::next(&mut self.inner) }
+}
+
 impl<'a> Iterator for USentenceBounds<'a> {
     type Item = &'a str;
 
@@ -300,3 +347,17 @@ impl<'a> Iterator for USentenceBounds<'a> {
         }
     }
 }
+
+impl<'a> Iterator for USentenceBoundIndices<'a> {
+    type Item = (usize, &'a str);
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        // Offset of a sentence = address of its first byte minus the recorded start address.
+        let origin = self.start_offset;
+        self.iter.next().map(|sentence| (sentence.as_ptr() as usize - origin, sentence))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) { self.iter.size_hint() }
+}