1 use regex_automata::DFA;
2 
3 use crate::{
4     ext_slice::ByteSlice,
5     unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
6 };
7 
/// An iterator over sentences in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct Sentences<'a> {
    /// The bytes that have not yet been yielded by this iterator.
    bs: &'a [u8],
}

impl<'a> Sentences<'a> {
    /// Create a sentence iterator over `bs`, starting at its first byte.
    pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
        Sentences { bs }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The returned slice is the portion of the input that this iterator has
    /// not yet yielded; it shrinks from the front as sentences are consumed.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"I want this. Not that. Right now.".sentences();
    ///
    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
    /// it.next();
    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
56 
57 impl<'a> Iterator for Sentences<'a> {
58     type Item = &'a str;
59 
60     #[inline]
next(&mut self) -> Option<&'a str>61     fn next(&mut self) -> Option<&'a str> {
62         let (sentence, size) = decode_sentence(self.bs);
63         if size == 0 {
64             return None;
65         }
66         self.bs = &self.bs[size..];
67         Some(sentence)
68     }
69 }
70 
/// An iterator over sentences in a byte string, along with their byte offsets.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// sentence yielded with those indices. For example, when this iterator
/// encounters `\xFF` in the byte string, then it will yield a pair of indices
/// ranging over a single byte, but will provide an `&str` equivalent to
/// `"\u{FFFD}"`, which is three bytes in length. However, when given only
/// valid UTF-8, then all indices are in exact correspondence with their paired
/// sentence.
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not yet been yielded by this iterator.
    bs: &'a [u8],
    /// Byte offset, relative to the original input, at which `bs` begins.
    forward_index: usize,
}

impl<'a> SentenceIndices<'a> {
    /// Create a sentence-index iterator over `bs`, with offsets starting at 0.
    pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
        SentenceIndices { bs, forward_index: 0 }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The returned slice is the portion of the input that this iterator has
    /// not yet yielded; it shrinks from the front as sentences are consumed.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
    ///
    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
    /// it.next();
    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
128 
129 impl<'a> Iterator for SentenceIndices<'a> {
130     type Item = (usize, usize, &'a str);
131 
132     #[inline]
next(&mut self) -> Option<(usize, usize, &'a str)>133     fn next(&mut self) -> Option<(usize, usize, &'a str)> {
134         let index = self.forward_index;
135         let (word, size) = decode_sentence(self.bs);
136         if size == 0 {
137             return None;
138         }
139         self.bs = &self.bs[size..];
140         self.forward_index += size;
141         Some((index, index + size, word))
142     }
143 }
144 
decode_sentence(bs: &[u8]) -> (&str, usize)145 fn decode_sentence(bs: &[u8]) -> (&str, usize) {
146     if bs.is_empty() {
147         ("", 0)
148     } else if let Some(end) = SENTENCE_BREAK_FWD.find(bs) {
149         // Safe because a match can only occur for valid UTF-8.
150         let sentence = unsafe { bs[..end].to_str_unchecked() };
151         (sentence, sentence.len())
152     } else {
153         const INVALID: &'static str = "\u{FFFD}";
154         // No match on non-empty bytes implies we found invalid UTF-8.
155         let (_, size) = utf8::decode_lossy(bs);
156         (INVALID, size)
157     }
158 }
159 
#[cfg(all(test, feature = "std"))]
mod tests {
    #[cfg(not(miri))]
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Check forward sentence iteration against every case loaded by
    /// `ucdtests`.
    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (case_no, case) in ucdtests().into_iter().enumerate() {
            // The input is the expected sentences glued back together.
            let input = case.sentences.concat();
            let actual = sentences(input.as_bytes());
            assert_eq!(
                case.sentences,
                actual,
                "\n\nsentence forward break test {} failed:\n\
                 given:    {:?}\n\
                 expected: {:?}\n\
                 got:      {:?}\n",
                case_no,
                input,
                strs_to_bstrs(&case.sentences),
                strs_to_bstrs(&actual),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        // Compare as `Vec`s, expected on the left and actual on the right.
        let check = |expected: &[&str], input: &[u8]| {
            assert_eq!(expected.to_vec(), sentences(input));
        };

        check(&["a.. ", "A"], b"a.. A");
        check(&["a.. a"], b"a.. a");

        check(&["a... ", "A"], b"a... A");
        check(&["a... a"], b"a... a");

        check(&["a...,..., a"], b"a...,..., a");
    }

    /// Collect every sentence yielded for `bytes` into a vector.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        let mut found = Vec::new();
        found.extend(bytes.sentences());
        found
    }

    /// Borrow each string as raw bytes (used in the failure message above).
    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        let mut out = Vec::with_capacity(strs.len());
        for s in strs {
            out.push(s.as_ref().as_bytes());
        }
        out
    }

    /// Return all of the UCD for sentence breaks.
    ///
    /// Lines starting with `#` and lines containing "surrogate" are skipped
    /// (presumably because surrogates cannot appear in UTF-8 input; confirm
    /// against the test data).
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        TESTDATA
            .lines()
            .map(str::trim)
            .filter(|line| {
                !(line.starts_with("#") || line.contains("surrogate"))
            })
            .map(|line| line.parse().unwrap())
            .collect()
    }
}
226