1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use core::cmp;
12 
13 use crate::tables::grapheme::GraphemeCat;
14 
15 /// External iterator for grapheme clusters and byte offsets.
16 ///
17 /// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18 /// trait. See its documentation for more.
19 ///
20 /// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22 #[derive(Clone)]
23 pub struct GraphemeIndices<'a> {
24     start_offset: usize,
25     iter: Graphemes<'a>,
26 }
27 
28 impl<'a> GraphemeIndices<'a> {
29     #[inline]
30     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31     ///
32     /// ```rust
33     /// # use unicode_segmentation::UnicodeSegmentation;
34     /// let mut iter = "abc".grapheme_indices(true);
35     /// assert_eq!(iter.as_str(), "abc");
36     /// iter.next();
37     /// assert_eq!(iter.as_str(), "bc");
38     /// iter.next();
39     /// iter.next();
40     /// assert_eq!(iter.as_str(), "");
41     /// ```
as_str(&self) -> &'a str42     pub fn as_str(&self) -> &'a str {
43         self.iter.as_str()
44     }
45 }
46 
47 impl<'a> Iterator for GraphemeIndices<'a> {
48     type Item = (usize, &'a str);
49 
50     #[inline]
next(&mut self) -> Option<(usize, &'a str)>51     fn next(&mut self) -> Option<(usize, &'a str)> {
52         self.iter
53             .next()
54             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
55     }
56 
57     #[inline]
size_hint(&self) -> (usize, Option<usize>)58     fn size_hint(&self) -> (usize, Option<usize>) {
59         self.iter.size_hint()
60     }
61 }
62 
63 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
64     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>65     fn next_back(&mut self) -> Option<(usize, &'a str)> {
66         self.iter
67             .next_back()
68             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
69     }
70 }
71 
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    // The complete string being segmented; yielded slices borrow from it.
    string: &'a str,
    // Cursor at the front of the not-yet-iterated range; advanced by `next`.
    cursor: GraphemeCursor,
    // Cursor at the back of the not-yet-iterated range; moved by `next_back`.
    cursor_back: GraphemeCursor,
}
86 
87 impl<'a> Graphemes<'a> {
88     #[inline]
89     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
90     ///
91     /// ```rust
92     /// # use unicode_segmentation::UnicodeSegmentation;
93     /// let mut iter = "abc".graphemes(true);
94     /// assert_eq!(iter.as_str(), "abc");
95     /// iter.next();
96     /// assert_eq!(iter.as_str(), "bc");
97     /// iter.next();
98     /// iter.next();
99     /// assert_eq!(iter.as_str(), "");
100     /// ```
as_str(&self) -> &'a str101     pub fn as_str(&self) -> &'a str {
102         &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
103     }
104 }
105 
106 impl<'a> Iterator for Graphemes<'a> {
107     type Item = &'a str;
108 
109     #[inline]
size_hint(&self) -> (usize, Option<usize>)110     fn size_hint(&self) -> (usize, Option<usize>) {
111         let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
112         (cmp::min(slen, 1), Some(slen))
113     }
114 
115     #[inline]
next(&mut self) -> Option<&'a str>116     fn next(&mut self) -> Option<&'a str> {
117         let start = self.cursor.cur_cursor();
118         if start == self.cursor_back.cur_cursor() {
119             return None;
120         }
121         let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
122         Some(&self.string[start..next])
123     }
124 }
125 
126 impl<'a> DoubleEndedIterator for Graphemes<'a> {
127     #[inline]
next_back(&mut self) -> Option<&'a str>128     fn next_back(&mut self) -> Option<&'a str> {
129         let end = self.cursor_back.cur_cursor();
130         if end == self.cursor.cur_cursor() {
131             return None;
132         }
133         let prev = self
134             .cursor_back
135             .prev_boundary(self.string, 0)
136             .unwrap()
137             .unwrap();
138         Some(&self.string[prev..end])
139     }
140 }
141 
142 #[inline]
new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b>143 pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
144     let len = s.len();
145     Graphemes {
146         string: s,
147         cursor: GraphemeCursor::new(0, len, is_extended),
148         cursor_back: GraphemeCursor::new(len, len, is_extended),
149     }
150 }
151 
152 #[inline]
new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b>153 pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
154     GraphemeIndices {
155         start_offset: s.as_ptr() as usize,
156         iter: new_graphemes(s, is_extended),
157     }
158 }
159 
// TODO: maybe unify with `PairResult`?
// What the cursor currently knows about the potential boundary at its offset.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    // Nothing is known yet about this position.
    Unknown,
    // The position is known not to be a boundary.
    NotBreak,
    // The position is known to be a boundary.
    Break,
    // The codepoint after the cursor is a Regional Indicator Symbol (RIS), so
    // this is a boundary iff it is preceded by an even number of RIS
    // codepoints. (GB12, GB13)
    Regional,
    // The codepoint after the cursor is Extended_Pictographic, so whether this
    // is a boundary depends on the pre-context, according to GB11.
    Emoji,
}
177 
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other data structures where the string is not contiguous
/// or fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    // Current cursor position, as a byte offset into the whole string.
    offset: usize,
    // Total length of the string, in bytes.
    len: usize,
    // A config flag indicating whether this cursor computes legacy or extended
    // grapheme cluster boundaries (enables GB9a and GB9b if set).
    is_extended: bool,
    // Information about the potential boundary at `offset`.
    state: GraphemeState,
    // Category of the codepoint immediately preceding the cursor, if known.
    cat_before: Option<GraphemeCat>,
    // Category of the codepoint immediately following the cursor, if known.
    cat_after: Option<GraphemeCat>,
    // If set, at least one more codepoint immediately preceding this offset
    // is needed to resolve whether there is a boundary at `offset`.
    pre_context_offset: Option<usize>,
    // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    // is set, this counts the RIS between that offset and `offset`; otherwise
    // it is an accurate count relative to the string.
    ris_count: Option<usize>,
    // Set if a call to `prev_boundary` or `next_boundary` was suspended because
    // it needed more input.
    resuming: bool,
    // Cached lookup result: a scalar value range (inclusive lower and upper
    // bounds) and the grapheme category shared by that range.
    grapheme_cat_cache: (u32, u32, GraphemeCat),
}
210 
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // i.e. the chunk following the one that was given

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
234 
// The outcome of looking up a (before, after) pair of grapheme categories in
// `check_pair`. Some outcomes are final; others need more information.
#[derive(PartialEq, Eq)]
enum PairResult {
    NotBreak, // definitely not a break
    Break,    // definitely a break
    Extended, // a break iff the cursor is not in extended mode
    Regional, // a break if preceded by an even number of RIS
    Emoji,    // a break if preceded by emoji base and (Extend)*
}
244 
245 #[inline]
check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult246 fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
247     use self::PairResult::*;
248     use crate::tables::grapheme::GraphemeCat::*;
249     match (before, after) {
250         (GC_CR, GC_LF) => NotBreak,                                 // GB3
251         (GC_Control, _) => Break,                                   // GB4
252         (GC_CR, _) => Break,                                        // GB4
253         (GC_LF, _) => Break,                                        // GB4
254         (_, GC_Control) => Break,                                   // GB5
255         (_, GC_CR) => Break,                                        // GB5
256         (_, GC_LF) => Break,                                        // GB5
257         (GC_L, GC_L) => NotBreak,                                   // GB6
258         (GC_L, GC_V) => NotBreak,                                   // GB6
259         (GC_L, GC_LV) => NotBreak,                                  // GB6
260         (GC_L, GC_LVT) => NotBreak,                                 // GB6
261         (GC_LV, GC_V) => NotBreak,                                  // GB7
262         (GC_LV, GC_T) => NotBreak,                                  // GB7
263         (GC_V, GC_V) => NotBreak,                                   // GB7
264         (GC_V, GC_T) => NotBreak,                                   // GB7
265         (GC_LVT, GC_T) => NotBreak,                                 // GB8
266         (GC_T, GC_T) => NotBreak,                                   // GB8
267         (_, GC_Extend) => NotBreak,                                 // GB9
268         (_, GC_ZWJ) => NotBreak,                                    // GB9
269         (_, GC_SpacingMark) => Extended,                            // GB9a
270         (GC_Prepend, _) => Extended,                                // GB9b
271         (GC_ZWJ, GC_Extended_Pictographic) => Emoji,                // GB11
272         (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
273         (_, _) => Break,                                            // GB999
274     }
275 }
276 
277 impl GraphemeCursor {
278     /// Create a new cursor. The string and initial offset are given at creation
279     /// time, but the contents of the string are not. The `is_extended` parameter
280     /// controls whether extended grapheme clusters are selected.
281     ///
282     /// The `offset` parameter must be on a codepoint boundary.
283     ///
284     /// ```rust
285     /// # use unicode_segmentation::GraphemeCursor;
286     /// let s = "हिन्दी";
287     /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
288     /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
289     /// let mut extended = GraphemeCursor::new(0, s.len(), true);
290     /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
291     /// ```
new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor292     pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
293         let state = if offset == 0 || offset == len {
294             GraphemeState::Break
295         } else {
296             GraphemeState::Unknown
297         };
298         GraphemeCursor {
299             offset: offset,
300             len: len,
301             state: state,
302             is_extended: is_extended,
303             cat_before: None,
304             cat_after: None,
305             pre_context_offset: None,
306             ris_count: None,
307             resuming: false,
308             grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
309         }
310     }
311 
grapheme_category(&mut self, ch: char) -> GraphemeCat312     fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
313         use crate::tables::grapheme as gr;
314         use crate::tables::grapheme::GraphemeCat::*;
315 
316         if ch <= '\u{7e}' {
317             // Special-case optimization for ascii, except U+007F.  This
318             // improves performance even for many primarily non-ascii texts,
319             // due to use of punctuation and white space characters from the
320             // ascii range.
321             if ch >= '\u{20}' {
322                 GC_Any
323             } else if ch == '\n' {
324                 GC_LF
325             } else if ch == '\r' {
326                 GC_CR
327             } else {
328                 GC_Control
329             }
330         } else {
331             // If this char isn't within the cached range, update the cache to the
332             // range that includes it.
333             if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
334                 self.grapheme_cat_cache = gr::grapheme_category(ch);
335             }
336             self.grapheme_cat_cache.2
337         }
338     }
339 
340     // Not sure I'm gonna keep this, the advantage over new() seems thin.
341 
342     /// Set the cursor to a new location in the same string.
343     ///
344     /// ```rust
345     /// # use unicode_segmentation::GraphemeCursor;
346     /// let s = "abcd";
347     /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
348     /// assert_eq!(cursor.cur_cursor(), 0);
349     /// cursor.set_cursor(2);
350     /// assert_eq!(cursor.cur_cursor(), 2);
351     /// ```
set_cursor(&mut self, offset: usize)352     pub fn set_cursor(&mut self, offset: usize) {
353         if offset != self.offset {
354             self.offset = offset;
355             self.state = if offset == 0 || offset == self.len {
356                 GraphemeState::Break
357             } else {
358                 GraphemeState::Unknown
359             };
360             // reset state derived from text around cursor
361             self.cat_before = None;
362             self.cat_after = None;
363             self.ris_count = None;
364         }
365     }
366 
367     #[inline]
368     /// The current offset of the cursor. Equal to the last value provided to
369     /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
370     /// `prev_boundary()`.
371     ///
372     /// ```rust
373     /// # use unicode_segmentation::GraphemeCursor;
374     /// // Two flags (��������), each flag is two RIS codepoints, each RIS is 4 bytes.
375     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
376     /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
377     /// assert_eq!(cursor.cur_cursor(), 4);
378     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
379     /// assert_eq!(cursor.cur_cursor(), 8);
380     /// ```
cur_cursor(&self) -> usize381     pub fn cur_cursor(&self) -> usize {
382         self.offset
383     }
384 
385     /// Provide additional pre-context when it is needed to decide a boundary.
386     /// The end of the chunk must coincide with the value given in the
387     /// `GraphemeIncomplete::PreContext` request.
388     ///
389     /// ```rust
390     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
391     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
392     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
393     /// // Not enough pre-context to decide if there's a boundary between the two flags.
394     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
395     /// // Provide one more Regional Indicator Symbol of pre-context
396     /// cursor.provide_context(&flags[4..8], 4);
397     /// // Still not enough context to decide.
398     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
399     /// // Provide additional requested context.
400     /// cursor.provide_context(&flags[0..4], 0);
401     /// // That's enough to decide (it always is when context goes to the start of the string)
402     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
403     /// ```
provide_context(&mut self, chunk: &str, chunk_start: usize)404     pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
405         use crate::tables::grapheme as gr;
406         assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
407         self.pre_context_offset = None;
408         if self.is_extended && chunk_start + chunk.len() == self.offset {
409             let ch = chunk.chars().rev().next().unwrap();
410             if self.grapheme_category(ch) == gr::GC_Prepend {
411                 self.decide(false); // GB9b
412                 return;
413             }
414         }
415         match self.state {
416             GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
417             GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
418             _ => {
419                 if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
420                     let ch = chunk.chars().rev().next().unwrap();
421                     self.cat_before = Some(self.grapheme_category(ch));
422                 }
423             }
424         }
425     }
426 
427     #[inline]
decide(&mut self, is_break: bool)428     fn decide(&mut self, is_break: bool) {
429         self.state = if is_break {
430             GraphemeState::Break
431         } else {
432             GraphemeState::NotBreak
433         };
434     }
435 
436     #[inline]
decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete>437     fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
438         self.decide(is_break);
439         Ok(is_break)
440     }
441 
442     #[inline]
is_boundary_result(&self) -> Result<bool, GraphemeIncomplete>443     fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
444         if self.state == GraphemeState::Break {
445             Ok(true)
446         } else if self.state == GraphemeState::NotBreak {
447             Ok(false)
448         } else if let Some(pre_context_offset) = self.pre_context_offset {
449             Err(GraphemeIncomplete::PreContext(pre_context_offset))
450         } else {
451             unreachable!("inconsistent state");
452         }
453     }
454 
455     #[inline]
handle_regional(&mut self, chunk: &str, chunk_start: usize)456     fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
457         use crate::tables::grapheme as gr;
458         let mut ris_count = self.ris_count.unwrap_or(0);
459         for ch in chunk.chars().rev() {
460             if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
461                 self.ris_count = Some(ris_count);
462                 self.decide((ris_count % 2) == 0);
463                 return;
464             }
465             ris_count += 1;
466         }
467         self.ris_count = Some(ris_count);
468         if chunk_start == 0 {
469             self.decide((ris_count % 2) == 0);
470             return;
471         }
472         self.pre_context_offset = Some(chunk_start);
473         self.state = GraphemeState::Regional;
474     }
475 
476     #[inline]
handle_emoji(&mut self, chunk: &str, chunk_start: usize)477     fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
478         use crate::tables::grapheme as gr;
479         let mut iter = chunk.chars().rev();
480         if let Some(ch) = iter.next() {
481             if self.grapheme_category(ch) != gr::GC_ZWJ {
482                 self.decide(true);
483                 return;
484             }
485         }
486         for ch in iter {
487             match self.grapheme_category(ch) {
488                 gr::GC_Extend => (),
489                 gr::GC_Extended_Pictographic => {
490                     self.decide(false);
491                     return;
492                 }
493                 _ => {
494                     self.decide(true);
495                     return;
496                 }
497             }
498         }
499         if chunk_start == 0 {
500             self.decide(true);
501             return;
502         }
503         self.pre_context_offset = Some(chunk_start);
504         self.state = GraphemeState::Emoji;
505     }
506 
507     #[inline]
508     /// Determine whether the current cursor location is a grapheme cluster boundary.
509     /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
510     /// the length of `chunk` is not equal to `len` on creation, then this method
511     /// may return `GraphemeIncomplete::PreContext`. The caller should then
512     /// call `provide_context` with the requested chunk, then retry calling this
513     /// method.
514     ///
515     /// For partial chunks, if the cursor is not at the beginning or end of the
516     /// string, the chunk should contain at least the codepoint following the cursor.
517     /// If the string is nonempty, the chunk must be nonempty.
518     ///
519     /// All calls should have consistent chunk contents (ie, if a chunk provides
520     /// content for a given slice, all further chunks covering that slice must have
521     /// the same content for it).
522     ///
523     /// ```rust
524     /// # use unicode_segmentation::GraphemeCursor;
525     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
526     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
527     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
528     /// cursor.set_cursor(12);
529     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
530     /// ```
is_boundary( &mut self, chunk: &str, chunk_start: usize, ) -> Result<bool, GraphemeIncomplete>531     pub fn is_boundary(
532         &mut self,
533         chunk: &str,
534         chunk_start: usize,
535     ) -> Result<bool, GraphemeIncomplete> {
536         use crate::tables::grapheme as gr;
537         if self.state == GraphemeState::Break {
538             return Ok(true);
539         }
540         if self.state == GraphemeState::NotBreak {
541             return Ok(false);
542         }
543         if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
544             if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
545                 return Err(GraphemeIncomplete::InvalidOffset);
546             }
547         }
548         if let Some(pre_context_offset) = self.pre_context_offset {
549             return Err(GraphemeIncomplete::PreContext(pre_context_offset));
550         }
551         let offset_in_chunk = self.offset - chunk_start;
552         if self.cat_after.is_none() {
553             let ch = chunk[offset_in_chunk..].chars().next().unwrap();
554             self.cat_after = Some(self.grapheme_category(ch));
555         }
556         if self.offset == chunk_start {
557             let mut need_pre_context = true;
558             match self.cat_after.unwrap() {
559                 gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
560                 gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
561                 _ => need_pre_context = self.cat_before.is_none(),
562             }
563             if need_pre_context {
564                 self.pre_context_offset = Some(chunk_start);
565                 return Err(GraphemeIncomplete::PreContext(chunk_start));
566             }
567         }
568         if self.cat_before.is_none() {
569             let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
570             self.cat_before = Some(self.grapheme_category(ch));
571         }
572         match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
573             PairResult::NotBreak => return self.decision(false),
574             PairResult::Break => return self.decision(true),
575             PairResult::Extended => {
576                 let is_extended = self.is_extended;
577                 return self.decision(!is_extended);
578             }
579             PairResult::Regional => {
580                 if let Some(ris_count) = self.ris_count {
581                     return self.decision((ris_count % 2) == 0);
582                 }
583                 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
584                 self.is_boundary_result()
585             }
586             PairResult::Emoji => {
587                 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
588                 self.is_boundary_result()
589             }
590         }
591     }
592 
    #[inline]
    /// Find the next boundary after the current cursor position. Only a part of
    /// the string need be supplied. If the chunk is incomplete, then this
    /// method might return `GraphemeIncomplete::PreContext` or
    /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
    /// call `provide_context` with the requested chunk, then retry. In the
    /// latter case, the caller should provide the chunk following the one
    /// given, then retry.
    ///
    /// Returns `Ok(None)` when the cursor is already at the end of the string.
    /// On success the cursor is left at the returned offset.
    ///
    /// See `is_boundary` for expectations on the provided chunk.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
    /// ```
    ///
    /// And an example that uses partial strings:
    ///
    /// ```rust
    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
    /// let s = "abcd";
    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
    /// ```
    pub fn next_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<Option<usize>, GraphemeIncomplete> {
        // Nothing lies beyond the end of the string.
        if self.offset == self.len {
            return Ok(None);
        }
        // Walk the codepoints of the chunk starting at the cursor. The `unwrap`
        // panics if the chunk holds no codepoint at the cursor position.
        let mut iter = chunk[self.offset - chunk_start..].chars();
        let mut ch = iter.next().unwrap();
        loop {
            if self.resuming {
                // A previous call was suspended at this offset: do not advance
                // again, just make sure the following category is known.
                if self.cat_after.is_none() {
                    self.cat_after = Some(self.grapheme_category(ch));
                }
            } else {
                // Step over `ch`; what was after the cursor is now before it.
                self.offset += ch.len_utf8();
                self.state = GraphemeState::Unknown;
                self.cat_before = self.cat_after.take();
                if self.cat_before.is_none() {
                    self.cat_before = Some(self.grapheme_category(ch));
                }
                // Keep the count of RIS preceding the cursor in step: extend a
                // known count, or restart at zero after a non-RIS codepoint.
                if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
                    self.ris_count = self.ris_count.map(|c| c + 1);
                } else {
                    self.ris_count = Some(0);
                }
                if let Some(next_ch) = iter.next() {
                    ch = next_ch;
                    self.cat_after = Some(self.grapheme_category(ch));
                } else if self.offset == self.len {
                    // The end of the string is always a boundary.
                    self.decide(true);
                } else {
                    // The chunk ran out before the string did; suspend here.
                    self.resuming = true;
                    return Err(GraphemeIncomplete::NextChunk);
                }
            }
            // Stay in the resuming state while `is_boundary` runs, so that an
            // error propagated by `?` leaves the cursor ready to be resumed
            // without advancing a second time.
            self.resuming = true;
            if self.is_boundary(chunk, chunk_start)? {
                self.resuming = false;
                return Ok(Some(self.offset));
            }
            self.resuming = false;
        }
    }
671 
    /// Find the previous boundary before the current cursor position. Only a part
    /// of the string need be supplied. If the chunk is incomplete, then this
    /// method might return `GraphemeIncomplete::PreContext` or
    /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
    /// call `provide_context` with the requested chunk, then retry. In the
    /// latter case, the caller should provide the chunk preceding the one
    /// given, then retry.
    ///
    /// Returns `Ok(None)` when the cursor is already at the start of the string.
    /// On success the cursor is left at the returned offset.
    ///
    /// See `is_boundary` for expectations on the provided chunk.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
    /// ```
    ///
    /// And an example that uses partial strings (note the exact return is not
    /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
    ///
    /// ```rust
    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
    /// let s = "abcd";
    /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
    /// ```
    pub fn prev_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<Option<usize>, GraphemeIncomplete> {
        // Nothing lies before the start of the string.
        if self.offset == 0 {
            return Ok(None);
        }
        // The cursor sits at the very start of this chunk, so the text before
        // it has to come from the preceding chunk.
        if self.offset == chunk_start {
            return Err(GraphemeIncomplete::PrevChunk);
        }
        // Walk the codepoints of the chunk backwards, starting just before the cursor.
        let mut iter = chunk[..self.offset - chunk_start].chars().rev();
        let mut ch = iter.next().unwrap();
        loop {
            if self.offset == chunk_start {
                // Reached the start of the chunk without a verdict; suspend here.
                self.resuming = true;
                return Err(GraphemeIncomplete::PrevChunk);
            }
            if self.resuming {
                // A previous call was suspended at this offset: do not move
                // again, just record the category of the preceding codepoint.
                self.cat_before = Some(self.grapheme_category(ch));
            } else {
                // Step back over `ch`; what was before the cursor is now after it.
                self.offset -= ch.len_utf8();
                self.cat_after = self.cat_before.take();
                self.state = GraphemeState::Unknown;
                // One fewer codepoint precedes the cursor. A count of zero
                // cannot be decremented, so it becomes unknown instead.
                if let Some(ris_count) = self.ris_count {
                    self.ris_count = if ris_count > 0 {
                        Some(ris_count - 1)
                    } else {
                        None
                    };
                }
                if let Some(prev_ch) = iter.next() {
                    ch = prev_ch;
                    self.cat_before = Some(self.grapheme_category(ch));
                } else if self.offset == 0 {
                    // The start of the string is always a boundary.
                    self.decide(true);
                } else {
                    // The chunk ran out before the string did. Remember the
                    // category of the codepoint just stepped over (now after
                    // the cursor), since the preceding chunk will not contain it.
                    self.resuming = true;
                    self.cat_after = Some(self.grapheme_category(ch));
                    return Err(GraphemeIncomplete::PrevChunk);
                }
            }
            // Stay in the resuming state while `is_boundary` runs, so that an
            // error propagated by `?` leaves the cursor ready to be resumed
            // without moving a second time.
            self.resuming = true;
            if self.is_boundary(chunk, chunk_start)? {
                self.resuming = false;
                return Ok(Some(self.offset));
            }
            self.resuming = false;
        }
    }
755 }
756 
// A Regional Indicator boundary queried with a chunk that starts mid-run must
// ask for pre-context, and resolve once the rest of the run is provided.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let text = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let tail = &text[4..];
    let mut cursor = GraphemeCursor::new(8, text.len(), true);

    let first_try = cursor.is_boundary(tail, 4);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(&text[..4], 0);
    let second_try = cursor.is_boundary(tail, 4);
    assert_eq!(second_try, Ok(true));
}
768 
// A cursor at the start of a chunk must ask for pre-context; once the CR is
// provided, the position between CR and LF is not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let text = "\r\n";
    let tail = &text[1..];
    let mut cursor = GraphemeCursor::new(1, text.len(), true);

    let first_try = cursor.is_boundary(tail, 1);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(&text[..1], 0);
    let second_try = cursor.is_boundary(tail, 1);
    assert_eq!(second_try, Ok(false));
}
780 
// Searching backwards from inside a chunk must request the previous chunk and
// then resume correctly when it is supplied.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let suspended = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(suspended, Err(GraphemeIncomplete::PrevChunk));

    let resumed = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(resumed, Ok(Some(2)));
}
791 
// Searching backwards from the very start of a chunk must request the previous
// chunk immediately, and then find the boundary inside it.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);

    let suspended = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(suspended, Err(GraphemeIncomplete::PrevChunk));

    let resumed = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(resumed, Ok(Some(1)));
}
802