//! Functionality for finding words.
//!
//! In order to wrap text, we need to know where the legal break
//! points are, i.e., where the words of the text are. This means that
//! we need to define what a "word" is.
//!
//! A simple approach is to split the text on whitespace, but this
//! does not work for East-Asian languages such as Chinese or
//! Japanese where there are no spaces between words. Breaking a long
//! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
//!
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and
//! its variants for more information.
16 
17 #[cfg(feature = "unicode-linebreak")]
18 use crate::core::skip_ansi_escape_sequence;
19 use crate::core::Word;
20 
21 /// Describes where words occur in a line of text.
22 ///
23 /// The simplest approach is say that words are separated by one or
24 /// more ASCII spaces (`' '`). This works for Western languages
25 /// without emojis. A more complex approach is to use the Unicode line
26 /// breaking algorithm, which finds break points in non-ASCII text.
27 ///
28 /// The line breaks occur between words, please see
29 /// [`WordSplitter`](crate::WordSplitter) for options of how to handle
30 /// hyphenation of individual words.
31 ///
32 /// # Examples
33 ///
34 /// ```
35 /// use textwrap::core::Word;
36 /// use textwrap::WordSeparator::AsciiSpace;
37 ///
38 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
39 /// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
40 /// ```
41 #[derive(Clone, Copy)]
42 pub enum WordSeparator {
43     /// Find words by splitting on runs of `' '` characters.
44     ///
45     /// # Examples
46     ///
47     /// ```
48     /// use textwrap::core::Word;
49     /// use textwrap::WordSeparator::AsciiSpace;
50     ///
51     /// let words = AsciiSpace.find_words("Hello   World!").collect::<Vec<_>>();
52     /// assert_eq!(words, vec![Word::from("Hello   "),
53     ///                        Word::from("World!")]);
54     /// ```
55     AsciiSpace,
56 
57     /// Split `line` into words using Unicode break properties.
58     ///
59     /// This word separator uses the Unicode line breaking algorithm
60     /// described in [Unicode Standard Annex
61     /// #14](https://www.unicode.org/reports/tr14/) to find legal places
62     /// to break lines. There is a small difference in that the U+002D
63     /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
64     /// to allow a line break at a hyphen, use
65     /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
66     /// Soft hyphens are not currently supported.
67     ///
68     /// # Examples
69     ///
70     /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
71     /// breaking algorithm will find line break opportunities between
72     /// some characters with no intervening whitespace:
73     ///
74     /// ```
75     /// #[cfg(feature = "unicode-linebreak")] {
76     /// use textwrap::core::Word;
77     /// use textwrap::WordSeparator::UnicodeBreakProperties;
78     ///
79     /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: ����").collect::<Vec<_>>(),
80     ///            vec![Word::from("Emojis: "),
81     ///                 Word::from("��"),
82     ///                 Word::from("��")]);
83     ///
84     /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
85     ///            vec![Word::from("CJK: "),
86     ///                 Word::from("你"),
87     ///                 Word::from("好")]);
88     /// }
89     /// ```
90     ///
91     /// A U+2060 (Word Joiner) character can be inserted if you want to
92     /// manually override the defaults and keep the characters together:
93     ///
94     /// ```
95     /// #[cfg(feature = "unicode-linebreak")] {
96     /// use textwrap::core::Word;
97     /// use textwrap::WordSeparator::UnicodeBreakProperties;
98     ///
99     /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: ��\u{2060}��").collect::<Vec<_>>(),
100     ///            vec![Word::from("Emojis: "),
101     ///                 Word::from("��\u{2060}��")]);
102     /// }
103     /// ```
104     ///
105     /// The Unicode line breaking algorithm will also automatically
106     /// suppress break breaks around certain punctuation characters::
107     ///
108     /// ```
109     /// #[cfg(feature = "unicode-linebreak")] {
110     /// use textwrap::core::Word;
111     /// use textwrap::WordSeparator::UnicodeBreakProperties;
112     ///
113     /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
114     ///            vec![Word::from("[ foo ] "),
115     ///                 Word::from("bar !")]);
116     /// }
117     /// ```
118     #[cfg(feature = "unicode-linebreak")]
119     UnicodeBreakProperties,
120 
121     /// Find words using a custom word separator
122     Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
123 }
124 
125 impl PartialEq for WordSeparator {
126     /// Compare two word separators.
127     ///
128     /// ```
129     /// use textwrap::WordSeparator;
130     ///
131     /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace);
132     /// #[cfg(feature = "unicode-linebreak")] {
133     ///     assert_eq!(WordSeparator::UnicodeBreakProperties,
134     ///                WordSeparator::UnicodeBreakProperties);
135     /// }
136     /// ```
137     ///
138     /// Note that `WordSeparator::Custom` values never compare equal:
139     ///
140     /// ```
141     /// use textwrap::WordSeparator;
142     /// use textwrap::core::Word;
143     /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> {
144     ///     Box::new(line.split_inclusive(' ').map(Word::from))
145     /// }
146     /// assert_ne!(WordSeparator::Custom(word_separator),
147     ///            WordSeparator::Custom(word_separator));
148     /// ```
eq(&self, other: &Self) -> bool149     fn eq(&self, other: &Self) -> bool {
150         match (self, other) {
151             (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true,
152             #[cfg(feature = "unicode-linebreak")]
153             (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true,
154             (_, _) => false,
155         }
156     }
157 }
158 
159 impl std::fmt::Debug for WordSeparator {
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result160     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
161         match self {
162             WordSeparator::AsciiSpace => f.write_str("AsciiSpace"),
163             #[cfg(feature = "unicode-linebreak")]
164             WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"),
165             WordSeparator::Custom(_) => f.write_str("Custom(...)"),
166         }
167     }
168 }
169 
170 impl WordSeparator {
171     /// Create a new word separator.
172     ///
173     /// The best available algorithm is used by default, i.e.,
174     /// [`WordSeparator::UnicodeBreakProperties`] if available,
175     /// otherwise [`WordSeparator::AsciiSpace`].
new() -> Self176     pub const fn new() -> Self {
177         #[cfg(feature = "unicode-linebreak")]
178         {
179             WordSeparator::UnicodeBreakProperties
180         }
181 
182         #[cfg(not(feature = "unicode-linebreak"))]
183         {
184             WordSeparator::AsciiSpace
185         }
186     }
187 
188     // This function should really return impl Iterator<Item = Word>, but
189     // this isn't possible until Rust supports higher-kinded types:
190     // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
191     /// Find all words in `line`.
find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>192     pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
193         match self {
194             WordSeparator::AsciiSpace => find_words_ascii_space(line),
195             #[cfg(feature = "unicode-linebreak")]
196             WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
197             WordSeparator::Custom(func) => func(line),
198         }
199     }
200 }
201 
find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>202 fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
203     let mut start = 0;
204     let mut in_whitespace = false;
205     let mut char_indices = line.char_indices();
206 
207     Box::new(std::iter::from_fn(move || {
208         for (idx, ch) in char_indices.by_ref() {
209             if in_whitespace && ch != ' ' {
210                 let word = Word::from(&line[start..idx]);
211                 start = idx;
212                 in_whitespace = ch == ' ';
213                 return Some(word);
214             }
215 
216             in_whitespace = ch == ' ';
217         }
218 
219         if start < line.len() {
220             let word = Word::from(&line[start..]);
221             start = line.len();
222             return Some(word);
223         }
224 
225         None
226     }))
227 }
228 
/// Strip all ANSI escape sequences from `text`.
///
/// Returns a new `String` holding every character of `text` for
/// which `skip_ansi_escape_sequence` reports `false`.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    let mut result = String::with_capacity(text.len());

    // A `while let` loop is used instead of `for` because the helper
    // advances the same iterator when it consumes an escape sequence.
    let mut chars = text.chars();
    while let Some(ch) = chars.next() {
        if !skip_ansi_escape_sequence(ch, &mut chars) {
            result.push(ch);
        }
    }

    result
}
244 
/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
251 
/// Find words in `line` using the Unicode line breaking algorithm.
///
/// ANSI escape sequences are ignored when looking for break
/// opportunities. The returned words are slices of the original
/// `line`, so the escape sequences are still present in them.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || {
        let (orig_idx, ch) = char_indices.next()?;
        let stripped_idx = last_stripped_idx;
        // Only characters outside escape sequences advance the
        // position in the stripped string.
        if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
            last_stripped_idx += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    // Collected into a `Vec` so the returned iterator does not borrow
    // the local `stripped` string.
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|&(idx, _)| {
            // We suppress breaks at ‘-’ since we want to control
            // this via the WordSplitter. Soft hyphens are currently
            // not supported since we require all `Word` fragments to
            // be continuous in the input string. Other breaks should
            // be fine!
            !matches!(
                stripped[..idx].chars().next_back(),
                Some('-') | Some(SHY)
            )
        })
        .collect::<Vec<_>>()
        .into_iter();

    // Remove final break opportunity, we will add it below using
    // &line[start..]; This ensures that we correctly include a
    // trailing ANSI escape sequence.
    opportunities.next_back();

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        for (idx, _) in opportunities.by_ref() {
            // Translate the break position in the stripped string
            // back to a byte offset in the original string.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // Whatever remains after the last break is the final word.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    }))
}
317 
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Turn a list of string slices into the corresponding `Word`s.
    fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
        words.into_iter().map(Word::from).collect()
    }

    // Generate one test for `AsciiSpace` and one for
    // `UnicodeBreakProperties`. Each case is a
    // `[line, expected ASCII words, expected Unicode words]` triple.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        ["    ", ["    "], ["    "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo   ", ["foo   "], ["foo   "]]
    );

    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        ["   foo", ["   ", "foo"], ["   ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    // Colour escape sequences must stay attached to the word they
    // surround for both separators.
    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    // Escape sequences in the middle of a word must not introduce a
    // break.
    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }

    // `new` picks the Unicode separator when the feature is enabled
    // and falls back to `AsciiSpace` otherwise.
    #[test]
    fn word_separator_new() {
        #[cfg(feature = "unicode-linebreak")]
        assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));

        #[cfg(not(feature = "unicode-linebreak"))]
        assert!(matches!(WordSeparator::new(), AsciiSpace));
    }
}
482