1 //
2 // https://vt100.net/emu/dec_ansi_parser
3 //
// The parser is heavily inspired by the vte (https://crates.io/crates/vte) crate.
// We tried to use that crate, but it doesn't work for the opposite direction
// (terminal -> sequence), because there are a couple of exceptions we have to handle
// and it doesn't make much sense to add them to the vte crate. One example is the Esc
// key: we need to know whether additional input is available, and only then can we
// decide whether the Esc char is dispatched immediately (the user hit just the Esc key)
// or whether it starts an escape/CSI/... sequence.
11 //
/// Capacity of the CSI parameter buffer (`Engine::parameters`).
///
/// Parameters received after the buffer is full are not stored; they are only counted
/// in `Engine::ignored_parameters_count`.
const MAX_PARAMETERS: usize = 30;
/// Value a CSI parameter has when no digits were received for it (for example the
/// empty parameters in `ESC [ ; m`).
const DEFAULT_PARAMETER_VALUE: u64 = 0;
/// Capacity of the UTF-8 buffer (`Engine::utf8_points`).
///
/// Despite the name, the buffer stores the raw bytes of a single encoded character;
/// the longest sequence the engine starts collecting is 4 bytes.
const MAX_UTF8_CODE_POINTS: usize = 4;
15 
/// A parser engine state.
///
/// All these variant names come from the
/// [A parser for DEC’s ANSI-compatible video terminals](https://vt100.net/emu/dec_ansi_parser)
/// description, except for `Utf8`, which is specific to this engine.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum State {
    /// Initial state. No sequence is in progress.
    Ground,
    /// Escape sequence started.
    ///
    /// `Esc` received with a flag that there's more data available.
    Escape,
    /// Escape sequence and we're collecting intermediates (bytes `0x20..=0x2F`).
    ///
    /// # Notes
    ///
    /// This implementation doesn't collect intermediates. It just handles the state
    /// to distinguish between (im)proper sequences.
    EscapeIntermediate,
    /// CSI sequence started.
    ///
    /// `Esc` followed by the `[` received.
    CsiEntry,
    /// CSI sequence should be consumed, but not dispatched.
    ///
    /// Bytes are swallowed until a final character (`0x40..=0x7E`) or an invalid
    /// byte returns the engine to `Ground`.
    CsiIgnore,
    /// CSI sequence and we're collecting parameters (digits separated by `;`).
    CsiParameter,
    /// CSI sequence and we're collecting intermediates (bytes `0x20..=0x2F`).
    ///
    /// # Notes
    ///
    /// This implementation doesn't collect intermediates. It just handles the state
    /// to distinguish between (im)proper sequences.
    CsiIntermediate,
    /// Possible UTF-8 sequence: a lead byte was received and we're collecting the
    /// remaining bytes of the encoded character.
    Utf8,
}
54 
/// Receiver of everything the [`Engine`] recognizes in the input byte stream.
pub(crate) trait Provide {
    /// Called for a single character that is not part of an escape/CSI sequence.
    ///
    /// This includes a lone `Esc` (`'\x1B'`), control characters executed in the middle
    /// of a sequence and decoded multi-byte UTF-8 characters.
    fn provide_char(&mut self, ch: char);

    /// Called when an escape sequence is complete; `ch` is its final character.
    ///
    /// Intermediate bytes of the sequence are not reported.
    fn provide_esc_sequence(&mut self, ch: char);

    /// Called when a CSI sequence is complete.
    ///
    /// * `parameters` - the stored parameters, at most `MAX_PARAMETERS` of them,
    /// * `ignored_count` - number of parameters that did not fit and were dropped,
    /// * `ch` - the final character of the sequence.
    fn provide_csi_sequence(&mut self, parameters: &[u64], ignored_count: usize, ch: char);
}
62 
/// Byte-at-a-time parser state machine.
///
/// Feed it input through `Engine::advance`; recognized characters and sequences are
/// handed to a [`Provide`] implementation.
pub(crate) struct Engine {
    /// Parameters of the CSI sequence in progress; only the first `parameters_count`
    /// entries are meaningful.
    parameters: [u64; MAX_PARAMETERS],
    /// Number of valid entries in `parameters`.
    parameters_count: usize,
    /// Value of the parameter currently being accumulated from digits.
    parameter: u64,
    /// Number of parameters dropped because `parameters` was already full.
    ignored_parameters_count: usize,
    /// Current state of the state machine.
    state: State,
    /// Raw bytes of the UTF-8 encoded character in progress; only the first
    /// `utf8_points_count` entries are meaningful.
    utf8_points: [u8; MAX_UTF8_CODE_POINTS],
    /// Number of bytes collected in `utf8_points` so far.
    utf8_points_count: usize,
    /// Total number of bytes the character in progress is expected to have, as
    /// announced by its lead byte.
    utf8_points_expected_count: usize,
}
73 
74 impl Default for Engine {
default() -> Self75     fn default() -> Self {
76         Engine {
77             parameters: [DEFAULT_PARAMETER_VALUE; MAX_PARAMETERS],
78             parameters_count: 0,
79             parameter: DEFAULT_PARAMETER_VALUE,
80             ignored_parameters_count: 0,
81             state: State::Ground,
82             utf8_points: [0; MAX_UTF8_CODE_POINTS],
83             utf8_points_count: 0,
84             utf8_points_expected_count: 0,
85         }
86     }
87 }
88 
89 impl Engine {
set_state(&mut self, state: State)90     fn set_state(&mut self, state: State) {
91         if let State::Ground = state {
92             self.parameters_count = 0;
93             self.parameter = DEFAULT_PARAMETER_VALUE;
94             self.ignored_parameters_count = 0;
95             self.utf8_points_count = 0;
96             self.utf8_points_expected_count = 0;
97         }
98         self.state = state;
99     }
100 
store_parameter(&mut self)101     fn store_parameter(&mut self) {
102         if self.parameters_count < MAX_PARAMETERS {
103             self.parameters[self.parameters_count] = self.parameter;
104             self.parameters_count += 1;
105         } else {
106             self.ignored_parameters_count += 1;
107         }
108         self.parameter = DEFAULT_PARAMETER_VALUE;
109     }
110 
handle_possible_esc(&mut self, provider: &mut dyn Provide, byte: u8, more: bool) -> bool111     fn handle_possible_esc(&mut self, provider: &mut dyn Provide, byte: u8, more: bool) -> bool {
112         if byte != 0x1B {
113             return false;
114         }
115 
116         match (self.state, more) {
117             // More input means possible Esc sequence, just switch state and wait
118             (State::Ground, true) => self.set_state(State::Escape),
119 
120             // No more input means Esc key, dispatch it
121             (State::Ground, false) => provider.provide_char('\x1B'),
122 
123             // More input means possible Esc sequence, dispatch the previous Esc char
124             (State::Escape, true) => provider.provide_char('\x1B'),
125 
126             // No more input means Esc key, dispatch the previous & current Esc char
127             (State::Escape, false) => {
128                 provider.provide_char('\x1B');
129                 provider.provide_char('\x1B');
130                 self.set_state(State::Ground);
131             }
132 
133             // Discard any state
134             // More input means possible Esc sequence
135             (_, true) => self.set_state(State::Escape),
136 
137             // Discard any state
138             // No more input means Esc key, dispatch it
139             (_, false) => {
140                 provider.provide_char('\x1B');
141                 self.set_state(State::Ground);
142             }
143         }
144 
145         true
146     }
147 
handle_possible_utf8_code_points(&mut self, provider: &mut dyn Provide, byte: u8) -> bool148     fn handle_possible_utf8_code_points(&mut self, provider: &mut dyn Provide, byte: u8) -> bool {
149         if byte & 0b1000_0000 == 0b0000_0000 {
150             provider.provide_char(byte as char);
151             true
152         } else if byte & 0b1110_0000 == 0b1100_0000 {
153             self.utf8_points_count = 1;
154             self.utf8_points[0] = byte;
155             self.utf8_points_expected_count = 2;
156             self.set_state(State::Utf8);
157             true
158         } else if byte & 0b1111_0000 == 0b1110_0000 {
159             self.utf8_points_count = 1;
160             self.utf8_points[0] = byte;
161             self.utf8_points_expected_count = 3;
162             self.set_state(State::Utf8);
163             true
164         } else if byte & 0b1111_1000 == 0b1111_0000 {
165             self.utf8_points_count = 1;
166             self.utf8_points[0] = byte;
167             self.utf8_points_expected_count = 4;
168             self.set_state(State::Utf8);
169             true
170         } else {
171             false
172         }
173     }
174 
advance_ground_state(&mut self, provider: &mut dyn Provide, byte: u8)175     fn advance_ground_state(&mut self, provider: &mut dyn Provide, byte: u8) {
176         if self.handle_possible_utf8_code_points(provider, byte) {
177             return;
178         }
179 
180         match byte {
181             0x1B => unreachable!(),
182 
183             // Execute
184             0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
185 
186             // Print
187             0x20..=0x7F => provider.provide_char(byte as char),
188 
189             _ => {}
190         };
191     }
192 
advance_escape_state(&mut self, provider: &mut dyn Provide, byte: u8)193     fn advance_escape_state(&mut self, provider: &mut dyn Provide, byte: u8) {
194         match byte {
195             0x1B => unreachable!(),
196 
197             // Intermediate bytes to collect
198             0x20..=0x2F => {
199                 self.set_state(State::EscapeIntermediate);
200             }
201 
202             // Escape followed by '[' (0x5B)
203             //   -> CSI sequence start
204             0x5B => self.set_state(State::CsiEntry),
205 
206             // Escape sequence final character
207             0x30..=0x4F | 0x51..=0x57 | 0x59 | 0x5A | 0x5C | 0x60..=0x7E => {
208                 provider.provide_esc_sequence(byte as char);
209                 self.set_state(State::Ground);
210             }
211 
212             // Execute
213             0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
214 
215             // TODO Does it mean we should ignore the whole sequence?
216             // Ignore
217             0x7F => {}
218 
219             // Other bytes are considered as invalid -> cancel whatever we have
220             _ => self.set_state(State::Ground),
221         };
222     }
223 
advance_escape_intermediate_state(&mut self, provider: &mut dyn Provide, byte: u8)224     fn advance_escape_intermediate_state(&mut self, provider: &mut dyn Provide, byte: u8) {
225         match byte {
226             0x1B => unreachable!(),
227 
228             // Intermediate bytes to collect
229             0x20..=0x2F => {}
230 
231             // Escape followed by '[' (0x5B)
232             //   -> CSI sequence start
233             0x5B => self.set_state(State::CsiEntry),
234 
235             // Escape sequence final character
236             0x30..=0x5A | 0x5C..=0x7E => {
237                 provider.provide_esc_sequence(byte as char);
238                 self.set_state(State::Ground);
239             }
240 
241             // Execute
242             0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
243 
244             // TODO Does it mean we should ignore the whole sequence?
245             // Ignore
246             0x7F => {}
247 
248             // Other bytes are considered as invalid -> cancel whatever we have
249             _ => self.set_state(State::Ground),
250         };
251     }
252 
advance_csi_entry_state(&mut self, provider: &mut dyn Provide, byte: u8)253     fn advance_csi_entry_state(&mut self, provider: &mut dyn Provide, byte: u8) {
254         match byte {
255             0x1B => unreachable!(),
256 
257             // Semicolon = parameter delimiter
258             0x3B => {
259                 self.store_parameter();
260                 self.set_state(State::CsiParameter);
261             }
262 
263             // '0' ..= '9' = parameter value
264             0x30..=0x39 => {
265                 self.parameter = (byte as u64) - 0x30;
266                 self.set_state(State::CsiParameter);
267             }
268 
269             0x3A => self.set_state(State::CsiIgnore),
270 
271             // CSI sequence final character
272             //   -> dispatch CSI sequence
273             0x40..=0x7E => {
274                 provider.provide_csi_sequence(
275                     &self.parameters[..self.parameters_count],
276                     self.ignored_parameters_count,
277                     byte as char,
278                 );
279 
280                 self.set_state(State::Ground);
281             }
282 
283             // Execute
284             0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
285 
286             // TODO Does it mean we should ignore the whole sequence?
287             // Ignore
288             0x7F => {}
289 
290             // Collect rest as parameters
291             _ => {
292                 self.parameter = byte as u64;
293                 self.store_parameter();
294             }
295         };
296     }
297 
advance_csi_ignore_state(&mut self, provider: &mut dyn Provide, byte: u8)298     fn advance_csi_ignore_state(&mut self, provider: &mut dyn Provide, byte: u8) {
299         match byte {
300             0x1B => unreachable!(),
301 
302             // Execute
303             0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
304 
305             // TODO Does it mean we should ignore the whole sequence?
306             // Ignore
307             0x20..=0x3F | 0x7F => {}
308 
309             0x40..=0x7E => self.set_state(State::Ground),
310 
311             // Other bytes are considered as invalid -> cancel whatever we have
312             _ => self.set_state(State::Ground),
313         };
314     }
315 
advance_csi_parameter_state(&mut self, provider: &mut dyn Provide, byte: u8)316     fn advance_csi_parameter_state(&mut self, provider: &mut dyn Provide, byte: u8) {
317         match byte {
318             0x1B => unreachable!(),
319 
320             // '0' ..= '9' = parameter value
321             0x30..=0x39 => {
322                 self.parameter = self.parameter.saturating_mul(10);
323                 self.parameter = self.parameter.saturating_add((byte as u64) - 0x30);
324             }
325 
326             // Semicolon = parameter delimiter
327             0x3B => self.store_parameter(),
328 
329             // CSI sequence final character
330             //   -> dispatch CSI sequence
331             0x40..=0x7E => {
332                 self.store_parameter();
333                 provider.provide_csi_sequence(
334                     &self.parameters[..self.parameters_count],
335                     self.ignored_parameters_count,
336                     byte as char,
337                 );
338 
339                 self.set_state(State::Ground);
340             }
341 
342             // Intermediates to collect
343             0x20..=0x2F => {
344                 self.store_parameter();
345                 self.set_state(State::CsiIntermediate);
346             }
347 
348             // Ignore
349             0x3A | 0x3C..=0x3F => self.set_state(State::CsiIgnore),
350 
351             // Execute
352             0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
353 
354             // TODO Does it mean we should ignore the whole sequence?
355             // Ignore
356             0x7F => {}
357 
358             // Other bytes are considered as invalid -> cancel whatever we have
359             _ => self.set_state(State::Ground),
360         };
361     }
362 
advance_csi_intermediate_state(&mut self, provider: &mut dyn Provide, byte: u8)363     fn advance_csi_intermediate_state(&mut self, provider: &mut dyn Provide, byte: u8) {
364         match byte {
365             0x1B => unreachable!(),
366 
367             // Intermediates to collect
368             0x20..=0x2F => {}
369 
370             // CSI sequence final character
371             //   -> dispatch CSI sequence
372             0x40..=0x7E => {
373                 provider.provide_csi_sequence(
374                     &self.parameters[..self.parameters_count],
375                     self.ignored_parameters_count,
376                     byte as char,
377                 );
378 
379                 self.set_state(State::Ground);
380             }
381 
382             // Execute
383             0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
384 
385             // TODO Does it mean we should ignore the whole sequence?
386             // Ignore
387             0x7F => {}
388 
389             // Other bytes are considered as invalid -> cancel whatever we have
390             _ => self.set_state(State::Ground),
391         }
392     }
393 
advance_utf8_state(&mut self, provider: &mut dyn Provide, byte: u8)394     fn advance_utf8_state(&mut self, provider: &mut dyn Provide, byte: u8) {
395         if byte & 0b1100_0000 != 0b1000_0000 {
396             self.set_state(State::Ground);
397             return;
398         }
399 
400         self.utf8_points[self.utf8_points_count] = byte;
401         self.utf8_points_count += 1;
402 
403         if self.utf8_points_count == self.utf8_points_expected_count {
404             if let Some(ch) = std::str::from_utf8(&self.utf8_points[..self.utf8_points_count])
405                 .ok()
406                 .and_then(|s| s.chars().next())
407             {
408                 provider.provide_char(ch);
409             }
410             self.set_state(State::Ground);
411         }
412     }
413 
advance(&mut self, provider: &mut dyn Provide, byte: u8, more: bool)414     pub(crate) fn advance(&mut self, provider: &mut dyn Provide, byte: u8, more: bool) {
415         // eprintln!("advance: {:?} {} {}", self.state, byte, more);
416 
417         if self.handle_possible_esc(provider, byte, more) {
418             return;
419         }
420 
421         match self.state {
422             State::Ground => self.advance_ground_state(provider, byte),
423             State::Escape => self.advance_escape_state(provider, byte),
424             State::EscapeIntermediate => self.advance_escape_intermediate_state(provider, byte),
425             State::CsiEntry => self.advance_csi_entry_state(provider, byte),
426             State::CsiIgnore => self.advance_csi_ignore_state(provider, byte),
427             State::CsiParameter => self.advance_csi_parameter_state(provider, byte),
428             State::CsiIntermediate => self.advance_csi_intermediate_state(provider, byte),
429             State::Utf8 => self.advance_utf8_state(provider, byte),
430         };
431     }
432 }
433 
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `Esc` is dispatched immediately when no more input is available and is
    /// postponed otherwise.
    #[test]
    fn esc_char() {
        let mut engine = Engine::default();
        let mut provider = CharProvider::default();

        // No more input means that the Esc character should be dispatched immediately
        engine.advance(&mut provider, 0x1B, false);
        assert_eq!(provider.chars, &['\x1B']);

        // There's more input so the machine should wait before dispatching Esc character
        engine.advance(&mut provider, 0x1B, true);
        assert_eq!(provider.chars, &['\x1B']);

        // Another Esc character, but no more input, machine should dispatch the postponed Esc
        // character and the new one too.
        engine.advance(&mut provider, 0x1B, false);
        assert_eq!(provider.chars, &['\x1B', '\x1B', '\x1B']);
    }

    /// Two escape sequences consisting of `Esc` and a final character only.
    #[test]
    fn esc_without_intermediates() {
        let mut engine = Engine::default();
        let mut provider = EscProvider::default();

        let input = b"\x1B0\x1B~";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.chars.len(), 2);

        assert_eq!(provider.chars[0], '0');

        assert_eq!(provider.chars[1], '~');
    }

    /// `Esc [ m` is dispatched with an empty parameter list.
    #[test]
    fn csi_without_parameters() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5Bm";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(provider.parameters[0], &[]);
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// A lone semicolon separates two parameters with the default value.
    #[test]
    fn csi_with_two_default_parameters() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5B;m";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(
            provider.parameters[0],
            &[DEFAULT_PARAMETER_VALUE, DEFAULT_PARAMETER_VALUE]
        );
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// A trailing semicolon adds one more parameter with the default value.
    #[test]
    fn csi_with_trailing_semicolon() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5B123;m";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(provider.parameters[0], &[123, DEFAULT_PARAMETER_VALUE]);
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// Exactly `MAX_PARAMETERS` parameters are all stored.
    #[test]
    fn csi_max_parameters() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5B1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30m";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(provider.parameters[0].len(), MAX_PARAMETERS);
        assert_eq!(
            provider.parameters[0],
            &[
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                24, 25, 26, 27, 28, 29, 30
            ]
        );
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// Characters encoded as one, two, three and four UTF-8 bytes are decoded.
    #[test]
    fn test_parse_utf8_character() {
        let mut engine = Engine::default();
        let mut provider = CharProvider::default();

        advance(&mut engine, &mut provider, &[b'a'], false);
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'a');

        // U+00F1
        advance(&mut engine, &mut provider, &[0xC3, 0xB1], false);
        assert_eq!(provider.chars.len(), 2);
        assert_eq!(provider.chars[1], 'ñ');

        // U+2061
        advance(&mut engine, &mut provider, &[0xE2, 0x81, 0xA1], false);
        assert_eq!(provider.chars.len(), 3);
        assert_eq!(provider.chars[2], '\u{2061}');

        // U+1033C, written as an escape so the literal survives any re-encoding of
        // this file
        advance(&mut engine, &mut provider, &[0xF0, 0x90, 0x8C, 0xBC], false);
        assert_eq!(provider.chars.len(), 4);
        assert_eq!(provider.chars[3], '\u{1033C}');
    }

    /// Feeds `bytes` to the engine one by one.
    ///
    /// Every byte but the last one is flagged as having more input behind it; the last
    /// byte gets the caller supplied `more` flag.
    fn advance(engine: &mut Engine, provider: &mut dyn Provide, bytes: &[u8], more: bool) {
        let len = bytes.len();

        for (i, byte) in bytes.iter().enumerate() {
            engine.advance(provider, *byte, i + 1 < len || more);
        }
    }

    /// Records dispatched characters only.
    #[derive(Default)]
    struct CharProvider {
        chars: Vec<char>,
    }

    impl Provide for CharProvider {
        fn provide_char(&mut self, ch: char) {
            self.chars.push(ch);
        }

        fn provide_esc_sequence(&mut self, _ch: char) {}

        fn provide_csi_sequence(&mut self, _parameters: &[u64], _ignored_count: usize, _ch: char) {}
    }

    /// Records the parameters and the final character of dispatched CSI sequences only.
    #[derive(Default)]
    struct CsiProvider {
        parameters: Vec<Vec<u64>>,
        chars: Vec<char>,
    }

    impl Provide for CsiProvider {
        fn provide_char(&mut self, _ch: char) {}

        fn provide_esc_sequence(&mut self, _ch: char) {}

        fn provide_csi_sequence(&mut self, parameters: &[u64], _ignored_count: usize, ch: char) {
            self.parameters.push(parameters.to_vec());
            self.chars.push(ch);
        }
    }

    /// Records the final character of dispatched escape sequences only.
    #[derive(Default)]
    struct EscProvider {
        chars: Vec<char>,
    }

    impl Provide for EscProvider {
        fn provide_char(&mut self, _ch: char) {}

        fn provide_esc_sequence(&mut self, ch: char) {
            self.chars.push(ch);
        }

        fn provide_csi_sequence(&mut self, _parameters: &[u64], _ignored_count: usize, _ch: char) {}
    }
}
615