1 use std::borrow::Cow;
2 use std::char;
3 use std::ops::RangeInclusive;
4 
5 use winnow::combinator::alt;
6 use winnow::combinator::cut_err;
7 use winnow::combinator::delimited;
8 use winnow::combinator::fail;
9 use winnow::combinator::opt;
10 use winnow::combinator::peek;
11 use winnow::combinator::preceded;
12 use winnow::combinator::repeat;
13 use winnow::combinator::success;
14 use winnow::combinator::terminated;
15 use winnow::prelude::*;
16 use winnow::stream::Stream;
17 use winnow::token::any;
18 use winnow::token::none_of;
19 use winnow::token::one_of;
20 use winnow::token::tag;
21 use winnow::token::take_while;
22 use winnow::trace::trace;
23 
24 use crate::parser::error::CustomError;
25 use crate::parser::numbers::HEXDIG;
26 use crate::parser::prelude::*;
27 use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
28 
29 // ;; String
30 
31 // string = ml-basic-string / basic-string / ml-literal-string / literal-string
string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>32 pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
33     trace(
34         "string",
35         alt((
36             ml_basic_string,
37             basic_string,
38             ml_literal_string,
39             literal_string.map(Cow::Borrowed),
40         )),
41     )
42     .parse_next(input)
43 }
44 
45 // ;; Basic String
46 
47 // basic-string = quotation-mark *basic-char quotation-mark
basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>48 pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
49     trace("basic-string", |input: &mut Input<'i>| {
50         let _ = one_of(QUOTATION_MARK).parse_next(input)?;
51 
52         let mut c = Cow::Borrowed("");
53         if let Some(ci) = opt(basic_chars).parse_next(input)? {
54             c = ci;
55         }
56         while let Some(ci) = opt(basic_chars).parse_next(input)? {
57             c.to_mut().push_str(&ci);
58         }
59 
60         let _ = cut_err(one_of(QUOTATION_MARK))
61             .context(StrContext::Label("basic string"))
62             .parse_next(input)?;
63 
64         Ok(c)
65     })
66     .parse_next(input)
67 }
68 
69 // quotation-mark = %x22            ; "
/// The `"` byte; `basic_string` requires it as both the opening and the closing delimiter.
pub(crate) const QUOTATION_MARK: u8 = b'"';
71 
72 // basic-char = basic-unescaped / escaped
basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>73 fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
74     alt((
75         // Deviate from the official grammar by batching the unescaped chars so we build a string a
76         // chunk at a time, rather than a `char` at a time.
77         take_while(1.., BASIC_UNESCAPED)
78             .try_map(std::str::from_utf8)
79             .map(Cow::Borrowed),
80         escaped.map(|c| Cow::Owned(String::from(c))),
81     ))
82     .parse_next(input)
83 }
84 
85 // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// Byte set that `basic_chars` accepts inside a basic string without escaping.
///
/// The explicit members are 0x21, 0x23..=0x5B and 0x5D..=0x7E, which leaves out 0x22 (`"`)
/// and 0x5C (`\`). `WSCHAR` and `NON_ASCII` are imported from `crate::parser::trivia`.
pub(crate) const BASIC_UNESCAPED: (
    (u8, u8),
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
93 
94 // escaped = escape escape-seq-char
escaped(input: &mut Input<'_>) -> PResult<char>95 fn escaped(input: &mut Input<'_>) -> PResult<char> {
96     preceded(ESCAPE, escape_seq_char).parse_next(input)
97 }
98 
99 // escape = %x5C                    ; \
/// The `\` byte; it introduces an escape sequence in `escaped` and a line continuation in
/// `mlb_escaped_nl`.
pub(crate) const ESCAPE: u8 = b'\\';
101 
102 // escape-seq-char =  %x22         ; "    quotation mark  U+0022
103 // escape-seq-char =/ %x5C         ; \    reverse solidus U+005C
104 // escape-seq-char =/ %x62         ; b    backspace       U+0008
105 // escape-seq-char =/ %x66         ; f    form feed       U+000C
106 // escape-seq-char =/ %x6E         ; n    line feed       U+000A
107 // escape-seq-char =/ %x72         ; r    carriage return U+000D
108 // escape-seq-char =/ %x74         ; t    tab             U+0009
109 // escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
110 // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
/// Decodes the part of an escape sequence that follows the backslash.
///
/// One byte is consumed with `any` and selects the branch: the single-letter escapes map
/// directly to their character, while `u` and `U` hand over to `hexescape` for 4 and 8 hex
/// digits respectively. Any other byte produces an unrecoverable error labelled
/// "escape sequence" that lists every accepted designator.
fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
    dispatch! {any;
        b'b' => success('\u{8}'),
        b'f' => success('\u{c}'),
        b'n' => success('\n'),
        b'r' => success('\r'),
        b't' => success('\t'),
        // Once `u`/`U` has been seen, malformed hex digits are a hard error (`cut_err`).
        b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
        b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
        b'\\' => success('\\'),
        b'"' => success('"'),
        // Unknown designator: always fail, and do so unrecoverably.
        _ => {
            cut_err(fail::<_, char, _>)
            .context(StrContext::Label("escape sequence"))
            .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
            .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
        }
    }
    .parse_next(input)
}
138 
hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char>139 pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
140     take_while(0..=N, HEXDIG)
141         .verify(|b: &[u8]| b.len() == N)
142         .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
143         .verify_map(|s| u32::from_str_radix(s, 16).ok())
144         .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
145         .parse_next(input)
146 }
147 
148 // ;; Multiline Basic String
149 
150 // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
151 //                   ml-basic-string-delim
ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>152 fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
153     trace(
154         "ml-basic-string",
155         delimited(
156             ML_BASIC_STRING_DELIM,
157             preceded(opt(newline), cut_err(ml_basic_body)),
158             cut_err(ML_BASIC_STRING_DELIM),
159         )
160         .context(StrContext::Label("multiline basic string")),
161     )
162     .parse_next(input)
163 }
164 
165 // ml-basic-string-delim = 3quotation-mark
/// Three quotation marks: the opening and closing delimiter of a multiline basic string,
/// also used by `ml_basic_body` as the lookahead that must follow trailing quotes.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
167 
168 // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
/// Parses everything between the delimiters of a multiline basic string.
///
/// The body is read in three phases that mirror the grammar rule: leading content, then
/// repeated groups of one or two quotes followed by content, then an optional final run
/// of one or two quotes that sits directly before the closing delimiter. The closing
/// delimiter itself is only peeked at, never consumed.
fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
    // Phase 1: leading content. The first chunk is kept as-is so that a body consisting of
    // a single chunk does not have to be copied; further chunks are appended to an owned
    // buffer.
    let mut c = Cow::Borrowed("");
    if let Some(ci) = opt(mlb_content).parse_next(input)? {
        c = ci;
    }
    while let Some(ci) = opt(mlb_content).parse_next(input)? {
        c.to_mut().push_str(&ci);
    }

    // Phase 2: quotes inside the body. They only count when the byte after them is not
    // another quote (`none_of`), which keeps them distinct from the closing delimiter.
    while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
        if let Some(ci) = opt(mlb_content).parse_next(input)? {
            c.to_mut().push_str(qi);
            c.to_mut().push_str(&ci);
            while let Some(ci) = opt(mlb_content).parse_next(input)? {
                c.to_mut().push_str(&ci);
            }
        } else {
            // NOTE(review): the quotes matched above have already been consumed but are
            // not appended here; presumably this is only reachable on input that the
            // caller's closing-delimiter check rejects anyway — confirm.
            break;
        }
    }

    // Phase 3: one or two quotes that are immediately followed by the closing delimiter
    // belong to the string contents.
    if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? {
        c.to_mut().push_str(qi);
    }

    Ok(c)
}
196 
197 // mlb-content = mlb-char / newline / mlb-escaped-nl
198 // mlb-char = mlb-unescaped / escaped
mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>199 fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
200     alt((
201         // Deviate from the official grammar by batching the unescaped chars so we build a string a
202         // chunk at a time, rather than a `char` at a time.
203         take_while(1.., MLB_UNESCAPED)
204             .try_map(std::str::from_utf8)
205             .map(Cow::Borrowed),
206         // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
207         mlb_escaped_nl.map(|_| Cow::Borrowed("")),
208         escaped.map(|c| Cow::Owned(String::from(c))),
209         newline.map(|_| Cow::Borrowed("\n")),
210     ))
211     .parse_next(input)
212 }
213 
214 // mlb-quotes = 1*2quotation-mark
/// Builds a parser for a run of one or two quotation marks that must be followed by `term`.
///
/// Two quotes are attempted first. If that attempt fails with a recoverable (`Backtrack`)
/// error, the input is rewound to where it started and a single quote is attempted
/// instead. Any other outcome of the first attempt is returned unchanged. `term` is only
/// peeked at, so whatever it matches is left in the input. The returned `&str` is the
/// matched quote run.
fn mlb_quotes<'i>(
    mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
) -> impl Parser<Input<'i>, &'i str, ContextError> {
    move |input: &mut Input<'i>| {
        // Remember the starting position so the single-quote attempt begins from scratch.
        let start = input.checkpoint();
        let res = terminated(b"\"\"", peek(term.by_ref()))
            // SAFETY: the mapped slice is what the literal `b"\"\""` matched, i.e. ASCII
            // quote bytes. NOTE(review): the justification text below reads as garbled.
            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
            .parse_next(input);

        match res {
            Err(winnow::error::ErrMode::Backtrack(_)) => {
                input.reset(start);
                terminated(b"\"", peek(term.by_ref()))
                    // SAFETY: as above, the slice is a single ASCII quote byte.
                    .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
                    .parse_next(input)
            }
            res => res,
        }
    }
}
235 
236 // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// Byte set that `mlb_content` accepts inside a multiline basic string without escaping.
///
/// It has the same members as `BASIC_UNESCAPED`: the explicit ranges leave out 0x22 (`"`)
/// and 0x5C (`\`).
pub(crate) const MLB_UNESCAPED: (
    (u8, u8),
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
244 
// mlb-escaped-nl = escape ws newline *( wschar / newline )
246 // When the last non-whitespace character on a line is a \,
247 // it will be trimmed along with all whitespace
248 // (including newlines) up to the next non-whitespace
249 // character or closing delimiter.
mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()>250 fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
251     repeat(1.., (ESCAPE, ws, ws_newlines))
252         .map(|()| ())
253         .value(())
254         .parse_next(input)
255 }
256 
257 // ;; Literal String
258 
259 // literal-string = apostrophe *literal-char apostrophe
literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str>260 pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
261     trace(
262         "literal-string",
263         delimited(
264             APOSTROPHE,
265             cut_err(take_while(0.., LITERAL_CHAR)),
266             cut_err(APOSTROPHE),
267         )
268         .try_map(std::str::from_utf8)
269         .context(StrContext::Label("literal string")),
270     )
271     .parse_next(input)
272 }
273 
274 // apostrophe = %x27 ; ' apostrophe
/// The `'` byte; `literal_string` requires it as both the opening and the closing delimiter.
pub(crate) const APOSTROPHE: u8 = b'\'';
276 
277 // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// Byte set that `literal_string` accepts between the apostrophes.
///
/// The explicit members are 0x09, 0x20..=0x26 and 0x28..=0x7E, which leaves out 0x27 (`'`).
/// `NON_ASCII` is imported from `crate::parser::trivia`.
pub(crate) const LITERAL_CHAR: (
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
284 
285 // ;; Multiline Literal String
286 
287 // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
288 //                     ml-literal-string-delim
ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>>289 fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
290     trace(
291         "ml-literal-string",
292         delimited(
293             (ML_LITERAL_STRING_DELIM, opt(newline)),
294             cut_err(ml_literal_body.map(|t| {
295                 if t.contains("\r\n") {
296                     Cow::Owned(t.replace("\r\n", "\n"))
297                 } else {
298                     Cow::Borrowed(t)
299                 }
300             })),
301             cut_err(ML_LITERAL_STRING_DELIM),
302         )
303         .context(StrContext::Label("multiline literal string")),
304     )
305     .parse_next(input)
306 }
307 
308 // ml-literal-string-delim = 3apostrophe
/// Three apostrophes: the opening and closing delimiter of a multiline literal string,
/// also used by `ml_literal_body` as the lookahead that must follow trailing apostrophes.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
310 
311 // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str>312 fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
313     (
314         repeat(0.., mll_content).map(|()| ()),
315         repeat(
316             0..,
317             (
318                 mll_quotes(none_of(APOSTROPHE).value(())),
319                 repeat(1.., mll_content).map(|()| ()),
320             ),
321         )
322         .map(|()| ()),
323         opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))),
324     )
325         .recognize()
326         .try_map(std::str::from_utf8)
327         .parse_next(input)
328 }
329 
330 // mll-content = mll-char / newline
mll_content(input: &mut Input<'_>) -> PResult<u8>331 fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
332     alt((one_of(MLL_CHAR), newline)).parse_next(input)
333 }
334 
335 // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// Byte set that `mll_content` accepts as an ordinary character of a multiline literal
/// string.
///
/// It has the same members as `LITERAL_CHAR`: the explicit ranges leave out 0x27 (`'`).
const MLL_CHAR: (
    u8,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
    RangeInclusive<u8>,
) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
342 
343 // mll-quotes = 1*2apostrophe
/// Builds a parser for a run of one or two apostrophes that must be followed by `term`.
///
/// Two apostrophes are attempted first. If that attempt fails with a recoverable
/// (`Backtrack`) error, the input is rewound to where it started and a single apostrophe
/// is attempted instead. Any other outcome of the first attempt is returned unchanged.
/// `term` is only peeked at, so whatever it matches is left in the input. The returned
/// `&str` is the matched apostrophe run.
fn mll_quotes<'i>(
    mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
) -> impl Parser<Input<'i>, &'i str, ContextError> {
    move |input: &mut Input<'i>| {
        // Remember the starting position so the single-apostrophe attempt begins from
        // scratch.
        let start = input.checkpoint();
        let res = terminated(b"''", peek(term.by_ref()))
            // SAFETY: the mapped slice is what the literal `b"''"` matched, i.e. ASCII
            // apostrophe bytes. NOTE(review): the justification text below reads as garbled.
            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
            .parse_next(input);

        match res {
            Err(winnow::error::ErrMode::Backtrack(_)) => {
                input.reset(start);
                terminated(b"'", peek(term.by_ref()))
                    // SAFETY: as above, the slice is a single ASCII apostrophe byte.
                    .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
                    .parse_next(input)
            }
            res => res,
        }
    }
}
364 
// Unit tests for the string parsers. They are compiled only for test builds in which both
// the `parse` and the `display` features are enabled. Every case goes through the
// top-level `string` parser.
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    // A basic string mixing escaped quotes, `\t`/`\n`, a 4-digit and an 8-digit unicode
    // escape.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Multiline basic strings: leading-newline trimming, quotes inside the body, an escaped
    // backslash right before the closing delimiter, and two unterminated inputs.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        // Neither input ends with a complete closing delimiter.
        let invalid_cases = [r#""""  """#, r#""""  \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A backslash at the end of a line removes the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        // Bodies consisting only of line continuations collapse to the empty string.
        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: the expected value is the input minus the
    // surrounding apostrophes.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multiline literal strings: the expected value is the input minus the three-apostrophe
    // delimiters (and, in the last case, minus the newline right after the opening one).
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip 4 bytes at the front: the delimiter plus the trimmed newline.
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
481