1 #include <stdbool.h>
2 
3 #include <Python.h>
4 
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8 
9 //// STRING HANDLING FUNCTIONS ////
10 
11 static int
warn_invalid_escape_sequence(Parser * p,const char * first_invalid_escape,Token * t)12 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13 {
14     unsigned char c = *first_invalid_escape;
15     int octal = ('4' <= c && c <= '7');
16     PyObject *msg =
17         octal
18         ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
19                                first_invalid_escape)
20         : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
21     if (msg == NULL) {
22         return -1;
23     }
24     if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
25                                  t->lineno, NULL, NULL) < 0) {
26         if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27             /* Replace the DeprecationWarning exception with a SyntaxError
28                to get a more accurate error report */
29             PyErr_Clear();
30 
31             /* This is needed, in order for the SyntaxError to point to the token t,
32                since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33                error location, if p->known_err_token is not set. */
34             p->known_err_token = t;
35             if (octal) {
36                 RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
37                                    first_invalid_escape);
38             }
39             else {
40                 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
41             }
42         }
43         Py_DECREF(msg);
44         return -1;
45     }
46     Py_DECREF(msg);
47     return 0;
48 }
49 
50 static PyObject *
decode_utf8(const char ** sPtr,const char * end)51 decode_utf8(const char **sPtr, const char *end)
52 {
53     const char *s;
54     const char *t;
55     t = s = *sPtr;
56     while (s < end && (*s & 0x80)) {
57         s++;
58     }
59     *sPtr = s;
60     return PyUnicode_DecodeUTF8(t, s - t, NULL);
61 }
62 
63 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)64 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
65 {
66     PyObject *v;
67     PyObject *u;
68     char *buf;
69     char *p;
70     const char *end;
71 
72     /* check for integer overflow */
73     if (len > SIZE_MAX / 6) {
74         return NULL;
75     }
76     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
77        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
78     u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
79     if (u == NULL) {
80         return NULL;
81     }
82     p = buf = PyBytes_AsString(u);
83     if (p == NULL) {
84         return NULL;
85     }
86     end = s + len;
87     while (s < end) {
88         if (*s == '\\') {
89             *p++ = *s++;
90             if (s >= end || *s & 0x80) {
91                 strcpy(p, "u005c");
92                 p += 5;
93                 if (s >= end) {
94                     break;
95                 }
96             }
97         }
98         if (*s & 0x80) {
99             PyObject *w;
100             int kind;
101             const void *data;
102             Py_ssize_t w_len;
103             Py_ssize_t i;
104             w = decode_utf8(&s, end);
105             if (w == NULL) {
106                 Py_DECREF(u);
107                 return NULL;
108             }
109             kind = PyUnicode_KIND(w);
110             data = PyUnicode_DATA(w);
111             w_len = PyUnicode_GET_LENGTH(w);
112             for (i = 0; i < w_len; i++) {
113                 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
114                 sprintf(p, "\\U%08x", chr);
115                 p += 10;
116             }
117             /* Should be impossible to overflow */
118             assert(p - buf <= PyBytes_GET_SIZE(u));
119             Py_DECREF(w);
120         }
121         else {
122             *p++ = *s++;
123         }
124     }
125     len = p - buf;
126     s = buf;
127 
128     const char *first_invalid_escape;
129     v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
130 
131     if (v != NULL && first_invalid_escape != NULL) {
132         if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
133             /* We have not decref u before because first_invalid_escape points
134                inside u. */
135             Py_XDECREF(u);
136             Py_DECREF(v);
137             return NULL;
138         }
139     }
140     Py_XDECREF(u);
141     return v;
142 }
143 
144 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)145 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
146 {
147     const char *first_invalid_escape;
148     PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
149     if (result == NULL) {
150         return NULL;
151     }
152 
153     if (first_invalid_escape != NULL) {
154         if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
155             Py_DECREF(result);
156             return NULL;
157         }
158     }
159     return result;
160 }
161 
162 /* s must include the bracketing quote characters, and r, b, u,
163    &/or f prefixes (if any), and embedded escape sequences (if any).
164    _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
165    If the string is an f-string, set *fstr and *fstrlen to the unparsed
166    string object.  Return 0 if no errors occurred.  */
167 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)168 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
169                   const char **fstr, Py_ssize_t *fstrlen, Token *t)
170 {
171     const char *s = PyBytes_AsString(t->bytes);
172     if (s == NULL) {
173         return -1;
174     }
175 
176     size_t len;
177     int quote = Py_CHARMASK(*s);
178     int fmode = 0;
179     *bytesmode = 0;
180     *rawmode = 0;
181     *result = NULL;
182     *fstr = NULL;
183     if (Py_ISALPHA(quote)) {
184         while (!*bytesmode || !*rawmode) {
185             if (quote == 'b' || quote == 'B') {
186                 quote =(unsigned char)*++s;
187                 *bytesmode = 1;
188             }
189             else if (quote == 'u' || quote == 'U') {
190                 quote = (unsigned char)*++s;
191             }
192             else if (quote == 'r' || quote == 'R') {
193                 quote = (unsigned char)*++s;
194                 *rawmode = 1;
195             }
196             else if (quote == 'f' || quote == 'F') {
197                 quote = (unsigned char)*++s;
198                 fmode = 1;
199             }
200             else {
201                 break;
202             }
203         }
204     }
205 
206     /* fstrings are only allowed in Python 3.6 and greater */
207     if (fmode && p->feature_version < 6) {
208         p->error_indicator = 1;
209         RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
210         return -1;
211     }
212 
213     if (fmode && *bytesmode) {
214         PyErr_BadInternalCall();
215         return -1;
216     }
217     if (quote != '\'' && quote != '\"') {
218         PyErr_BadInternalCall();
219         return -1;
220     }
221     /* Skip the leading quote char. */
222     s++;
223     len = strlen(s);
224     if (len > INT_MAX) {
225         PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
226         return -1;
227     }
228     if (s[--len] != quote) {
229         /* Last quote char must match the first. */
230         PyErr_BadInternalCall();
231         return -1;
232     }
233     if (len >= 4 && s[0] == quote && s[1] == quote) {
234         /* A triple quoted string. We've already skipped one quote at
235            the start and one at the end of the string. Now skip the
236            two at the start. */
237         s += 2;
238         len -= 2;
239         /* And check that the last two match. */
240         if (s[--len] != quote || s[--len] != quote) {
241             PyErr_BadInternalCall();
242             return -1;
243         }
244     }
245 
246     if (fmode) {
247         /* Just return the bytes. The caller will parse the resulting
248            string. */
249         *fstr = s;
250         *fstrlen = len;
251         return 0;
252     }
253 
254     /* Not an f-string. */
255     /* Avoid invoking escape decoding routines if possible. */
256     *rawmode = *rawmode || strchr(s, '\\') == NULL;
257     if (*bytesmode) {
258         /* Disallow non-ASCII characters. */
259         const char *ch;
260         for (ch = s; *ch; ch++) {
261             if (Py_CHARMASK(*ch) >= 0x80) {
262                 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
263                                    t,
264                                    "bytes can only contain ASCII "
265                                    "literal characters");
266                 return -1;
267             }
268         }
269         if (*rawmode) {
270             *result = PyBytes_FromStringAndSize(s, len);
271         }
272         else {
273             *result = decode_bytes_with_escapes(p, s, len, t);
274         }
275     }
276     else {
277         if (*rawmode) {
278             *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
279         }
280         else {
281             *result = decode_unicode_with_escapes(p, s, len, t);
282         }
283     }
284     return *result == NULL ? -1 : 0;
285 }
286 
287 
288 
289 // FSTRING STUFF
290 
291 /* Fix locations for the given node and its children.
292 
293    `parent` is the enclosing node.
294    `expr_start` is the starting position of the expression (pointing to the open brace).
295    `n` is the node which locations are going to be fixed relative to parent.
296    `expr_str` is the child node's string representation, including braces.
297 */
298 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)299 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
300 {
301     *p_lines = 0;
302     *p_cols = 0;
303     assert(expr_start != NULL && *expr_start == '{');
304     if (parent && parent->bytes) {
305         const char *parent_str = PyBytes_AsString(parent->bytes);
306         if (!parent_str) {
307             return false;
308         }
309         // The following is needed, in order to correctly shift the column
310         // offset, in the case that (disregarding any whitespace) a newline
311         // immediately follows the opening curly brace of the fstring expression.
312         bool newline_after_brace = 1;
313         const char *start = expr_start + 1;
314         while (start && *start != '}' && *start != '\n') {
315             if (*start != ' ' && *start != '\t' && *start != '\f') {
316                 newline_after_brace = 0;
317                 break;
318             }
319             start++;
320         }
321 
322         // Account for the characters from the last newline character to our
323         // left until the beginning of expr_start.
324         if (!newline_after_brace) {
325             start = expr_start;
326             while (start > parent_str && *start != '\n') {
327                 start--;
328             }
329             *p_cols += (int)(expr_start - start);
330             if (*start == '\n') {
331                 *p_cols -= 1;
332             }
333         }
334         /* adjust the start based on the number of newlines encountered
335            before the f-string expression */
336         for (const char *p = parent_str; p < expr_start; p++) {
337             if (*p == '\n') {
338                 (*p_lines)++;
339             }
340         }
341     }
342     return true;
343 }
344 
345 
346 /* Compile this expression in to an expr_ty.  Add parens around the
347    expression, in order to allow leading spaces in the expression. */
348 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)349 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
350                      Token *t)
351 {
352     expr_ty expr = NULL;
353     char *str;
354     Py_ssize_t len;
355     const char *s;
356     expr_ty result = NULL;
357 
358     assert(expr_end >= expr_start);
359     assert(*(expr_start-1) == '{');
360     assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
361            *expr_end == '=');
362 
363     /* If the substring is all whitespace, it's an error.  We need to catch this
364        here, and not when we call PyParser_SimpleParseStringFlagsFilename,
365        because turning the expression '' in to '()' would go from being invalid
366        to valid. */
367     for (s = expr_start; s != expr_end; s++) {
368         char c = *s;
369         /* The Python parser ignores only the following whitespace
370            characters (\r already is converted to \n). */
371         if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
372             break;
373         }
374     }
375 
376     if (s == expr_end) {
377         if (*expr_end == '!' || *expr_end == ':' || *expr_end == '=') {
378             RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end);
379             return NULL;
380         }
381         RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
382         return NULL;
383     }
384 
385     len = expr_end - expr_start;
386     /* Allocate 3 extra bytes: open paren, close paren, null byte. */
387     str = PyMem_Calloc(len + 3, sizeof(char));
388     if (str == NULL) {
389         PyErr_NoMemory();
390         return NULL;
391     }
392 
393     // The call to fstring_find_expr_location is responsible for finding the column offset
394     // the generated AST nodes need to be shifted to the right, which is equal to the number
395     // of the f-string characters before the expression starts.
396     memcpy(str+1, expr_start, len);
397     int lines, cols;
398     if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
399         PyMem_Free(str);
400         return NULL;
401     }
402 
403     // The parentheses are needed in order to allow for leading whitespace within
404     // the f-string expression. This consequently gets parsed as a group (see the
405     // group rule in python.gram).
406     str[0] = '(';
407     str[len+1] = ')';
408 
409     struct tok_state* tok = _PyTokenizer_FromString(str, 1);
410     if (tok == NULL) {
411         PyMem_Free(str);
412         return NULL;
413     }
414     Py_INCREF(p->tok->filename);
415 
416     tok->filename = p->tok->filename;
417     tok->lineno = t->lineno + lines - 1;
418 
419     Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
420                                      NULL, p->arena);
421 
422     p2->starting_lineno = t->lineno + lines;
423     p2->starting_col_offset = lines != 0 ? cols : t->col_offset + cols;
424 
425     expr = _PyPegen_run_parser(p2);
426 
427     if (expr == NULL) {
428         goto exit;
429     }
430     result = expr;
431 
432 exit:
433     PyMem_Free(str);
434     _PyPegen_Parser_Free(p2);
435     _PyTokenizer_Free(tok);
436     return result;
437 }
438 
439 /* Return -1 on error.
440 
441    Return 0 if we reached the end of the literal.
442 
443    Return 1 if we haven't reached the end of the literal, but we want
444    the caller to process the literal up to this point. Used for
445    doubled braces.
446 */
447 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)448 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
449                      PyObject **literal, int recurse_lvl, Token *t)
450 {
451     /* Get any literal string. It ends when we hit an un-doubled left
452        brace (which isn't part of a unicode name escape such as
453        "\N{EULER CONSTANT}"), or the end of the string. */
454 
455     const char *s = *str;
456     const char *literal_start = s;
457     int result = 0;
458 
459     assert(*literal == NULL);
460     while (s < end) {
461         char ch = *s++;
462         if (!raw && ch == '\\' && s < end) {
463             ch = *s++;
464             if (ch == 'N') {
465                 /* We need to look at and skip matching braces for "\N{name}"
466                    sequences because otherwise we'll think the opening '{'
467                    starts an expression, which is not the case with "\N".
468                    Keep looking for either a matched '{' '}' pair, or the end
469                    of the string. */
470 
471                 if (s < end && *s++ == '{') {
472                     while (s < end && *s++ != '}') {
473                     }
474                     continue;
475                 }
476 
477                 /* This is an invalid "\N" sequence, since it's a "\N" not
478                    followed by a "{".  Just keep parsing this literal.  This
479                    error will be caught later by
480                    decode_unicode_with_escapes(). */
481                 continue;
482             }
483             if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 0) {
484                 return -1;
485             }
486         }
487         if (ch == '{' || ch == '}') {
488             /* Check for doubled braces, but only at the top level. If
489                we checked at every level, then f'{0:{3}}' would fail
490                with the two closing braces. */
491             if (recurse_lvl == 0) {
492                 if (s < end && *s == ch) {
493                     /* We're going to tell the caller that the literal ends
494                        here, but that they should continue scanning. But also
495                        skip over the second brace when we resume scanning. */
496                     *str = s + 1;
497                     result = 1;
498                     goto done;
499                 }
500 
501                 /* Where a single '{' is the start of a new expression, a
502                    single '}' is not allowed. */
503                 if (ch == '}') {
504                     *str = s - 1;
505                     RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
506                     return -1;
507                 }
508             }
509             /* We're either at a '{', which means we're starting another
510                expression; or a '}', which means we're at the end of this
511                f-string (for a nested format_spec). */
512             s--;
513             break;
514         }
515     }
516     *str = s;
517     assert(s <= end);
518     assert(s == end || *s == '{' || *s == '}');
519 done:
520     if (literal_start != s) {
521         if (raw) {
522             *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
523                                                     s - literal_start,
524                                                     NULL, NULL);
525         }
526         else {
527             *literal = decode_unicode_with_escapes(p, literal_start,
528                                                    s - literal_start, t);
529         }
530         if (!*literal) {
531             return -1;
532         }
533     }
534     return result;
535 }
536 
537 /* Forward declaration because parsing is recursive. */
538 static expr_ty
539 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
540               Token *first_token, Token* t, Token *last_token);
541 
542 /* Parse the f-string at *str, ending at end.  We know *str starts an
543    expression (so it must be a '{'). Returns the FormattedValue node, which
544    includes the expression, conversion character, format_spec expression, and
545    optionally the text of the expression (if = is used).
546 
547    Note that I don't do a perfect job here: I don't make sure that a
548    closing brace doesn't match an opening paren, for example. It
549    doesn't need to error on all invalid expressions, just correctly
550    find the end of all valid ones. Any errors inside the expression
551    will be caught when we parse it later.
552 
553    *expression is set to the expression.  For an '=' "debug" expression,
554    *expr_text is set to the debug text (the original text of the expression,
555    including the '=' and any whitespace around it, as a string object).  If
556    not a debug expression, *expr_text set to NULL. */
557 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)558 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
559                   PyObject **expr_text, expr_ty *expression, Token *first_token,
560                   Token *t, Token *last_token)
561 {
562     /* Return -1 on error, else 0. */
563 
564     const char *expr_start;
565     const char *expr_end;
566     expr_ty simple_expression;
567     expr_ty format_spec = NULL; /* Optional format specifier. */
568     int conversion = -1; /* The conversion char.  Use default if not
569                             specified, or !r if using = and no format
570                             spec. */
571 
572     /* 0 if we're not in a string, else the quote char we're trying to
573        match (single or double quote). */
574     char quote_char = 0;
575 
576     /* If we're inside a string, 1=normal, 3=triple-quoted. */
577     int string_type = 0;
578 
579     /* Keep track of nesting level for braces/parens/brackets in
580        expressions. */
581     Py_ssize_t nested_depth = 0;
582     char parenstack[MAXLEVEL];
583 
584     *expr_text = NULL;
585 
586     /* Can only nest one level deep. */
587     if (recurse_lvl >= 2) {
588         RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
589         goto error;
590     }
591 
592     /* The first char must be a left brace, or we wouldn't have gotten
593        here. Skip over it. */
594     assert(**str == '{');
595     *str += 1;
596 
597     expr_start = *str;
598     for (; *str < end; (*str)++) {
599         char ch;
600 
601         /* Loop invariants. */
602         assert(nested_depth >= 0);
603         assert(*str >= expr_start && *str < end);
604         if (quote_char) {
605             assert(string_type == 1 || string_type == 3);
606         } else {
607             assert(string_type == 0);
608         }
609 
610         ch = **str;
611         /* Nowhere inside an expression is a backslash allowed. */
612         if (ch == '\\') {
613             /* Error: can't include a backslash character, inside
614                parens or strings or not. */
615             RAISE_SYNTAX_ERROR(
616                       "f-string expression part "
617                       "cannot include a backslash");
618             goto error;
619         }
620         if (quote_char) {
621             /* We're inside a string. See if we're at the end. */
622             /* This code needs to implement the same non-error logic
623                as tok_get from tokenizer.c, at the letter_quote
624                label. To actually share that code would be a
625                nightmare. But, it's unlikely to change and is small,
626                so duplicate it here. Note we don't need to catch all
627                of the errors, since they'll be caught when parsing the
628                expression. We just need to match the non-error
629                cases. Thus we can ignore \n in single-quoted strings,
630                for example. Or non-terminated strings. */
631             if (ch == quote_char) {
632                 /* Does this match the string_type (single or triple
633                    quoted)? */
634                 if (string_type == 3) {
635                     if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
636                         /* We're at the end of a triple quoted string. */
637                         *str += 2;
638                         string_type = 0;
639                         quote_char = 0;
640                         continue;
641                     }
642                 } else {
643                     /* We're at the end of a normal string. */
644                     quote_char = 0;
645                     string_type = 0;
646                     continue;
647                 }
648             }
649         } else if (ch == '\'' || ch == '"') {
650             /* Is this a triple quoted string? */
651             if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
652                 string_type = 3;
653                 *str += 2;
654             } else {
655                 /* Start of a normal string. */
656                 string_type = 1;
657             }
658             /* Start looking for the end of the string. */
659             quote_char = ch;
660         } else if (ch == '[' || ch == '{' || ch == '(') {
661             if (nested_depth >= MAXLEVEL) {
662                 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
663                 goto error;
664             }
665             parenstack[nested_depth] = ch;
666             nested_depth++;
667         } else if (ch == '#') {
668             /* Error: can't include a comment character, inside parens
669                or not. */
670             RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
671             goto error;
672         } else if (nested_depth == 0 &&
673                    (ch == '!' || ch == ':' || ch == '}' ||
674                     ch == '=' || ch == '>' || ch == '<')) {
675             /* See if there's a next character. */
676             if (*str+1 < end) {
677                 char next = *(*str+1);
678 
679                 /* For "!=". since '=' is not an allowed conversion character,
680                    nothing is lost in this test. */
681                 if ((ch == '!' && next == '=') ||   /* != */
682                     (ch == '=' && next == '=') ||   /* == */
683                     (ch == '<' && next == '=') ||   /* <= */
684                     (ch == '>' && next == '=')      /* >= */
685                     ) {
686                     *str += 1;
687                     continue;
688                 }
689             }
690             /* Don't get out of the loop for these, if they're single
691                chars (not part of 2-char tokens). If by themselves, they
692                don't end an expression (unlike say '!'). */
693             if (ch == '>' || ch == '<') {
694                 continue;
695             }
696 
697             /* Normal way out of this loop. */
698             break;
699         } else if (ch == ']' || ch == '}' || ch == ')') {
700             if (!nested_depth) {
701                 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
702                 goto error;
703             }
704             nested_depth--;
705             int opening = (unsigned char)parenstack[nested_depth];
706             if (!((opening == '(' && ch == ')') ||
707                   (opening == '[' && ch == ']') ||
708                   (opening == '{' && ch == '}')))
709             {
710                 RAISE_SYNTAX_ERROR(
711                           "f-string: closing parenthesis '%c' "
712                           "does not match opening parenthesis '%c'",
713                           ch, opening);
714                 goto error;
715             }
716         } else {
717             /* Just consume this char and loop around. */
718         }
719     }
720     expr_end = *str;
721     /* If we leave the above loop in a string or with mismatched parens, we
722        don't really care. We'll get a syntax error when compiling the
723        expression. But, we can produce a better error message, so let's just
724        do that.*/
725     if (quote_char) {
726         RAISE_SYNTAX_ERROR("f-string: unterminated string");
727         goto error;
728     }
729     if (nested_depth) {
730         int opening = (unsigned char)parenstack[nested_depth - 1];
731         RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
732         goto error;
733     }
734 
735     if (*str >= end) {
736         goto unexpected_end_of_string;
737     }
738 
739     /* Compile the expression as soon as possible, so we show errors
740        related to the expression before errors related to the
741        conversion or format_spec. */
742     simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
743     if (!simple_expression) {
744         goto error;
745     }
746 
747     /* Check for =, which puts the text value of the expression in
748        expr_text. */
749     if (**str == '=') {
750         if (p->feature_version < 8) {
751             RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
752                                "only supported in Python 3.8 and greater");
753             goto error;
754         }
755         *str += 1;
756 
757         /* Skip over ASCII whitespace.  No need to test for end of string
758            here, since we know there's at least a trailing quote somewhere
759            ahead. */
760         while (Py_ISSPACE(**str)) {
761             *str += 1;
762         }
763         if (*str >= end) {
764             goto unexpected_end_of_string;
765         }
766         /* Set *expr_text to the text of the expression. */
767         *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
768         if (!*expr_text) {
769             goto error;
770         }
771     }
772 
773     /* Check for a conversion char, if present. */
774     if (**str == '!') {
775         *str += 1;
776         if (*str >= end) {
777             goto unexpected_end_of_string;
778         }
779 
780         conversion = (unsigned char)**str;
781         *str += 1;
782 
783         /* Validate the conversion. */
784         if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
785             RAISE_SYNTAX_ERROR(
786                       "f-string: invalid conversion character: "
787                       "expected 's', 'r', or 'a'");
788             goto error;
789         }
790 
791     }
792 
793     /* Check for the format spec, if present. */
794     if (*str >= end) {
795         goto unexpected_end_of_string;
796     }
797     if (**str == ':') {
798         *str += 1;
799         if (*str >= end) {
800             goto unexpected_end_of_string;
801         }
802 
803         /* Parse the format spec. */
804         format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
805                                     first_token, t, last_token);
806         if (!format_spec) {
807             goto error;
808         }
809     }
810 
811     if (*str >= end || **str != '}') {
812         goto unexpected_end_of_string;
813     }
814 
815     /* We're at a right brace. Consume it. */
816     assert(*str < end);
817     assert(**str == '}');
818     *str += 1;
819 
820     /* If we're in = mode (detected by non-NULL expr_text), and have no format
821        spec and no explicit conversion, set the conversion to 'r'. */
822     if (*expr_text && format_spec == NULL && conversion == -1) {
823         conversion = 'r';
824     }
825 
826     /* And now create the FormattedValue node that represents this
827        entire expression with the conversion and format spec. */
828     //TODO: Fix this
829     *expression = _PyAST_FormattedValue(simple_expression, conversion,
830                                         format_spec, first_token->lineno,
831                                         first_token->col_offset,
832                                         last_token->end_lineno,
833                                         last_token->end_col_offset, p->arena);
834     if (!*expression) {
835         goto error;
836     }
837 
838     return 0;
839 
840 unexpected_end_of_string:
841     RAISE_SYNTAX_ERROR("f-string: expecting '}'");
842     /* Falls through to error. */
843 
844 error:
845     Py_XDECREF(*expr_text);
846     return -1;
847 
848 }
849 
850 /* Return -1 on error.
851 
852    Return 0 if we have a literal (possible zero length) and an
853    expression (zero length if at the end of the string.
854 
855    Return 1 if we have a literal, but no expression, and we want the
856    caller to call us again. This is used to deal with doubled
857    braces.
858 
859    When called multiple times on the string 'a{{b{0}c', this function
860    will return:
861 
862    1. the literal 'a{' with no expression, and a return value
863       of 1. Despite the fact that there's no expression, the return
864       value of 1 means we're not finished yet.
865 
866    2. the literal 'b' and the expression '0', with a return value of
867       0. The fact that there's an expression means we're not finished.
868 
869    3. literal 'c' with no expression and a return value of 0. The
870       combination of the return value of 0 with no expression means
871       we're finished.
872 */
873 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)874 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
875                               int recurse_lvl, PyObject **literal,
876                               PyObject **expr_text, expr_ty *expression,
877                               Token *first_token, Token *t, Token *last_token)
878 {
879     int result;
880 
881     assert(*literal == NULL && *expression == NULL);
882 
883     /* Get any literal string. */
884     result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
885     if (result < 0) {
886         goto error;
887     }
888 
889     assert(result == 0 || result == 1);
890 
891     if (result == 1) {
892         /* We have a literal, but don't look at the expression. */
893         return 1;
894     }
895 
896     if (*str >= end || **str == '}') {
897         /* We're at the end of the string or the end of a nested
898            f-string: no expression. The top-level error case where we
899            expect to be at the end of the string but we're at a '}' is
900            handled later. */
901         return 0;
902     }
903 
904     /* We must now be the start of an expression, on a '{'. */
905     assert(**str == '{');
906 
907     if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
908                           expression, first_token, t, last_token) < 0) {
909         goto error;
910     }
911 
912     return 0;
913 
914 error:
915     Py_CLEAR(*literal);
916     return -1;
917 }
918 
919 #ifdef NDEBUG
920 #define ExprList_check_invariants(l)
921 #else
922 static void
ExprList_check_invariants(ExprList * l)923 ExprList_check_invariants(ExprList *l)
924 {
925     /* Check our invariants. Make sure this object is "live", and
926        hasn't been deallocated. */
927     assert(l->size >= 0);
928     assert(l->p != NULL);
929     if (l->size <= EXPRLIST_N_CACHED) {
930         assert(l->data == l->p);
931     }
932 }
933 #endif
934 
935 static void
ExprList_Init(ExprList * l)936 ExprList_Init(ExprList *l)
937 {
938     l->allocated = EXPRLIST_N_CACHED;
939     l->size = 0;
940 
941     /* Until we start allocating dynamically, p points to data. */
942     l->p = l->data;
943 
944     ExprList_check_invariants(l);
945 }
946 
947 static int
ExprList_Append(ExprList * l,expr_ty exp)948 ExprList_Append(ExprList *l, expr_ty exp)
949 {
950     ExprList_check_invariants(l);
951     if (l->size >= l->allocated) {
952         /* We need to alloc (or realloc) the memory. */
953         Py_ssize_t new_size = l->allocated * 2;
954 
955         /* See if we've ever allocated anything dynamically. */
956         if (l->p == l->data) {
957             Py_ssize_t i;
958             /* We're still using the cached data. Switch to
959                alloc-ing. */
960             l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
961             if (!l->p) {
962                 return -1;
963             }
964             /* Copy the cached data into the new buffer. */
965             for (i = 0; i < l->size; i++) {
966                 l->p[i] = l->data[i];
967             }
968         } else {
969             /* Just realloc. */
970             expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
971             if (!tmp) {
972                 PyMem_Free(l->p);
973                 l->p = NULL;
974                 return -1;
975             }
976             l->p = tmp;
977         }
978 
979         l->allocated = new_size;
980         assert(l->allocated == 2 * l->size);
981     }
982 
983     l->p[l->size++] = exp;
984 
985     ExprList_check_invariants(l);
986     return 0;
987 }
988 
989 static void
ExprList_Dealloc(ExprList * l)990 ExprList_Dealloc(ExprList *l)
991 {
992     ExprList_check_invariants(l);
993 
994     /* If there's been an error, or we've never dynamically allocated,
995        do nothing. */
996     if (!l->p || l->p == l->data) {
997         /* Do nothing. */
998     } else {
999         /* We have dynamically allocated. Free the memory. */
1000         PyMem_Free(l->p);
1001     }
1002     l->p = NULL;
1003     l->size = -1;
1004 }
1005 
1006 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)1007 ExprList_Finish(ExprList *l, PyArena *arena)
1008 {
1009     asdl_expr_seq *seq;
1010 
1011     ExprList_check_invariants(l);
1012 
1013     /* Allocate the asdl_seq and copy the expressions in to it. */
1014     seq = _Py_asdl_expr_seq_new(l->size, arena);
1015     if (seq) {
1016         Py_ssize_t i;
1017         for (i = 0; i < l->size; i++) {
1018             asdl_seq_SET(seq, i, l->p[i]);
1019         }
1020     }
1021     ExprList_Dealloc(l);
1022     return seq;
1023 }
1024 
1025 #ifdef NDEBUG
1026 #define FstringParser_check_invariants(state)
1027 #else
1028 static void
FstringParser_check_invariants(FstringParser * state)1029 FstringParser_check_invariants(FstringParser *state)
1030 {
1031     if (state->last_str) {
1032         assert(PyUnicode_CheckExact(state->last_str));
1033     }
1034     ExprList_check_invariants(&state->expr_list);
1035 }
1036 #endif
1037 
1038 void
_PyPegen_FstringParser_Init(FstringParser * state)1039 _PyPegen_FstringParser_Init(FstringParser *state)
1040 {
1041     state->last_str = NULL;
1042     state->fmode = 0;
1043     ExprList_Init(&state->expr_list);
1044     FstringParser_check_invariants(state);
1045 }
1046 
1047 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1048 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1049 {
1050     FstringParser_check_invariants(state);
1051 
1052     Py_XDECREF(state->last_str);
1053     ExprList_Dealloc(&state->expr_list);
1054 }
1055 
1056 /* Make a Constant node, but decref the PyUnicode object being added. */
1057 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1058 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1059 {
1060     PyObject *s = *str;
1061     PyObject *kind = NULL;
1062     *str = NULL;
1063     assert(PyUnicode_CheckExact(s));
1064     if (_PyArena_AddPyObject(p->arena, s) < 0) {
1065         Py_DECREF(s);
1066         return NULL;
1067     }
1068     const char* the_str = PyBytes_AsString(first_token->bytes);
1069     if (the_str && the_str[0] == 'u') {
1070         kind = _PyPegen_new_identifier(p, "u");
1071     }
1072 
1073     if (kind == NULL && PyErr_Occurred()) {
1074         return NULL;
1075     }
1076 
1077     return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1078                            last_token->end_lineno, last_token->end_col_offset,
1079                            p->arena);
1080 
1081 }
1082 
1083 
1084 /* Add a non-f-string (that is, a regular literal string). str is
1085    decref'd. */
1086 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1087 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1088 {
1089     FstringParser_check_invariants(state);
1090 
1091     assert(PyUnicode_CheckExact(str));
1092 
1093     if (PyUnicode_GET_LENGTH(str) == 0) {
1094         Py_DECREF(str);
1095         return 0;
1096     }
1097 
1098     if (!state->last_str) {
1099         /* We didn't have a string before, so just remember this one. */
1100         state->last_str = str;
1101     } else {
1102         /* Concatenate this with the previous string. */
1103         PyUnicode_AppendAndDel(&state->last_str, str);
1104         if (!state->last_str) {
1105             return -1;
1106         }
1107     }
1108     FstringParser_check_invariants(state);
1109     return 0;
1110 }
1111 
1112 /* Parse an f-string. The f-string is in *str to end, with no
1113    'f' or quotes. */
1114 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1115 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1116                             const char *end, int raw, int recurse_lvl,
1117                             Token *first_token, Token* t, Token *last_token)
1118 {
1119     FstringParser_check_invariants(state);
1120     state->fmode = 1;
1121 
1122     /* Parse the f-string. */
1123     while (1) {
1124         PyObject *literal = NULL;
1125         PyObject *expr_text = NULL;
1126         expr_ty expression = NULL;
1127 
1128         /* If there's a zero length literal in front of the
1129            expression, literal will be NULL. If we're at the end of
1130            the f-string, expression will be NULL (unless result == 1,
1131            see below). */
1132         int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1133                                                    &literal, &expr_text,
1134                                                    &expression, first_token, t, last_token);
1135         if (result < 0) {
1136             return -1;
1137         }
1138 
1139         /* Add the literal, if any. */
1140         if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1141             Py_XDECREF(expr_text);
1142             return -1;
1143         }
1144         /* Add the expr_text, if any. */
1145         if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1146             return -1;
1147         }
1148 
1149         /* We've dealt with the literal and expr_text, their ownership has
1150            been transferred to the state object.  Don't look at them again. */
1151 
1152         /* See if we should just loop around to get the next literal
1153            and expression, while ignoring the expression this
1154            time. This is used for un-doubling braces, as an
1155            optimization. */
1156         if (result == 1) {
1157             continue;
1158         }
1159 
1160         if (!expression) {
1161             /* We're done with this f-string. */
1162             break;
1163         }
1164 
1165         /* We know we have an expression. Convert any existing string
1166            to a Constant node. */
1167         if (state->last_str) {
1168             /* Convert the existing last_str literal to a Constant node. */
1169             expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1170             if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1171                 return -1;
1172             }
1173         }
1174 
1175         if (ExprList_Append(&state->expr_list, expression) < 0) {
1176             return -1;
1177         }
1178     }
1179 
1180     /* If recurse_lvl is zero, then we must be at the end of the
1181        string. Otherwise, we must be at a right brace. */
1182 
1183     if (recurse_lvl == 0 && *str < end-1) {
1184         RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1185         return -1;
1186     }
1187     if (recurse_lvl != 0 && **str != '}') {
1188         RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1189         return -1;
1190     }
1191 
1192     FstringParser_check_invariants(state);
1193     return 0;
1194 }
1195 
1196 /* Convert the partial state reflected in last_str and expr_list to an
1197    expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1198 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1199 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1200                      Token *last_token)
1201 {
1202     asdl_expr_seq *seq;
1203 
1204     FstringParser_check_invariants(state);
1205 
1206     /* If we're just a constant string with no expressions, return
1207        that. */
1208     if (!state->fmode) {
1209         assert(!state->expr_list.size);
1210         if (!state->last_str) {
1211             /* Create a zero length string. */
1212             state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1213             if (!state->last_str) {
1214                 goto error;
1215             }
1216         }
1217         return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1218     }
1219 
1220     /* Create a Constant node out of last_str, if needed. It will be the
1221        last node in our expression list. */
1222     if (state->last_str) {
1223         expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1224         if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1225             goto error;
1226         }
1227     }
1228     /* This has already been freed. */
1229     assert(state->last_str == NULL);
1230 
1231     seq = ExprList_Finish(&state->expr_list, p->arena);
1232     if (!seq) {
1233         goto error;
1234     }
1235 
1236     return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1237                             last_token->end_lineno, last_token->end_col_offset,
1238                             p->arena);
1239 
1240 error:
1241     _PyPegen_FstringParser_Dealloc(state);
1242     return NULL;
1243 }
1244 
1245 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1246    at end, parse it into an expr_ty.  Return NULL on error.  Adjust
1247    str to point past the parsed portion. */
1248 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1249 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1250               int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1251 {
1252     FstringParser state;
1253 
1254     _PyPegen_FstringParser_Init(&state);
1255     if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1256                                     first_token, t, last_token) < 0) {
1257         _PyPegen_FstringParser_Dealloc(&state);
1258         return NULL;
1259     }
1260 
1261     return _PyPegen_FstringParser_Finish(p, &state, t, t);
1262 }
1263