1 #include <stdbool.h>
2
3 #include <Python.h>
4
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8
9 //// STRING HANDLING FUNCTIONS ////
10
11 static int
warn_invalid_escape_sequence(Parser * p,const char * first_invalid_escape,Token * t)12 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13 {
14 unsigned char c = *first_invalid_escape;
15 int octal = ('4' <= c && c <= '7');
16 PyObject *msg =
17 octal
18 ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
19 first_invalid_escape)
20 : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
21 if (msg == NULL) {
22 return -1;
23 }
24 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
25 t->lineno, NULL, NULL) < 0) {
26 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27 /* Replace the DeprecationWarning exception with a SyntaxError
28 to get a more accurate error report */
29 PyErr_Clear();
30
31 /* This is needed, in order for the SyntaxError to point to the token t,
32 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33 error location, if p->known_err_token is not set. */
34 p->known_err_token = t;
35 if (octal) {
36 RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
37 first_invalid_escape);
38 }
39 else {
40 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
41 }
42 }
43 Py_DECREF(msg);
44 return -1;
45 }
46 Py_DECREF(msg);
47 return 0;
48 }
49
50 static PyObject *
decode_utf8(const char ** sPtr,const char * end)51 decode_utf8(const char **sPtr, const char *end)
52 {
53 const char *s;
54 const char *t;
55 t = s = *sPtr;
56 while (s < end && (*s & 0x80)) {
57 s++;
58 }
59 *sPtr = s;
60 return PyUnicode_DecodeUTF8(t, s - t, NULL);
61 }
62
63 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)64 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
65 {
66 PyObject *v;
67 PyObject *u;
68 char *buf;
69 char *p;
70 const char *end;
71
72 /* check for integer overflow */
73 if (len > SIZE_MAX / 6) {
74 return NULL;
75 }
76 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
77 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
78 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
79 if (u == NULL) {
80 return NULL;
81 }
82 p = buf = PyBytes_AsString(u);
83 if (p == NULL) {
84 return NULL;
85 }
86 end = s + len;
87 while (s < end) {
88 if (*s == '\\') {
89 *p++ = *s++;
90 if (s >= end || *s & 0x80) {
91 strcpy(p, "u005c");
92 p += 5;
93 if (s >= end) {
94 break;
95 }
96 }
97 }
98 if (*s & 0x80) {
99 PyObject *w;
100 int kind;
101 const void *data;
102 Py_ssize_t w_len;
103 Py_ssize_t i;
104 w = decode_utf8(&s, end);
105 if (w == NULL) {
106 Py_DECREF(u);
107 return NULL;
108 }
109 kind = PyUnicode_KIND(w);
110 data = PyUnicode_DATA(w);
111 w_len = PyUnicode_GET_LENGTH(w);
112 for (i = 0; i < w_len; i++) {
113 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
114 sprintf(p, "\\U%08x", chr);
115 p += 10;
116 }
117 /* Should be impossible to overflow */
118 assert(p - buf <= PyBytes_GET_SIZE(u));
119 Py_DECREF(w);
120 }
121 else {
122 *p++ = *s++;
123 }
124 }
125 len = p - buf;
126 s = buf;
127
128 const char *first_invalid_escape;
129 v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
130
131 if (v != NULL && first_invalid_escape != NULL) {
132 if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
133 /* We have not decref u before because first_invalid_escape points
134 inside u. */
135 Py_XDECREF(u);
136 Py_DECREF(v);
137 return NULL;
138 }
139 }
140 Py_XDECREF(u);
141 return v;
142 }
143
144 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)145 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
146 {
147 const char *first_invalid_escape;
148 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
149 if (result == NULL) {
150 return NULL;
151 }
152
153 if (first_invalid_escape != NULL) {
154 if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
155 Py_DECREF(result);
156 return NULL;
157 }
158 }
159 return result;
160 }
161
162 /* s must include the bracketing quote characters, and r, b, u,
163 &/or f prefixes (if any), and embedded escape sequences (if any).
164 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
165 If the string is an f-string, set *fstr and *fstrlen to the unparsed
166 string object. Return 0 if no errors occurred. */
167 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)168 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
169 const char **fstr, Py_ssize_t *fstrlen, Token *t)
170 {
171 const char *s = PyBytes_AsString(t->bytes);
172 if (s == NULL) {
173 return -1;
174 }
175
176 size_t len;
177 int quote = Py_CHARMASK(*s);
178 int fmode = 0;
179 *bytesmode = 0;
180 *rawmode = 0;
181 *result = NULL;
182 *fstr = NULL;
183 if (Py_ISALPHA(quote)) {
184 while (!*bytesmode || !*rawmode) {
185 if (quote == 'b' || quote == 'B') {
186 quote =(unsigned char)*++s;
187 *bytesmode = 1;
188 }
189 else if (quote == 'u' || quote == 'U') {
190 quote = (unsigned char)*++s;
191 }
192 else if (quote == 'r' || quote == 'R') {
193 quote = (unsigned char)*++s;
194 *rawmode = 1;
195 }
196 else if (quote == 'f' || quote == 'F') {
197 quote = (unsigned char)*++s;
198 fmode = 1;
199 }
200 else {
201 break;
202 }
203 }
204 }
205
206 /* fstrings are only allowed in Python 3.6 and greater */
207 if (fmode && p->feature_version < 6) {
208 p->error_indicator = 1;
209 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
210 return -1;
211 }
212
213 if (fmode && *bytesmode) {
214 PyErr_BadInternalCall();
215 return -1;
216 }
217 if (quote != '\'' && quote != '\"') {
218 PyErr_BadInternalCall();
219 return -1;
220 }
221 /* Skip the leading quote char. */
222 s++;
223 len = strlen(s);
224 if (len > INT_MAX) {
225 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
226 return -1;
227 }
228 if (s[--len] != quote) {
229 /* Last quote char must match the first. */
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 if (len >= 4 && s[0] == quote && s[1] == quote) {
234 /* A triple quoted string. We've already skipped one quote at
235 the start and one at the end of the string. Now skip the
236 two at the start. */
237 s += 2;
238 len -= 2;
239 /* And check that the last two match. */
240 if (s[--len] != quote || s[--len] != quote) {
241 PyErr_BadInternalCall();
242 return -1;
243 }
244 }
245
246 if (fmode) {
247 /* Just return the bytes. The caller will parse the resulting
248 string. */
249 *fstr = s;
250 *fstrlen = len;
251 return 0;
252 }
253
254 /* Not an f-string. */
255 /* Avoid invoking escape decoding routines if possible. */
256 *rawmode = *rawmode || strchr(s, '\\') == NULL;
257 if (*bytesmode) {
258 /* Disallow non-ASCII characters. */
259 const char *ch;
260 for (ch = s; *ch; ch++) {
261 if (Py_CHARMASK(*ch) >= 0x80) {
262 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
263 t,
264 "bytes can only contain ASCII "
265 "literal characters");
266 return -1;
267 }
268 }
269 if (*rawmode) {
270 *result = PyBytes_FromStringAndSize(s, len);
271 }
272 else {
273 *result = decode_bytes_with_escapes(p, s, len, t);
274 }
275 }
276 else {
277 if (*rawmode) {
278 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
279 }
280 else {
281 *result = decode_unicode_with_escapes(p, s, len, t);
282 }
283 }
284 return *result == NULL ? -1 : 0;
285 }
286
287
288
289 // FSTRING STUFF
290
291 /* Fix locations for the given node and its children.
292
293 `parent` is the enclosing node.
294 `expr_start` is the starting position of the expression (pointing to the open brace).
295 `n` is the node which locations are going to be fixed relative to parent.
296 `expr_str` is the child node's string representation, including braces.
297 */
298 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)299 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
300 {
301 *p_lines = 0;
302 *p_cols = 0;
303 assert(expr_start != NULL && *expr_start == '{');
304 if (parent && parent->bytes) {
305 const char *parent_str = PyBytes_AsString(parent->bytes);
306 if (!parent_str) {
307 return false;
308 }
309 // The following is needed, in order to correctly shift the column
310 // offset, in the case that (disregarding any whitespace) a newline
311 // immediately follows the opening curly brace of the fstring expression.
312 bool newline_after_brace = 1;
313 const char *start = expr_start + 1;
314 while (start && *start != '}' && *start != '\n') {
315 if (*start != ' ' && *start != '\t' && *start != '\f') {
316 newline_after_brace = 0;
317 break;
318 }
319 start++;
320 }
321
322 // Account for the characters from the last newline character to our
323 // left until the beginning of expr_start.
324 if (!newline_after_brace) {
325 start = expr_start;
326 while (start > parent_str && *start != '\n') {
327 start--;
328 }
329 *p_cols += (int)(expr_start - start);
330 if (*start == '\n') {
331 *p_cols -= 1;
332 }
333 }
334 /* adjust the start based on the number of newlines encountered
335 before the f-string expression */
336 for (const char *p = parent_str; p < expr_start; p++) {
337 if (*p == '\n') {
338 (*p_lines)++;
339 }
340 }
341 }
342 return true;
343 }
344
345
346 /* Compile this expression in to an expr_ty. Add parens around the
347 expression, in order to allow leading spaces in the expression. */
348 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)349 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
350 Token *t)
351 {
352 expr_ty expr = NULL;
353 char *str;
354 Py_ssize_t len;
355 const char *s;
356 expr_ty result = NULL;
357
358 assert(expr_end >= expr_start);
359 assert(*(expr_start-1) == '{');
360 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
361 *expr_end == '=');
362
363 /* If the substring is all whitespace, it's an error. We need to catch this
364 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
365 because turning the expression '' in to '()' would go from being invalid
366 to valid. */
367 for (s = expr_start; s != expr_end; s++) {
368 char c = *s;
369 /* The Python parser ignores only the following whitespace
370 characters (\r already is converted to \n). */
371 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
372 break;
373 }
374 }
375
376 if (s == expr_end) {
377 if (*expr_end == '!' || *expr_end == ':' || *expr_end == '=') {
378 RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end);
379 return NULL;
380 }
381 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
382 return NULL;
383 }
384
385 len = expr_end - expr_start;
386 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
387 str = PyMem_Calloc(len + 3, sizeof(char));
388 if (str == NULL) {
389 PyErr_NoMemory();
390 return NULL;
391 }
392
393 // The call to fstring_find_expr_location is responsible for finding the column offset
394 // the generated AST nodes need to be shifted to the right, which is equal to the number
395 // of the f-string characters before the expression starts.
396 memcpy(str+1, expr_start, len);
397 int lines, cols;
398 if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
399 PyMem_Free(str);
400 return NULL;
401 }
402
403 // The parentheses are needed in order to allow for leading whitespace within
404 // the f-string expression. This consequently gets parsed as a group (see the
405 // group rule in python.gram).
406 str[0] = '(';
407 str[len+1] = ')';
408
409 struct tok_state* tok = _PyTokenizer_FromString(str, 1);
410 if (tok == NULL) {
411 PyMem_Free(str);
412 return NULL;
413 }
414 Py_INCREF(p->tok->filename);
415
416 tok->filename = p->tok->filename;
417 tok->lineno = t->lineno + lines - 1;
418
419 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
420 NULL, p->arena);
421
422 p2->starting_lineno = t->lineno + lines;
423 p2->starting_col_offset = lines != 0 ? cols : t->col_offset + cols;
424
425 expr = _PyPegen_run_parser(p2);
426
427 if (expr == NULL) {
428 goto exit;
429 }
430 result = expr;
431
432 exit:
433 PyMem_Free(str);
434 _PyPegen_Parser_Free(p2);
435 _PyTokenizer_Free(tok);
436 return result;
437 }
438
439 /* Return -1 on error.
440
441 Return 0 if we reached the end of the literal.
442
443 Return 1 if we haven't reached the end of the literal, but we want
444 the caller to process the literal up to this point. Used for
445 doubled braces.
446 */
447 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)448 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
449 PyObject **literal, int recurse_lvl, Token *t)
450 {
451 /* Get any literal string. It ends when we hit an un-doubled left
452 brace (which isn't part of a unicode name escape such as
453 "\N{EULER CONSTANT}"), or the end of the string. */
454
455 const char *s = *str;
456 const char *literal_start = s;
457 int result = 0;
458
459 assert(*literal == NULL);
460 while (s < end) {
461 char ch = *s++;
462 if (!raw && ch == '\\' && s < end) {
463 ch = *s++;
464 if (ch == 'N') {
465 /* We need to look at and skip matching braces for "\N{name}"
466 sequences because otherwise we'll think the opening '{'
467 starts an expression, which is not the case with "\N".
468 Keep looking for either a matched '{' '}' pair, or the end
469 of the string. */
470
471 if (s < end && *s++ == '{') {
472 while (s < end && *s++ != '}') {
473 }
474 continue;
475 }
476
477 /* This is an invalid "\N" sequence, since it's a "\N" not
478 followed by a "{". Just keep parsing this literal. This
479 error will be caught later by
480 decode_unicode_with_escapes(). */
481 continue;
482 }
483 if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 0) {
484 return -1;
485 }
486 }
487 if (ch == '{' || ch == '}') {
488 /* Check for doubled braces, but only at the top level. If
489 we checked at every level, then f'{0:{3}}' would fail
490 with the two closing braces. */
491 if (recurse_lvl == 0) {
492 if (s < end && *s == ch) {
493 /* We're going to tell the caller that the literal ends
494 here, but that they should continue scanning. But also
495 skip over the second brace when we resume scanning. */
496 *str = s + 1;
497 result = 1;
498 goto done;
499 }
500
501 /* Where a single '{' is the start of a new expression, a
502 single '}' is not allowed. */
503 if (ch == '}') {
504 *str = s - 1;
505 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
506 return -1;
507 }
508 }
509 /* We're either at a '{', which means we're starting another
510 expression; or a '}', which means we're at the end of this
511 f-string (for a nested format_spec). */
512 s--;
513 break;
514 }
515 }
516 *str = s;
517 assert(s <= end);
518 assert(s == end || *s == '{' || *s == '}');
519 done:
520 if (literal_start != s) {
521 if (raw) {
522 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
523 s - literal_start,
524 NULL, NULL);
525 }
526 else {
527 *literal = decode_unicode_with_escapes(p, literal_start,
528 s - literal_start, t);
529 }
530 if (!*literal) {
531 return -1;
532 }
533 }
534 return result;
535 }
536
537 /* Forward declaration because parsing is recursive. */
538 static expr_ty
539 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
540 Token *first_token, Token* t, Token *last_token);
541
542 /* Parse the f-string at *str, ending at end. We know *str starts an
543 expression (so it must be a '{'). Returns the FormattedValue node, which
544 includes the expression, conversion character, format_spec expression, and
545 optionally the text of the expression (if = is used).
546
547 Note that I don't do a perfect job here: I don't make sure that a
548 closing brace doesn't match an opening paren, for example. It
549 doesn't need to error on all invalid expressions, just correctly
550 find the end of all valid ones. Any errors inside the expression
551 will be caught when we parse it later.
552
553 *expression is set to the expression. For an '=' "debug" expression,
554 *expr_text is set to the debug text (the original text of the expression,
555 including the '=' and any whitespace around it, as a string object). If
556 not a debug expression, *expr_text set to NULL. */
557 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)558 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
559 PyObject **expr_text, expr_ty *expression, Token *first_token,
560 Token *t, Token *last_token)
561 {
562 /* Return -1 on error, else 0. */
563
564 const char *expr_start;
565 const char *expr_end;
566 expr_ty simple_expression;
567 expr_ty format_spec = NULL; /* Optional format specifier. */
568 int conversion = -1; /* The conversion char. Use default if not
569 specified, or !r if using = and no format
570 spec. */
571
572 /* 0 if we're not in a string, else the quote char we're trying to
573 match (single or double quote). */
574 char quote_char = 0;
575
576 /* If we're inside a string, 1=normal, 3=triple-quoted. */
577 int string_type = 0;
578
579 /* Keep track of nesting level for braces/parens/brackets in
580 expressions. */
581 Py_ssize_t nested_depth = 0;
582 char parenstack[MAXLEVEL];
583
584 *expr_text = NULL;
585
586 /* Can only nest one level deep. */
587 if (recurse_lvl >= 2) {
588 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
589 goto error;
590 }
591
592 /* The first char must be a left brace, or we wouldn't have gotten
593 here. Skip over it. */
594 assert(**str == '{');
595 *str += 1;
596
597 expr_start = *str;
598 for (; *str < end; (*str)++) {
599 char ch;
600
601 /* Loop invariants. */
602 assert(nested_depth >= 0);
603 assert(*str >= expr_start && *str < end);
604 if (quote_char) {
605 assert(string_type == 1 || string_type == 3);
606 } else {
607 assert(string_type == 0);
608 }
609
610 ch = **str;
611 /* Nowhere inside an expression is a backslash allowed. */
612 if (ch == '\\') {
613 /* Error: can't include a backslash character, inside
614 parens or strings or not. */
615 RAISE_SYNTAX_ERROR(
616 "f-string expression part "
617 "cannot include a backslash");
618 goto error;
619 }
620 if (quote_char) {
621 /* We're inside a string. See if we're at the end. */
622 /* This code needs to implement the same non-error logic
623 as tok_get from tokenizer.c, at the letter_quote
624 label. To actually share that code would be a
625 nightmare. But, it's unlikely to change and is small,
626 so duplicate it here. Note we don't need to catch all
627 of the errors, since they'll be caught when parsing the
628 expression. We just need to match the non-error
629 cases. Thus we can ignore \n in single-quoted strings,
630 for example. Or non-terminated strings. */
631 if (ch == quote_char) {
632 /* Does this match the string_type (single or triple
633 quoted)? */
634 if (string_type == 3) {
635 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
636 /* We're at the end of a triple quoted string. */
637 *str += 2;
638 string_type = 0;
639 quote_char = 0;
640 continue;
641 }
642 } else {
643 /* We're at the end of a normal string. */
644 quote_char = 0;
645 string_type = 0;
646 continue;
647 }
648 }
649 } else if (ch == '\'' || ch == '"') {
650 /* Is this a triple quoted string? */
651 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
652 string_type = 3;
653 *str += 2;
654 } else {
655 /* Start of a normal string. */
656 string_type = 1;
657 }
658 /* Start looking for the end of the string. */
659 quote_char = ch;
660 } else if (ch == '[' || ch == '{' || ch == '(') {
661 if (nested_depth >= MAXLEVEL) {
662 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
663 goto error;
664 }
665 parenstack[nested_depth] = ch;
666 nested_depth++;
667 } else if (ch == '#') {
668 /* Error: can't include a comment character, inside parens
669 or not. */
670 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
671 goto error;
672 } else if (nested_depth == 0 &&
673 (ch == '!' || ch == ':' || ch == '}' ||
674 ch == '=' || ch == '>' || ch == '<')) {
675 /* See if there's a next character. */
676 if (*str+1 < end) {
677 char next = *(*str+1);
678
679 /* For "!=". since '=' is not an allowed conversion character,
680 nothing is lost in this test. */
681 if ((ch == '!' && next == '=') || /* != */
682 (ch == '=' && next == '=') || /* == */
683 (ch == '<' && next == '=') || /* <= */
684 (ch == '>' && next == '=') /* >= */
685 ) {
686 *str += 1;
687 continue;
688 }
689 }
690 /* Don't get out of the loop for these, if they're single
691 chars (not part of 2-char tokens). If by themselves, they
692 don't end an expression (unlike say '!'). */
693 if (ch == '>' || ch == '<') {
694 continue;
695 }
696
697 /* Normal way out of this loop. */
698 break;
699 } else if (ch == ']' || ch == '}' || ch == ')') {
700 if (!nested_depth) {
701 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
702 goto error;
703 }
704 nested_depth--;
705 int opening = (unsigned char)parenstack[nested_depth];
706 if (!((opening == '(' && ch == ')') ||
707 (opening == '[' && ch == ']') ||
708 (opening == '{' && ch == '}')))
709 {
710 RAISE_SYNTAX_ERROR(
711 "f-string: closing parenthesis '%c' "
712 "does not match opening parenthesis '%c'",
713 ch, opening);
714 goto error;
715 }
716 } else {
717 /* Just consume this char and loop around. */
718 }
719 }
720 expr_end = *str;
721 /* If we leave the above loop in a string or with mismatched parens, we
722 don't really care. We'll get a syntax error when compiling the
723 expression. But, we can produce a better error message, so let's just
724 do that.*/
725 if (quote_char) {
726 RAISE_SYNTAX_ERROR("f-string: unterminated string");
727 goto error;
728 }
729 if (nested_depth) {
730 int opening = (unsigned char)parenstack[nested_depth - 1];
731 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
732 goto error;
733 }
734
735 if (*str >= end) {
736 goto unexpected_end_of_string;
737 }
738
739 /* Compile the expression as soon as possible, so we show errors
740 related to the expression before errors related to the
741 conversion or format_spec. */
742 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
743 if (!simple_expression) {
744 goto error;
745 }
746
747 /* Check for =, which puts the text value of the expression in
748 expr_text. */
749 if (**str == '=') {
750 if (p->feature_version < 8) {
751 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
752 "only supported in Python 3.8 and greater");
753 goto error;
754 }
755 *str += 1;
756
757 /* Skip over ASCII whitespace. No need to test for end of string
758 here, since we know there's at least a trailing quote somewhere
759 ahead. */
760 while (Py_ISSPACE(**str)) {
761 *str += 1;
762 }
763 if (*str >= end) {
764 goto unexpected_end_of_string;
765 }
766 /* Set *expr_text to the text of the expression. */
767 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
768 if (!*expr_text) {
769 goto error;
770 }
771 }
772
773 /* Check for a conversion char, if present. */
774 if (**str == '!') {
775 *str += 1;
776 if (*str >= end) {
777 goto unexpected_end_of_string;
778 }
779
780 conversion = (unsigned char)**str;
781 *str += 1;
782
783 /* Validate the conversion. */
784 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
785 RAISE_SYNTAX_ERROR(
786 "f-string: invalid conversion character: "
787 "expected 's', 'r', or 'a'");
788 goto error;
789 }
790
791 }
792
793 /* Check for the format spec, if present. */
794 if (*str >= end) {
795 goto unexpected_end_of_string;
796 }
797 if (**str == ':') {
798 *str += 1;
799 if (*str >= end) {
800 goto unexpected_end_of_string;
801 }
802
803 /* Parse the format spec. */
804 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
805 first_token, t, last_token);
806 if (!format_spec) {
807 goto error;
808 }
809 }
810
811 if (*str >= end || **str != '}') {
812 goto unexpected_end_of_string;
813 }
814
815 /* We're at a right brace. Consume it. */
816 assert(*str < end);
817 assert(**str == '}');
818 *str += 1;
819
820 /* If we're in = mode (detected by non-NULL expr_text), and have no format
821 spec and no explicit conversion, set the conversion to 'r'. */
822 if (*expr_text && format_spec == NULL && conversion == -1) {
823 conversion = 'r';
824 }
825
826 /* And now create the FormattedValue node that represents this
827 entire expression with the conversion and format spec. */
828 //TODO: Fix this
829 *expression = _PyAST_FormattedValue(simple_expression, conversion,
830 format_spec, first_token->lineno,
831 first_token->col_offset,
832 last_token->end_lineno,
833 last_token->end_col_offset, p->arena);
834 if (!*expression) {
835 goto error;
836 }
837
838 return 0;
839
840 unexpected_end_of_string:
841 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
842 /* Falls through to error. */
843
844 error:
845 Py_XDECREF(*expr_text);
846 return -1;
847
848 }
849
850 /* Return -1 on error.
851
852 Return 0 if we have a literal (possible zero length) and an
853 expression (zero length if at the end of the string.
854
855 Return 1 if we have a literal, but no expression, and we want the
856 caller to call us again. This is used to deal with doubled
857 braces.
858
859 When called multiple times on the string 'a{{b{0}c', this function
860 will return:
861
862 1. the literal 'a{' with no expression, and a return value
863 of 1. Despite the fact that there's no expression, the return
864 value of 1 means we're not finished yet.
865
866 2. the literal 'b' and the expression '0', with a return value of
867 0. The fact that there's an expression means we're not finished.
868
869 3. literal 'c' with no expression and a return value of 0. The
870 combination of the return value of 0 with no expression means
871 we're finished.
872 */
873 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)874 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
875 int recurse_lvl, PyObject **literal,
876 PyObject **expr_text, expr_ty *expression,
877 Token *first_token, Token *t, Token *last_token)
878 {
879 int result;
880
881 assert(*literal == NULL && *expression == NULL);
882
883 /* Get any literal string. */
884 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
885 if (result < 0) {
886 goto error;
887 }
888
889 assert(result == 0 || result == 1);
890
891 if (result == 1) {
892 /* We have a literal, but don't look at the expression. */
893 return 1;
894 }
895
896 if (*str >= end || **str == '}') {
897 /* We're at the end of the string or the end of a nested
898 f-string: no expression. The top-level error case where we
899 expect to be at the end of the string but we're at a '}' is
900 handled later. */
901 return 0;
902 }
903
904 /* We must now be the start of an expression, on a '{'. */
905 assert(**str == '{');
906
907 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
908 expression, first_token, t, last_token) < 0) {
909 goto error;
910 }
911
912 return 0;
913
914 error:
915 Py_CLEAR(*literal);
916 return -1;
917 }
918
919 #ifdef NDEBUG
920 #define ExprList_check_invariants(l)
921 #else
922 static void
ExprList_check_invariants(ExprList * l)923 ExprList_check_invariants(ExprList *l)
924 {
925 /* Check our invariants. Make sure this object is "live", and
926 hasn't been deallocated. */
927 assert(l->size >= 0);
928 assert(l->p != NULL);
929 if (l->size <= EXPRLIST_N_CACHED) {
930 assert(l->data == l->p);
931 }
932 }
933 #endif
934
935 static void
ExprList_Init(ExprList * l)936 ExprList_Init(ExprList *l)
937 {
938 l->allocated = EXPRLIST_N_CACHED;
939 l->size = 0;
940
941 /* Until we start allocating dynamically, p points to data. */
942 l->p = l->data;
943
944 ExprList_check_invariants(l);
945 }
946
947 static int
ExprList_Append(ExprList * l,expr_ty exp)948 ExprList_Append(ExprList *l, expr_ty exp)
949 {
950 ExprList_check_invariants(l);
951 if (l->size >= l->allocated) {
952 /* We need to alloc (or realloc) the memory. */
953 Py_ssize_t new_size = l->allocated * 2;
954
955 /* See if we've ever allocated anything dynamically. */
956 if (l->p == l->data) {
957 Py_ssize_t i;
958 /* We're still using the cached data. Switch to
959 alloc-ing. */
960 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
961 if (!l->p) {
962 return -1;
963 }
964 /* Copy the cached data into the new buffer. */
965 for (i = 0; i < l->size; i++) {
966 l->p[i] = l->data[i];
967 }
968 } else {
969 /* Just realloc. */
970 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
971 if (!tmp) {
972 PyMem_Free(l->p);
973 l->p = NULL;
974 return -1;
975 }
976 l->p = tmp;
977 }
978
979 l->allocated = new_size;
980 assert(l->allocated == 2 * l->size);
981 }
982
983 l->p[l->size++] = exp;
984
985 ExprList_check_invariants(l);
986 return 0;
987 }
988
989 static void
ExprList_Dealloc(ExprList * l)990 ExprList_Dealloc(ExprList *l)
991 {
992 ExprList_check_invariants(l);
993
994 /* If there's been an error, or we've never dynamically allocated,
995 do nothing. */
996 if (!l->p || l->p == l->data) {
997 /* Do nothing. */
998 } else {
999 /* We have dynamically allocated. Free the memory. */
1000 PyMem_Free(l->p);
1001 }
1002 l->p = NULL;
1003 l->size = -1;
1004 }
1005
1006 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)1007 ExprList_Finish(ExprList *l, PyArena *arena)
1008 {
1009 asdl_expr_seq *seq;
1010
1011 ExprList_check_invariants(l);
1012
1013 /* Allocate the asdl_seq and copy the expressions in to it. */
1014 seq = _Py_asdl_expr_seq_new(l->size, arena);
1015 if (seq) {
1016 Py_ssize_t i;
1017 for (i = 0; i < l->size; i++) {
1018 asdl_seq_SET(seq, i, l->p[i]);
1019 }
1020 }
1021 ExprList_Dealloc(l);
1022 return seq;
1023 }
1024
1025 #ifdef NDEBUG
1026 #define FstringParser_check_invariants(state)
1027 #else
1028 static void
FstringParser_check_invariants(FstringParser * state)1029 FstringParser_check_invariants(FstringParser *state)
1030 {
1031 if (state->last_str) {
1032 assert(PyUnicode_CheckExact(state->last_str));
1033 }
1034 ExprList_check_invariants(&state->expr_list);
1035 }
1036 #endif
1037
1038 void
_PyPegen_FstringParser_Init(FstringParser * state)1039 _PyPegen_FstringParser_Init(FstringParser *state)
1040 {
1041 state->last_str = NULL;
1042 state->fmode = 0;
1043 ExprList_Init(&state->expr_list);
1044 FstringParser_check_invariants(state);
1045 }
1046
1047 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1048 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1049 {
1050 FstringParser_check_invariants(state);
1051
1052 Py_XDECREF(state->last_str);
1053 ExprList_Dealloc(&state->expr_list);
1054 }
1055
1056 /* Make a Constant node, but decref the PyUnicode object being added. */
1057 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1058 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1059 {
1060 PyObject *s = *str;
1061 PyObject *kind = NULL;
1062 *str = NULL;
1063 assert(PyUnicode_CheckExact(s));
1064 if (_PyArena_AddPyObject(p->arena, s) < 0) {
1065 Py_DECREF(s);
1066 return NULL;
1067 }
1068 const char* the_str = PyBytes_AsString(first_token->bytes);
1069 if (the_str && the_str[0] == 'u') {
1070 kind = _PyPegen_new_identifier(p, "u");
1071 }
1072
1073 if (kind == NULL && PyErr_Occurred()) {
1074 return NULL;
1075 }
1076
1077 return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1078 last_token->end_lineno, last_token->end_col_offset,
1079 p->arena);
1080
1081 }
1082
1083
1084 /* Add a non-f-string (that is, a regular literal string). str is
1085 decref'd. */
1086 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1087 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1088 {
1089 FstringParser_check_invariants(state);
1090
1091 assert(PyUnicode_CheckExact(str));
1092
1093 if (PyUnicode_GET_LENGTH(str) == 0) {
1094 Py_DECREF(str);
1095 return 0;
1096 }
1097
1098 if (!state->last_str) {
1099 /* We didn't have a string before, so just remember this one. */
1100 state->last_str = str;
1101 } else {
1102 /* Concatenate this with the previous string. */
1103 PyUnicode_AppendAndDel(&state->last_str, str);
1104 if (!state->last_str) {
1105 return -1;
1106 }
1107 }
1108 FstringParser_check_invariants(state);
1109 return 0;
1110 }
1111
1112 /* Parse an f-string. The f-string is in *str to end, with no
1113 'f' or quotes. */
1114 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1115 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1116 const char *end, int raw, int recurse_lvl,
1117 Token *first_token, Token* t, Token *last_token)
1118 {
1119 FstringParser_check_invariants(state);
1120 state->fmode = 1;
1121
1122 /* Parse the f-string. */
1123 while (1) {
1124 PyObject *literal = NULL;
1125 PyObject *expr_text = NULL;
1126 expr_ty expression = NULL;
1127
1128 /* If there's a zero length literal in front of the
1129 expression, literal will be NULL. If we're at the end of
1130 the f-string, expression will be NULL (unless result == 1,
1131 see below). */
1132 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1133 &literal, &expr_text,
1134 &expression, first_token, t, last_token);
1135 if (result < 0) {
1136 return -1;
1137 }
1138
1139 /* Add the literal, if any. */
1140 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1141 Py_XDECREF(expr_text);
1142 return -1;
1143 }
1144 /* Add the expr_text, if any. */
1145 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1146 return -1;
1147 }
1148
1149 /* We've dealt with the literal and expr_text, their ownership has
1150 been transferred to the state object. Don't look at them again. */
1151
1152 /* See if we should just loop around to get the next literal
1153 and expression, while ignoring the expression this
1154 time. This is used for un-doubling braces, as an
1155 optimization. */
1156 if (result == 1) {
1157 continue;
1158 }
1159
1160 if (!expression) {
1161 /* We're done with this f-string. */
1162 break;
1163 }
1164
1165 /* We know we have an expression. Convert any existing string
1166 to a Constant node. */
1167 if (state->last_str) {
1168 /* Convert the existing last_str literal to a Constant node. */
1169 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1170 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1171 return -1;
1172 }
1173 }
1174
1175 if (ExprList_Append(&state->expr_list, expression) < 0) {
1176 return -1;
1177 }
1178 }
1179
1180 /* If recurse_lvl is zero, then we must be at the end of the
1181 string. Otherwise, we must be at a right brace. */
1182
1183 if (recurse_lvl == 0 && *str < end-1) {
1184 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1185 return -1;
1186 }
1187 if (recurse_lvl != 0 && **str != '}') {
1188 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1189 return -1;
1190 }
1191
1192 FstringParser_check_invariants(state);
1193 return 0;
1194 }
1195
1196 /* Convert the partial state reflected in last_str and expr_list to an
1197 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1198 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1199 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1200 Token *last_token)
1201 {
1202 asdl_expr_seq *seq;
1203
1204 FstringParser_check_invariants(state);
1205
1206 /* If we're just a constant string with no expressions, return
1207 that. */
1208 if (!state->fmode) {
1209 assert(!state->expr_list.size);
1210 if (!state->last_str) {
1211 /* Create a zero length string. */
1212 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1213 if (!state->last_str) {
1214 goto error;
1215 }
1216 }
1217 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1218 }
1219
1220 /* Create a Constant node out of last_str, if needed. It will be the
1221 last node in our expression list. */
1222 if (state->last_str) {
1223 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1224 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1225 goto error;
1226 }
1227 }
1228 /* This has already been freed. */
1229 assert(state->last_str == NULL);
1230
1231 seq = ExprList_Finish(&state->expr_list, p->arena);
1232 if (!seq) {
1233 goto error;
1234 }
1235
1236 return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1237 last_token->end_lineno, last_token->end_col_offset,
1238 p->arena);
1239
1240 error:
1241 _PyPegen_FstringParser_Dealloc(state);
1242 return NULL;
1243 }
1244
1245 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1246 at end, parse it into an expr_ty. Return NULL on error. Adjust
1247 str to point past the parsed portion. */
1248 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1249 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1250 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1251 {
1252 FstringParser state;
1253
1254 _PyPegen_FstringParser_Init(&state);
1255 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1256 first_token, t, last_token) < 0) {
1257 _PyPegen_FstringParser_Dealloc(&state);
1258 return NULL;
1259 }
1260
1261 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1262 }
1263