#include <Python.h>
#include "pycore_ast.h"           // _PyAST_Validate(),
#include "pycore_pystate.h"       // _PyThreadState_GET()
#include <errcode.h>

#include "tokenizer.h"
#include "pegen.h"

// Internal parser functions

asdl_stmt_seq*
_PyPegen_interactive_exit(Parser *p)
{
    if (p->errcode) {
        *(p->errcode) = E_EOF;
    }
    return NULL;
}

Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
{
    const char *str = PyUnicode_AsUTF8(line);
    if (!str) {
        return -1;
    }
    Py_ssize_t len = strlen(str);
    if (col_offset > len + 1) {
        col_offset = len + 1;
    }
    assert(col_offset >= 0);
    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
    if (!text) {
        return -1;
    }
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
    Py_DECREF(text);
    return size;
}
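
/* Worked example (illustrative, not from the original source): for the
 * line "héllo = 1", the UTF-8 bytes are 'h', 0xC3, 0xA9, 'l', 'l', 'o', ...
 * A byte col_offset of 3 covers "h\xc3\xa9", which decodes to two code
 * points, so the returned character offset is 2. The "replace" error
 * handler keeps a truncated multi-byte sequence from failing the decode. */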

// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
int
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
{
    // Insert in front
    Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
    if (m == NULL) {
        return -1;
    }
    m->type = type;
    m->node = node;
    m->mark = p->mark;
    m->next = p->tokens[mark]->memo;
    p->tokens[mark]->memo = m;
    return 0;
}

// Like _PyPegen_insert_memo(), but updates an existing node if found.
int
_PyPegen_update_memo(Parser *p, int mark, int type, void *node)
{
    for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
        if (m->type == type) {
            // Update existing node.
            m->node = node;
            m->mark = p->mark;
            return 0;
        }
    }
    // Insert new node.
    return _PyPegen_insert_memo(p, mark, type, node);
}
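
/* Usage sketch (hedged; the calls below are emitted by the generated
 * parser, not written by hand): after attempting rule R starting at token
 * index `mark`, the parser caches the outcome, including failures
 * (node == NULL):
 *
 *     _PyPegen_update_memo(p, mark, R_type, node);
 *
 * A later attempt of R at the same position hits the cache via
 * _PyPegen_is_memoized(), which restores the saved end position instead of
 * re-parsing. This is what keeps the PEG parser's backtracking tractable. */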

static int
init_normalization(Parser *p)
{
    if (p->normalize) {
        return 1;
    }
    PyObject *m = PyImport_ImportModule("unicodedata");
    if (!m)
    {
        return 0;
    }
    p->normalize = PyObject_GetAttrString(m, "normalize");
    Py_DECREF(m);
    if (!p->normalize)
    {
        return 0;
    }
    return 1;
}

static int
growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
    assert(initial_size > 0);
    arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
    arr->size = initial_size;
    arr->num_items = 0;

    return arr->items != NULL;
}

static int
growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
    if (arr->num_items >= arr->size) {
        size_t new_size = arr->size * 2;
        void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
        if (!new_items_array) {
            return 0;
        }
        arr->items = new_items_array;
        arr->size = new_size;
    }

    arr->items[arr->num_items].lineno = lineno;
    arr->items[arr->num_items].comment = comment;  // Take ownership
    arr->num_items++;
    return 1;
}
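
/* Note: doubling the capacity on overflow gives amortized O(1) appends.
 * With the initial size of 10 chosen in _PyPegen_Parser_New() below, the
 * array grows 10 -> 20 -> 40 -> ... When the realloc fails, the original
 * buffer is left intact and ownership of `comment` is not taken. */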

static void
growable_comment_array_deallocate(growable_comment_array *arr) {
    for (unsigned i = 0; i < arr->num_items; i++) {
        PyMem_Free(arr->items[i].comment);
    }
    PyMem_Free(arr->items);
}

static int
_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
{
    assert(name_len > 0);
    if (name_len >= p->n_keyword_lists ||
        p->keywords[name_len] == NULL ||
        p->keywords[name_len]->type == -1) {
        return NAME;
    }
    for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
        if (strncmp(k->str, name, name_len) == 0) {
            return k->type;
        }
    }
    return NAME;
}
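
/* The keyword table is generated from the grammar and indexed by keyword
 * length: p->keywords[n] points to an array of KeywordToken terminated by
 * a sentinel entry whose type is -1. As an illustrative sketch (the exact
 * contents come from the generated parser), p->keywords[2] would hold
 * entries for "if", "in", "is", ... so a two-character NAME only needs a
 * scan of that short list to be reclassified as a keyword token. */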

static int
initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
    assert(token != NULL);

    token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
    token->bytes = PyBytes_FromStringAndSize(start, end - start);
    if (token->bytes == NULL) {
        return -1;
    }

    if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
        Py_DECREF(token->bytes);
        return -1;
    }

    token->level = p->tok->level;

    const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
    int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
    int end_lineno = p->tok->lineno;

    int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
    int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;

    token->lineno = lineno;
    token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
    token->end_lineno = end_lineno;
    token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;

    p->fill += 1;

    if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
        return _Pypegen_raise_decode_error(p);
    }

    return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
}

static int
_resize_tokens_array(Parser *p) {
    int newsize = p->size * 2;
    Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
    if (new_tokens == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    p->tokens = new_tokens;

    for (int i = p->size; i < newsize; i++) {
        p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
        if (p->tokens[i] == NULL) {
            p->size = i;  // Needed, in order to cleanup correctly after parser fails
            PyErr_NoMemory();
            return -1;
        }
    }
    p->size = newsize;
    return 0;
}

int
_PyPegen_fill_token(Parser *p)
{
    const char *start;
    const char *end;
    int type = _PyTokenizer_Get(p->tok, &start, &end);

    // Record and skip '# type: ignore' comments
    while (type == TYPE_IGNORE) {
        Py_ssize_t len = end - start;
        char *tag = PyMem_Malloc(len + 1);
        if (tag == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        strncpy(tag, start, len);
        tag[len] = '\0';
        // Ownership of tag passes to the growable array
        if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
            PyErr_NoMemory();
            return -1;
        }
        type = _PyTokenizer_Get(p->tok, &start, &end);
    }

    // If we have reached the end and we are in single input mode, we need
    // to insert a newline and reset the parsing
    if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
        type = NEWLINE; /* Add an extra newline */
        p->parsing_started = 0;

        if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
            p->tok->pendin = -p->tok->indent;
            p->tok->indent = 0;
        }
    }
    else {
        p->parsing_started = 1;
    }

    // Check if we are at the limit of the token array capacity and resize if needed
    if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
        return -1;
    }

    Token *t = p->tokens[p->fill];
    return initialize_token(p, t, start, end, type);
}

#if defined(Py_DEBUG)
// Instrumentation to count the effectiveness of memoization.
// The array counts the number of tokens skipped by memoization,
// indexed by type.

#define NSTATISTICS 2000
static long memo_statistics[NSTATISTICS];

void
_PyPegen_clear_memo_statistics(void)
{
    for (int i = 0; i < NSTATISTICS; i++) {
        memo_statistics[i] = 0;
    }
}

PyObject *
_PyPegen_get_memo_statistics(void)
{
    PyObject *ret = PyList_New(NSTATISTICS);
    if (ret == NULL) {
        return NULL;
    }
    for (int i = 0; i < NSTATISTICS; i++) {
        PyObject *value = PyLong_FromLong(memo_statistics[i]);
        if (value == NULL) {
            Py_DECREF(ret);
            return NULL;
        }
        // PyList_SetItem borrows a reference to value.
        if (PyList_SetItem(ret, i, value) < 0) {
            Py_DECREF(ret);
            return NULL;
        }
    }
    return ret;
}
#endif
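
/* Interpretation note (not in the original source): each slot counts, per
 * memo type, how many tokens memoization let the parser skip; a cached
 * failure counts as one, as implemented in _PyPegen_is_memoized() below.
 * NSTATISTICS just needs to exceed the largest memo type the generated
 * parser uses. */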

int // bool
_PyPegen_is_memoized(Parser *p, int type, void *pres)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return -1;
        }
    }

    Token *t = p->tokens[p->mark];

    for (Memo *m = t->memo; m != NULL; m = m->next) {
        if (m->type == type) {
#if defined(Py_DEBUG)
            if (0 <= type && type < NSTATISTICS) {
                long count = m->mark - p->mark;
                // A memoized negative result counts for one.
                if (count <= 0) {
                    count = 1;
                }
                memo_statistics[type] += count;
            }
#endif
            p->mark = m->mark;
            *(void **)(pres) = m->node;
            return 1;
        }
    }
    return 0;
}
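
/* Return convention: 1 = cache hit (p->mark advanced, *pres set, possibly
 * to NULL for a memoized failure), 0 = no memo entry for this rule at this
 * position, -1 = hard error while fetching the next token. */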

int
_PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = (void*)func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}
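
/* These implement PEG lookaheads: the inner rule is tried, then p->mark is
 * restored so no input is consumed. Illustrative mapping (the actual call
 * sites live in the generated parser): a grammar item `&'('` becomes
 * something like
 *
 *     _PyPegen_lookahead_with_int(1, _PyPegen_expect_token, p, LPAR)
 *
 * while a negative lookahead `!'('` passes positive == 0 and succeeds only
 * when the inner parse fails. */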

Token *
_PyPegen_expect_token(Parser *p, int type)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        return NULL;
    }
    p->mark += 1;
    return t;
}

void*
_PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }
    if (result == NULL) {
        RAISE_SYNTAX_ERROR("expected (%s)", expected);
        return NULL;
    }
    return result;
}

Token *
_PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }

    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
        return NULL;
    }
    p->mark += 1;
    return t;
}

expr_ty
_PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != NAME) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    if (strcmp(s, keyword) != 0) {
        return NULL;
    }
    return _PyPegen_name_token(p);
}
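
/* Soft keywords ("match", "case", "_") are ordinary NAME tokens that only
 * act as keywords in specific grammar positions. This helper peeks at the
 * current token and consumes it (via _PyPegen_name_token) only on a match,
 * so e.g. `match = 1` still parses as a plain assignment elsewhere. */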

Token *
_PyPegen_get_last_nonnwhitespace_token(Parser *p)
{
    assert(p->mark >= 0);
    Token *token = NULL;
    for (int m = p->mark - 1; m >= 0; m--) {
        token = p->tokens[m];
        if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
            break;
        }
    }
    return token;
}

PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        PyObject *id2;
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *args[2] = {form, id};
        id2 = _PyObject_FastCall(p->normalize, args, 2);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }
        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    PyUnicode_InternInPlace(&id);
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}
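
/* Example of the NFKC step (PEP 3131): the ligature identifier "ﬁle"
 * (U+FB01 LATIN SMALL LIGATURE FI followed by "le") normalizes to "file",
 * so both spellings name the same variable. ASCII-only identifiers skip
 * the normalization (and the unicodedata import) entirely. */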

static expr_ty
_PyPegen_name_from_token(Parser *p, Token* t)
{
    if (t == NULL) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    PyObject *id = _PyPegen_new_identifier(p, s);
    if (id == NULL) {
        p->error_indicator = 1;
        return NULL;
    }
    return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
                       t->end_col_offset, p->arena);
}

expr_ty
_PyPegen_name_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NAME);
    return _PyPegen_name_from_token(p, t);
}

void *
_PyPegen_string_token(Parser *p)
{
    return _PyPegen_expect_token(p, STRING);
}

expr_ty _PyPegen_soft_keyword_token(Parser *p) {
    Token *t = _PyPegen_expect_token(p, NAME);
    if (t == NULL) {
        return NULL;
    }
    char *the_token;
    Py_ssize_t size;
    // Check the return value: t->bytes could conceivably be invalid here.
    if (PyBytes_AsStringAndSize(t->bytes, &the_token, &size) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
        if (strncmp(*keyword, the_token, size) == 0) {
            return _PyPegen_name_from_token(p, t);
        }
    }
    return NULL;
}

static PyObject *
parsenumber_raw(const char *s)
{
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        if (errno != 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}
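
/* Reading guide (not in the original source): base 0 makes PyOS_strtoul
 * accept "0x", "0o" and "0b" prefixes, which is why a leading '0' routes
 * through the unsigned path; a "negative" result with errno still 0 means
 * the value overflowed a long and is re-parsed as an arbitrary-precision
 * int via PyLong_FromString. If strtol/strtoul stopped before the end of
 * the string, the literal is a float or (with a 'j'/'J' suffix) complex. */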

static PyObject *
parsenumber(const char *s)
{
    char *dup;
    char *end;
    PyObject *res = NULL;

    assert(s != NULL);

    if (strchr(s, '_') == NULL) {
        return parsenumber_raw(s);
    }
    /* Create a duplicate without underscores. */
    dup = PyMem_Malloc(strlen(s) + 1);
    if (dup == NULL) {
        return PyErr_NoMemory();
    }
    end = dup;
    for (; *s; s++) {
        if (*s != '_') {
            *end++ = *s;
        }
    }
    *end = '\0';
    res = parsenumber_raw(dup);
    PyMem_Free(dup);
    return res;
}
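
/* Example: the PEP 515 literal "1_000_000" is copied to "1000000" before
 * parsing. Underscore placement has already been validated by the
 * tokenizer, so stripping every '_' here is safe. */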

expr_ty
_PyPegen_number_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }

    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }

    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }

    PyObject *c = parsenumber(num_raw);

    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->curexc_type == PyExc_ValueError &&
            tstate->curexc_value != NULL) {
            PyObject *type, *value, *tb;
            // This acts as PyErr_Clear() as we're replacing curexc.
            PyErr_Fetch(&type, &value, &tb);
            Py_XDECREF(tb);
            Py_DECREF(type);
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                value);
            Py_DECREF(value);
        }
        return NULL;
    }

    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }

    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}

/* Check that the source for a single input statement really is a single
   statement by looking at what is left in the buffer after parsing.
   Trailing whitespace and comments are OK. */
static int // bool
bad_single_statement(Parser *p)
{
    char *cur = p->tok->cur;
    char c = *cur;

    for (;;) {
        while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
            c = *++cur;
        }

        if (!c) {
            return 0;
        }

        if (c != '#') {
            return 1;
        }

        /* Suck up comment. */
        while (c && c != '\n') {
            c = *++cur;
        }
    }
}
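
/* Illustrative behaviour: after parsing "x = 1\ny = 2\n" in single-input
 * mode, the buffer still holds "y = 2", whose first non-blank character is
 * not '#', so this returns 1 and the caller raises "multiple statements
 * found ...". Leftover text of "  # comment\n" would be consumed and the
 * function would return 0. */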

static int
compute_parser_flags(PyCompilerFlags *flags)
{
    int parser_flags = 0;
    if (!flags) {
        return 0;
    }
    if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
        parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
    }
    if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
        parser_flags |= PyPARSE_IGNORE_COOKIE;
    }
    if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
        parser_flags |= PyPARSE_BARRY_AS_BDFL;
    }
    if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
        parser_flags |= PyPARSE_TYPE_COMMENTS;
    }
    if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
        parser_flags |= PyPARSE_ASYNC_HACKS;
    }
    if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
        parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
    }
    return parser_flags;
}

// Parser API

Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    Parser *p = PyMem_Malloc(sizeof(Parser));
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    assert(tok != NULL);
    tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
    tok->async_hacks = (flags & PyPARSE_ASYNC_HACKS) > 0;
    p->tok = tok;
    p->keywords = NULL;
    p->n_keyword_lists = -1;
    p->soft_keywords = NULL;
    p->tokens = PyMem_Malloc(sizeof(Token *));
    if (!p->tokens) {
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    if (!p->tokens[0]) {
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
        PyMem_Free(p->tokens[0]);
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }

    p->mark = 0;
    p->fill = 0;
    p->size = 1;

    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->parsing_started = 0;
    p->normalize = NULL;
    p->error_indicator = 0;

    p->starting_lineno = 0;
    p->starting_col_offset = 0;
    p->flags = flags;
    p->feature_version = feature_version;
    p->known_err_token = NULL;
    p->level = 0;
    p->call_invalid_rules = 0;
    return p;
}

void
_PyPegen_Parser_Free(Parser *p)
{
    Py_XDECREF(p->normalize);
    for (int i = 0; i < p->size; i++) {
        PyMem_Free(p->tokens[i]);
    }
    PyMem_Free(p->tokens);
    growable_comment_array_deallocate(&p->type_ignore_comments);
    PyMem_Free(p);
}

static void
reset_parser_state_for_error_pass(Parser *p)
{
    for (int i = 0; i < p->fill; i++) {
        p->tokens[i]->memo = NULL;
    }
    p->mark = 0;
    p->call_invalid_rules = 1;
    // Don't try to get extra tokens in interactive mode when trying to
    // raise specialized errors in the second pass.
    p->tok->interactive_underflow = IUNDERFLOW_STOP;
}
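
/* Dropping the memo list heads is enough to invalidate the cache: the Memo
 * structs themselves are arena-allocated, so they are reclaimed with the
 * arena rather than freed here. The second pass then re-parses from token
 * 0 with the (slower) invalid_* grammar rules enabled. */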

static inline int
_is_end_of_source(Parser *p) {
    int err = p->tok->done;
    return err == E_EOF || err == E_EOFS || err == E_EOLS;
}

void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
            PyErr_Clear();
            return RAISE_SYNTAX_ERROR("incomplete input");
        }
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
        return NULL;
    }

    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}

mod_ty
_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
                                      const char *enc, const char *ps1, const char *ps2,
                                      PyCompilerFlags *flags, int *errcode, PyArena *arena)
{
    struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
            return NULL;
        }
        return NULL;
    }
    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
        tok->fp_interactive = 1;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = filename_ob;
    Py_INCREF(filename_ob);

    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
                                    errcode, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

error:
    _PyTokenizer_Free(tok);
    return result;
}

mod_ty
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
                                PyCompilerFlags *flags, PyArena *arena)
{
    int exec_input = start_rule == Py_file_input;

    struct tok_state *tok;
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
        tok = _PyTokenizer_FromUTF8(str, exec_input);
    } else {
        tok = _PyTokenizer_FromString(str, exec_input);
    }
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = filename_ob;
    Py_INCREF(filename_ob);

    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
        flags->cf_feature_version : PY_MINOR_VERSION;
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
                                    NULL, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

error:
    _PyTokenizer_Free(tok);
    return result;
}
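
/* End-to-end usage sketch (hedged: assumes an initialized interpreter and
 * a live PyArena; error handling trimmed for brevity):
 *
 *     PyArena *arena = _PyArena_New();
 *     PyObject *filename = PyUnicode_FromString("<string>");
 *     mod_ty mod = _PyPegen_run_parser_from_string(
 *         "x = 1\n", Py_file_input, filename, NULL, arena);
 *     // The returned AST is arena-allocated: freeing the arena frees it.
 *     Py_DECREF(filename);  // the tokenizer holds its own reference
 *     _PyArena_Free(arena);
 */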