#include <Python.h>
#include <errcode.h>

#include "tokenizer.h"
#include "pegen.h"

// TOKENIZER ERRORS

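// If tokenizer initialization left a pending LookupError, SyntaxError,
// ValueError or UnicodeDecodeError, re-raise it as a SyntaxError that carries
// the filename, so callers always see a consistent exception type.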
void
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
{
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
          || PyErr_ExceptionMatches(PyExc_ValueError)
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
        return;
    }
    PyObject *errstr = NULL;
    PyObject *tuple = NULL;
    PyObject *type;
    PyObject *value;
    PyObject *tback;
    PyErr_Fetch(&type, &value, &tback);
    errstr = PyObject_Str(value);
    if (!errstr) {
        goto error;
    }

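    // Build the SyntaxError details tuple (filename, lineno, offset, text)
    // with a dummy location: line 0, offset -1, and no source text.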
    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (!tmp) {
        goto error;
    }

    tuple = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!tuple) {
        goto error;
    }
    PyErr_SetObject(PyExc_SyntaxError, tuple);

error:
    Py_XDECREF(type);
    Py_XDECREF(value);
    Py_XDECREF(tback);
    Py_XDECREF(errstr);
    Py_XDECREF(tuple);
}

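// Report the innermost bracket that was left open, using the opening
// location that the tokenizer recorded in its paren*stack arrays.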
static inline void
raise_unclosed_parentheses_error(Parser *p) {
    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
    int error_col = p->tok->parencolstack[p->tok->level-1];
    RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
                               error_lineno, error_col, error_lineno, -1,
                               "'%c' was never closed",
                               p->tok->parenstack[p->tok->level-1]);
}

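// Translate the tokenizer's p->tok->done error code into an exception
// (SyntaxError, IndentationError, TabError, KeyboardInterrupt or MemoryError)
// and always return -1. The RAISE_* helpers used below are the error macros
// from pegen.h.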
int
_Pypegen_tokenizer_error(Parser *p)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown parsing error";
    }

    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}

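// Convert a pending UnicodeError or ValueError raised while decoding the
// source into a SyntaxError of the form "(unicode error) <details>" or
// "(value error) <details>". Always returns -1.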
int
_Pypegen_raise_decode_error(Parser *p)
{
    assert(PyErr_Occurred());
    const char *errtype = NULL;
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
        errtype = "unicode error";
    }
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
        errtype = "value error";
    }
    if (errtype) {
        PyObject *type;
        PyObject *value;
        PyObject *tback;
        PyObject *errstr;
        PyErr_Fetch(&type, &value, &tback);
        errstr = PyObject_Str(value);
        if (errstr) {
            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
            Py_DECREF(errstr);
        }
        else {
            PyErr_Clear();
            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
        }
        Py_XDECREF(type);
        Py_XDECREF(value);
        Py_XDECREF(tback);
    }

    return -1;
}

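// Returns -1 (with an exception set) if a tokenizer error with higher
// priority than the already-detected syntax error is found, 0 otherwise.
// Any exception that was pending on entry is preserved unless it gets
// replaced by a more specific one.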
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These get priority over
    // generic syntax errors only if they occur on an earlier line than
    // the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;

    for (;;) {
        const char *start;
        const char *end;
        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }

exit:
    if (PyErr_Occurred()) {
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}

// PARSER ERRORS

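// Entry point for the RAISE_SYNTAX_ERROR / RAISE_INDENTATION_ERROR macros in
// pegen.h: work out an error location from the last token seen (or from
// p->known_err_token, if set) and delegate to
// _PyPegen_raise_error_known_location(). Always returns NULL so callers can
// use it directly as an error result.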
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
{
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }

    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}

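// Fetch the text of line `lineno` from the tokenizer's in-memory buffers
// (interactive source or string input), decoded as UTF-8 with "replace",
// so it can be shown in the SyntaxError.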
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds, but the conditional that
        // follows lets release builds avoid crashing, at the cost of
        // reporting a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}

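// Raise `errtype` at an explicit location. The exception is set with args of
// the shape (msg, (filename, lineno, col, text, end_lineno, end_col)), which
// is the extended location form that SyntaxError's constructor accepts.
// For f-string input the message is prefixed with "f-string: " and the column
// offsets are adjusted by p->starting_col_offset.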
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

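    // CURRENT_POS is a sentinel meaning "use the tokenizer's current
    // position" for the end of the error range.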
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    if (p->start_rule == Py_fstring_input) {
        const char *fstring_msg = "f-string: ";
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);

        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into new buffer
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
        new_errmsg[len] = 0;
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    if (p->start_rule == Py_fstring_input) {
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    if (p->tok->encoding != NULL) {
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_offset < 0) {
                goto error;
            } else {
                end_col_number = end_col_offset;
            }
        }
    }
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}

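// Called when the parser fails without having set a more specific error:
// decide which exception best describes the failure, preferring tokenizer
// errors, then EOF/indentation problems, and falling back to a generic
// "invalid syntax" at the last token seen in the first pass.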
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors over custom syntax errors raised in the
        // second phase, but only if the existing error comes from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}