1 #include <Python.h>
2 #include <errcode.h>
3
4 #include "tokenizer.h"
5 #include "pegen.h"
6
7 // TOKENIZER ERRORS
8
9 void
_PyPegen_raise_tokenizer_init_error(PyObject * filename)10 _PyPegen_raise_tokenizer_init_error(PyObject *filename)
11 {
12 if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13 || PyErr_ExceptionMatches(PyExc_SyntaxError)
14 || PyErr_ExceptionMatches(PyExc_ValueError)
15 || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16 return;
17 }
18 PyObject *errstr = NULL;
19 PyObject *tuple = NULL;
20 PyObject *type;
21 PyObject *value;
22 PyObject *tback;
23 PyErr_Fetch(&type, &value, &tback);
24 errstr = PyObject_Str(value);
25 if (!errstr) {
26 goto error;
27 }
28
29 PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30 if (!tmp) {
31 goto error;
32 }
33
34 tuple = PyTuple_Pack(2, errstr, tmp);
35 Py_DECREF(tmp);
36 if (!value) {
37 goto error;
38 }
39 PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41 error:
42 Py_XDECREF(type);
43 Py_XDECREF(value);
44 Py_XDECREF(tback);
45 Py_XDECREF(errstr);
46 Py_XDECREF(tuple);
47 }
48
49 static inline void
raise_unclosed_parentheses_error(Parser * p)50 raise_unclosed_parentheses_error(Parser *p) {
51 int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52 int error_col = p->tok->parencolstack[p->tok->level-1];
53 RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54 error_lineno, error_col, error_lineno, -1,
55 "'%c' was never closed",
56 p->tok->parenstack[p->tok->level-1]);
57 }
58
59 int
_Pypegen_tokenizer_error(Parser * p)60 _Pypegen_tokenizer_error(Parser *p)
61 {
62 if (PyErr_Occurred()) {
63 return -1;
64 }
65
66 const char *msg = NULL;
67 PyObject* errtype = PyExc_SyntaxError;
68 Py_ssize_t col_offset = -1;
69 switch (p->tok->done) {
70 case E_TOKEN:
71 msg = "invalid token";
72 break;
73 case E_EOF:
74 if (p->tok->level) {
75 raise_unclosed_parentheses_error(p);
76 } else {
77 RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
78 }
79 return -1;
80 case E_DEDENT:
81 RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
82 return -1;
83 case E_INTR:
84 if (!PyErr_Occurred()) {
85 PyErr_SetNone(PyExc_KeyboardInterrupt);
86 }
87 return -1;
88 case E_NOMEM:
89 PyErr_NoMemory();
90 return -1;
91 case E_TABSPACE:
92 errtype = PyExc_TabError;
93 msg = "inconsistent use of tabs and spaces in indentation";
94 break;
95 case E_TOODEEP:
96 errtype = PyExc_IndentationError;
97 msg = "too many levels of indentation";
98 break;
99 case E_LINECONT: {
100 col_offset = p->tok->cur - p->tok->buf - 1;
101 msg = "unexpected character after line continuation character";
102 break;
103 }
104 default:
105 msg = "unknown parsing error";
106 }
107
108 RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
109 col_offset >= 0 ? col_offset : 0,
110 p->tok->lineno, -1, msg);
111 return -1;
112 }
113
114 int
_Pypegen_raise_decode_error(Parser * p)115 _Pypegen_raise_decode_error(Parser *p)
116 {
117 assert(PyErr_Occurred());
118 const char *errtype = NULL;
119 if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120 errtype = "unicode error";
121 }
122 else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123 errtype = "value error";
124 }
125 if (errtype) {
126 PyObject *type;
127 PyObject *value;
128 PyObject *tback;
129 PyObject *errstr;
130 PyErr_Fetch(&type, &value, &tback);
131 errstr = PyObject_Str(value);
132 if (errstr) {
133 RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134 Py_DECREF(errstr);
135 }
136 else {
137 PyErr_Clear();
138 RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139 }
140 Py_XDECREF(type);
141 Py_XDECREF(value);
142 Py_XDECREF(tback);
143 }
144
145 return -1;
146 }
147
148 static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser * p)149 _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
150 // Tokenize the whole input to see if there are any tokenization
151 // errors such as mistmatching parentheses. These will get priority
152 // over generic syntax errors only if the line number of the error is
153 // before the one that we had for the generic error.
154
155 // We don't want to tokenize to the end for interactive input
156 if (p->tok->prompt != NULL) {
157 return 0;
158 }
159
160 PyObject *type, *value, *traceback;
161 PyErr_Fetch(&type, &value, &traceback);
162
163 Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
164 Py_ssize_t current_err_line = current_token->lineno;
165
166 int ret = 0;
167
168 for (;;) {
169 const char *start;
170 const char *end;
171 switch (_PyTokenizer_Get(p->tok, &start, &end)) {
172 case ERRORTOKEN:
173 if (PyErr_Occurred()) {
174 ret = -1;
175 goto exit;
176 }
177 if (p->tok->level != 0) {
178 int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
179 if (current_err_line > error_lineno) {
180 raise_unclosed_parentheses_error(p);
181 ret = -1;
182 goto exit;
183 }
184 }
185 break;
186 case ENDMARKER:
187 break;
188 default:
189 continue;
190 }
191 break;
192 }
193
194
195 exit:
196 if (PyErr_Occurred()) {
197 Py_XDECREF(value);
198 Py_XDECREF(type);
199 Py_XDECREF(traceback);
200 } else {
201 PyErr_Restore(type, value, traceback);
202 }
203 return ret;
204 }
205
206 // PARSER ERRORS
207
208 void *
_PyPegen_raise_error(Parser * p,PyObject * errtype,const char * errmsg,...)209 _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
210 {
211 if (p->fill == 0) {
212 va_list va;
213 va_start(va, errmsg);
214 _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
215 va_end(va);
216 return NULL;
217 }
218
219 Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
220 Py_ssize_t col_offset;
221 Py_ssize_t end_col_offset = -1;
222 if (t->col_offset == -1) {
223 if (p->tok->cur == p->tok->buf) {
224 col_offset = 0;
225 } else {
226 const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
227 col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
228 }
229 } else {
230 col_offset = t->col_offset + 1;
231 }
232
233 if (t->end_col_offset != -1) {
234 end_col_offset = t->end_col_offset + 1;
235 }
236
237 va_list va;
238 va_start(va, errmsg);
239 _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
240 va_end(va);
241
242 return NULL;
243 }
244
245 static PyObject *
get_error_line_from_tokenizer_buffers(Parser * p,Py_ssize_t lineno)246 get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
247 {
248 /* If the file descriptor is interactive, the source lines of the current
249 * (multi-line) statement are stored in p->tok->interactive_src_start.
250 * If not, we're parsing from a string, which means that the whole source
251 * is stored in p->tok->str. */
252 assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
253
254 char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
255 if (cur_line == NULL) {
256 assert(p->tok->fp_interactive);
257 // We can reach this point if the tokenizer buffers for interactive source have not been
258 // initialized because we failed to decode the original source with the given locale.
259 return PyUnicode_FromStringAndSize("", 0);
260 }
261
262 Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
263 const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
264
265 for (int i = 0; i < relative_lineno - 1; i++) {
266 char *new_line = strchr(cur_line, '\n');
267 // The assert is here for debug builds but the conditional that
268 // follows is there so in release builds we do not crash at the cost
269 // to report a potentially wrong line.
270 assert(new_line != NULL && new_line + 1 < buf_end);
271 if (new_line == NULL || new_line + 1 > buf_end) {
272 break;
273 }
274 cur_line = new_line + 1;
275 }
276
277 char *next_newline;
278 if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
279 next_newline = cur_line + strlen(cur_line);
280 }
281 return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
282 }
283
284 void *
_PyPegen_raise_error_known_location(Parser * p,PyObject * errtype,Py_ssize_t lineno,Py_ssize_t col_offset,Py_ssize_t end_lineno,Py_ssize_t end_col_offset,const char * errmsg,va_list va)285 _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
286 Py_ssize_t lineno, Py_ssize_t col_offset,
287 Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
288 const char *errmsg, va_list va)
289 {
290 PyObject *value = NULL;
291 PyObject *errstr = NULL;
292 PyObject *error_line = NULL;
293 PyObject *tmp = NULL;
294 p->error_indicator = 1;
295
296 if (end_lineno == CURRENT_POS) {
297 end_lineno = p->tok->lineno;
298 }
299 if (end_col_offset == CURRENT_POS) {
300 end_col_offset = p->tok->cur - p->tok->line_start;
301 }
302
303 if (p->start_rule == Py_fstring_input) {
304 const char *fstring_msg = "f-string: ";
305 Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
306
307 char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
308 if (!new_errmsg) {
309 return (void *) PyErr_NoMemory();
310 }
311
312 // Copy both strings into new buffer
313 memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
314 memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
315 new_errmsg[len] = 0;
316 errmsg = new_errmsg;
317 }
318 errstr = PyUnicode_FromFormatV(errmsg, va);
319 if (!errstr) {
320 goto error;
321 }
322
323 if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
324 error_line = get_error_line_from_tokenizer_buffers(p, lineno);
325 }
326 else if (p->start_rule == Py_file_input) {
327 error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
328 (int) lineno, p->tok->encoding);
329 }
330
331 if (!error_line) {
332 /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
333 then we need to find the error line from some other source, because
334 p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
335 failed or we're parsing from a string or the REPL. There's a third edge case where
336 we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
337 `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
338 does not physically exist */
339 assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
340
341 if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
342 Py_ssize_t size = p->tok->inp - p->tok->buf;
343 error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
344 }
345 else if (p->tok->fp == NULL || p->tok->fp == stdin) {
346 error_line = get_error_line_from_tokenizer_buffers(p, lineno);
347 }
348 else {
349 error_line = PyUnicode_FromStringAndSize("", 0);
350 }
351 if (!error_line) {
352 goto error;
353 }
354 }
355
356 if (p->start_rule == Py_fstring_input) {
357 col_offset -= p->starting_col_offset;
358 end_col_offset -= p->starting_col_offset;
359 }
360
361 Py_ssize_t col_number = col_offset;
362 Py_ssize_t end_col_number = end_col_offset;
363
364 if (p->tok->encoding != NULL) {
365 col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
366 if (col_number < 0) {
367 goto error;
368 }
369 if (end_col_number > 0) {
370 Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
371 if (end_col_offset < 0) {
372 goto error;
373 } else {
374 end_col_number = end_col_offset;
375 }
376 }
377 }
378 tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
379 if (!tmp) {
380 goto error;
381 }
382 value = PyTuple_Pack(2, errstr, tmp);
383 Py_DECREF(tmp);
384 if (!value) {
385 goto error;
386 }
387 PyErr_SetObject(errtype, value);
388
389 Py_DECREF(errstr);
390 Py_DECREF(value);
391 if (p->start_rule == Py_fstring_input) {
392 PyMem_Free((void *)errmsg);
393 }
394 return NULL;
395
396 error:
397 Py_XDECREF(errstr);
398 Py_XDECREF(error_line);
399 if (p->start_rule == Py_fstring_input) {
400 PyMem_Free((void *)errmsg);
401 }
402 return NULL;
403 }
404
405 void
_Pypegen_set_syntax_error(Parser * p,Token * last_token)406 _Pypegen_set_syntax_error(Parser* p, Token* last_token) {
407 // Existing sintax error
408 if (PyErr_Occurred()) {
409 // Prioritize tokenizer errors to custom syntax errors raised
410 // on the second phase only if the errors come from the parser.
411 int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
412 if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
413 _PyPegen_tokenize_full_source_to_check_for_errors(p);
414 }
415 // Propagate the existing syntax error.
416 return;
417 }
418 // Initialization error
419 if (p->fill == 0) {
420 RAISE_SYNTAX_ERROR("error at start before reading any input");
421 }
422 // Parser encountered EOF (End of File) unexpectedtly
423 if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
424 if (p->tok->level) {
425 raise_unclosed_parentheses_error(p);
426 } else {
427 RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
428 }
429 return;
430 }
431 // Indentation error in the tokenizer
432 if (last_token->type == INDENT || last_token->type == DEDENT) {
433 RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
434 return;
435 }
436 // Unknown error (generic case)
437
438 // Use the last token we found on the first pass to avoid reporting
439 // incorrect locations for generic syntax errors just because we reached
440 // further away when trying to find specific syntax errors in the second
441 // pass.
442 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
443 // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
444 // generic SyntaxError we just raised if errors are found.
445 _PyPegen_tokenize_full_source_to_check_for_errors(p);
446 }
447