1
2 /* Tokenizer implementation */
3
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include "pycore_call.h" // _PyObject_CallNoArgs()
7
8 #include <ctype.h>
9 #include <assert.h>
10
11 #include "tokenizer.h"
12 #include "errcode.h"
13
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "abstract.h"
18
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
21
22 #define is_potential_identifier_start(c) (\
23 (c >= 'a' && c <= 'z')\
24 || (c >= 'A' && c <= 'Z')\
25 || c == '_'\
26 || (c >= 128))
27
28 #define is_potential_identifier_char(c) (\
29 (c >= 'a' && c <= 'z')\
30 || (c >= 'A' && c <= 'Z')\
31 || (c >= '0' && c <= '9')\
32 || c == '_'\
33 || (c >= 128))
34
35
36 /* Don't ever change this -- it would break the portability of Python code */
37 #define TABSIZE 8
38
39 /* Forward */
40 static struct tok_state *tok_new(void);
41 static int tok_nextc(struct tok_state *tok);
42 static void tok_backup(struct tok_state *tok, int c);
43 static int syntaxerror(struct tok_state *tok, const char *format, ...);
44
45 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46 tokenizing. */
47 static const char* type_comment_prefix = "# type: ";
48
49 /* Create and initialize a new tok_state structure */
50
51 static struct tok_state *
tok_new(void)52 tok_new(void)
53 {
54 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55 sizeof(struct tok_state));
56 if (tok == NULL)
57 return NULL;
58 tok->buf = tok->cur = tok->inp = NULL;
59 tok->fp_interactive = 0;
60 tok->interactive_src_start = NULL;
61 tok->interactive_src_end = NULL;
62 tok->start = NULL;
63 tok->end = NULL;
64 tok->done = E_OK;
65 tok->fp = NULL;
66 tok->input = NULL;
67 tok->tabsize = TABSIZE;
68 tok->indent = 0;
69 tok->indstack[0] = 0;
70 tok->atbol = 1;
71 tok->pendin = 0;
72 tok->prompt = tok->nextprompt = NULL;
73 tok->lineno = 0;
74 tok->level = 0;
75 tok->altindstack[0] = 0;
76 tok->decoding_state = STATE_INIT;
77 tok->decoding_erred = 0;
78 tok->enc = NULL;
79 tok->encoding = NULL;
80 tok->cont_line = 0;
81 tok->filename = NULL;
82 tok->decoding_readline = NULL;
83 tok->decoding_buffer = NULL;
84 tok->type_comments = 0;
85 tok->async_hacks = 0;
86 tok->async_def = 0;
87 tok->async_def_indent = 0;
88 tok->async_def_nl = 0;
89 tok->interactive_underflow = IUNDERFLOW_NORMAL;
90 tok->str = NULL;
91 tok->report_warnings = 1;
92 return tok;
93 }
94
95 static char *
new_string(const char * s,Py_ssize_t len,struct tok_state * tok)96 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
97 {
98 char* result = (char *)PyMem_Malloc(len + 1);
99 if (!result) {
100 tok->done = E_NOMEM;
101 return NULL;
102 }
103 memcpy(result, s, len);
104 result[len] = '\0';
105 return result;
106 }
107
108 static char *
error_ret(struct tok_state * tok)109 error_ret(struct tok_state *tok) /* XXX */
110 {
111 tok->decoding_erred = 1;
112 if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
113 PyMem_Free(tok->buf);
114 tok->buf = tok->cur = tok->inp = NULL;
115 tok->start = NULL;
116 tok->end = NULL;
117 tok->done = E_DECODE;
118 return NULL; /* as if it were EOF */
119 }
120
121
/* Normalize an encoding name: return "utf-8" for any UTF-8 spelling,
   "iso-8859-1" for the Latin-1 aliases, and S itself otherwise.
   Only the first 12 characters of S are examined; '_' is mapped to
   '-' and letters are lowercased before comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast via unsigned char: passing a negative plain-char
               value to tolower() is undefined behavior (CERT STR37-C). */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
150
151 /* Return the coding spec in S, or NULL if none is found. */
152
/* Find a PEP 263 "coding: <name>" declaration in line S (length SIZE).
   On return, *SPEC is a heap copy of the normalized encoding name, or
   NULL if the line carries no coding spec.  Returns 0 only on memory
   failure (tok->done is set by new_string), 1 otherwise. */
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            /* "coding" must be followed by ':' or '=' (PEP 263). */
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip the separator and any run of spaces/tabs. */
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            /* Encoding name: alphanumerics plus '-', '_', '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                /* Canonicalize aliases such as "utf_8" / "latin-1". */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
201
202 /* Check whether the line contains a coding spec. If it does,
203 invoke the set_readline function for the new encoding.
204 This function receives the tok_state and the new encoding.
205 Return 1 on success, 0 on failure. */
206
/* Check LINE (length SIZE) for a coding spec and, when one is found,
   switch the input to that encoding via SET_READLINE.  Ownership of
   the spec string transfers to tok->encoding on success.  Returns 1
   on success, 0 on failure (exception set / tok->done updated). */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* Non-UTF-8 input must go through a decoding readline;
           UTF-8 needs no recoding. */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        /* tok->encoding takes ownership of cs. */
        tok->encoding = cs;
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declaration must agree. */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
256
257 /* See whether the file starts with a BOM. If it does,
258 invoke the set_readline function with the new encoding.
259 Return 1 on success, 0 on failure. */
260
/* Look for a UTF-8 BOM (EF BB BF) at the start of the input.  When
   found, consume it and force tok->encoding to "utf-8"; otherwise
   push the read bytes back (in reverse order) so tokenizing starts
   from the first byte.  Returns 1 on success, 0 on memory failure. */
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            /* Not a BOM: unread both bytes, last-read first. */
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    /* Full BOM consumed: the file is UTF-8 by definition. */
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
298
/* Append LINE to the accumulated interactive source buffer
   (tok->interactive_src_start .. interactive_src_end), faking a
   trailing '\n' if the line lacks one.  Returns 0 on success, -1 on
   memory failure (the whole accumulated buffer is released and
   E_NOMEM recorded). */
static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    /* For an empty line this reads line[0] == '\0', which correctly
       takes the "fake a newline" path below. */
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
    if (last_char != '\n') {
        line_size += 1;
    }
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        /* realloc failed: the original block is still live; free it
           and drop the whole accumulated source. */
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    strcpy(new_str + current_size, line);
    if (last_char != '\n') {
        /* Last line does not end in \n, fake one */
        new_str[current_size + line_size - 1] = '\n';
        new_str[current_size + line_size] = '\0';
    }
    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}
335
336
337 /* Read a line of text from TOK into S, using the stream in TOK.
338 Return NULL on failure, else S.
339
340 On entry, tok->decoding_buffer will be one of:
341 1) NULL: need to call tok->decoding_readline to get a new line
342 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
343 stored the result in tok->decoding_buffer
344 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
345 (in the s buffer) to copy entire contents of the line read
346 by tok->decoding_readline. tok->decoding_buffer has the overflow.
347 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
348 until the buffer ends with a '\n' (or until the end of the file is
349 reached): see tok_nextc and its calls to tok_reserve_buf.
350 */
351
/* Ensure at least SIZE more bytes fit between tok->inp and tok->end,
   growing the buffer geometrically (at least +50% of current data).
   Every pointer into the buffer (cur, inp, start, line_start,
   multi_line_start) is rebased onto the reallocated block.  Returns
   1 on success, 0 on memory failure (tok->done = E_NOMEM). */
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Save pointer positions as offsets; -1 encodes NULL. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        /* Rebase every saved offset onto the new allocation. */
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
    }
    return 1;
}
378
/* Report whether any of the first SIZE bytes of STR is a NUL byte. */
static inline int
contains_null_bytes(const char* str, size_t size) {
    const void *first_nul = memchr(str, '\0', size);
    return first_nul != NULL;
}
383
/* Read one decoded line via tok->decoding_readline (or consume a
   pending tok->decoding_buffer), re-encode it as UTF-8, and append it
   to the tokenizer buffer at tok->inp.  Returns 1 on success, 0 on
   failure (error state already recorded via error_ret/tok->done). */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        /* Take ownership of the previously buffered line. */
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
425
426 /* Set the readline function for TOK to a StreamReader's
427 readline function. The StreamReader is named ENC.
428
429 This function is called from check_bom and check_coding_spec.
430
431 ENC is usually identical to the future value of tok->encoding,
432 except for the (currently unsupported) case of UTF-16.
433
434 Return 1 on success, 0 on failure. */
435
/* Attach a decoding readline (io.open(..., encoding=ENC).readline)
   to TOK, repositioned at the current read position of tok->fp.
   Returns 1 on success, 0 on failure with an exception set. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModule("io");
    if (io == NULL) {
        return 0;
    }
    /* io.open(fd, "r", -1, enc, None, None, False): text stream over
       the raw fd; the final False keeps fd ownership with us. */
    stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO",
                                  fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Discard the partial line produced by stepping back one byte. */
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}
484
485 /* Fetch the next byte from TOK. */
486
fp_getc(struct tok_state * tok)487 static int fp_getc(struct tok_state *tok) {
488 return getc(tok->fp);
489 }
490
491 /* Unfetch the last byte back into TOK. */
492
static void fp_ungetc(int c, struct tok_state *tok) {
    /* Push C back so the next fp_getc() returns it again. */
    ungetc(c, tok->fp);
}
496
497 /* Check whether the characters at s start a valid
498 UTF-8 sequence. Return the number of characters forming
499 the sequence if yes, 0 if not. The special cases match
500 those in stringlib/codecs.h:utf8_decode.
501 */
/* Check whether the bytes at S begin a well-formed UTF-8 sequence.
   Returns the sequence length (1..4) when valid, 0 when invalid.
   The rejected special cases mirror stringlib/codecs.h:utf8_decode:
   bare continuation bytes, overlong encodings, surrogates
   (D800-DFFF) and code points above 10FFFF. */
static int
valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int follow;     /* number of continuation bytes expected */

    if (lead < 0x80) {
        /* ASCII: single byte. */
        return 1;
    }
    if (lead < 0xE0) {
        /* Two-byte form \xC2\x80-\xDF\xBF (0080-07FF). */
        if (lead < 0xC2) {
            /* \x80-\xBF is a bare continuation byte;
               \xC0-\xC1 would be an overlong 0000-007F. */
            return 0;
        }
        follow = 1;
    }
    else if (lead < 0xF0) {
        /* Three-byte form \xE0\xA0\x80-\xEF\xBF\xBF (0800-FFFF). */
        if (lead == 0xE0 && s[1] < 0xA0) {
            /* Overlong encoding of 0000-07FF. */
            return 0;
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* Would decode to a surrogate in D800-DFFF; surrogates
               are not valid UTF-8 (RFC 3629, Unicode table 3-7). */
            return 0;
        }
        follow = 2;
    }
    else if (lead < 0xF5) {
        /* Four-byte form \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF
           (10000-10FFFF). */
        if (s[1] < 0x90 ? lead == 0xF0 : lead == 0xF4) {
            /* Overlong 0000-FFFF, or 110000- overflow. */
            return 0;
        }
        follow = 3;
    }
    else {
        /* \xF5-\xFF can never start a sequence. */
        return 0;
    }
    /* Every continuation byte must lie in \x80-\xBF. */
    for (int k = follow; k > 0; k--) {
        if (s[k] < 0x80 || s[k] >= 0xC0) {
            return 0;
        }
    }
    return follow + 1;
}
558
559 static int
ensure_utf8(char * line,struct tok_state * tok)560 ensure_utf8(char *line, struct tok_state *tok)
561 {
562 int badchar = 0;
563 unsigned char *c;
564 int length;
565 for (c = (unsigned char *)line; *c; c += length) {
566 if (!(length = valid_utf8(c))) {
567 badchar = *c;
568 break;
569 }
570 }
571 if (badchar) {
572 PyErr_Format(PyExc_SyntaxError,
573 "Non-UTF-8 code starting with '\\x%.2x' "
574 "in file %U on line %i, "
575 "but no encoding declared; "
576 "see https://peps.python.org/pep-0263/ for details",
577 badchar, tok->filename, tok->lineno);
578 return 0;
579 }
580 return 1;
581 }
582
583 /* Fetch a byte from TOK, using the string buffer. */
584
static int
buf_getc(struct tok_state *tok) {
    /* Advance through tok->str one byte at a time; Py_CHARMASK keeps
       the result in 0..255 even when plain char is signed. */
    return Py_CHARMASK(*tok->str++);
}
589
590 /* Unfetch a byte from TOK, using the string buffer. */
591
static void
buf_ungetc(int c, struct tok_state *tok) {
    /* Step back one byte; C must equal the byte just read. */
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
597
598 /* Set the readline function for TOK to ENC. For the string-based
599 tokenizer, this means to just record the encoding. */
600
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    /* String input is already fully in memory: just record the
       encoding so decode_str() can recode the whole buffer later. */
    tok->enc = enc;
    return 1;
}
606
607 /* Return a UTF-8 encoding Python string object from the
608 C byte string STR, which is encoded with ENC. */
609
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    /* Decode STR from ENC, then re-encode as UTF-8.  Returns a new
       bytes object, or NULL with an exception set. */
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
620
621
622 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)623 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
624 int skip_next_lf = 0;
625 size_t needed_length = strlen(s) + 2, final_length;
626 char *buf, *current;
627 char c = '\0';
628 buf = PyMem_Malloc(needed_length);
629 if (buf == NULL) {
630 tok->done = E_NOMEM;
631 return NULL;
632 }
633 for (current = buf; *s; s++, current++) {
634 c = *s;
635 if (skip_next_lf) {
636 skip_next_lf = 0;
637 if (c == '\n') {
638 c = *++s;
639 if (!c)
640 break;
641 }
642 }
643 if (c == '\r') {
644 skip_next_lf = 1;
645 c = '\n';
646 }
647 *current = c;
648 }
649 /* If this is exec input, add a newline to the end of the string if
650 there isn't one already. */
651 if (exec_input && c != '\n') {
652 *current = '\n';
653 current++;
654 }
655 *current = '\0';
656 final_length = current - buf + 1;
657 if (final_length < needed_length && final_length) {
658 /* should never fail */
659 char* result = PyMem_Realloc(buf, final_length);
660 if (result == NULL) {
661 PyMem_Free(buf);
662 }
663 buf = result;
664 }
665 return buf;
666 }
667
668 /* Decode a byte string STR for use as the buffer of TOK.
669 Look for encoding declarations inside STR, and record them
670 inside TOK. */
671
672 static char *
decode_str(const char * input,int single,struct tok_state * tok)673 decode_str(const char *input, int single, struct tok_state *tok)
674 {
675 PyObject* utf8 = NULL;
676 char *str;
677 const char *s;
678 const char *newl[2] = {NULL, NULL};
679 int lineno = 0;
680 tok->input = str = translate_newlines(input, single, tok);
681 if (str == NULL)
682 return NULL;
683 tok->enc = NULL;
684 tok->str = str;
685 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
686 return error_ret(tok);
687 str = tok->str; /* string after BOM if any */
688 assert(str);
689 if (tok->enc != NULL) {
690 utf8 = translate_into_utf8(str, tok->enc);
691 if (utf8 == NULL)
692 return error_ret(tok);
693 str = PyBytes_AsString(utf8);
694 }
695 for (s = str;; s++) {
696 if (*s == '\0') break;
697 else if (*s == '\n') {
698 assert(lineno < 2);
699 newl[lineno] = s;
700 lineno++;
701 if (lineno == 2) break;
702 }
703 }
704 tok->enc = NULL;
705 /* need to check line 1 and 2 separately since check_coding_spec
706 assumes a single line as input */
707 if (newl[0]) {
708 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
709 return NULL;
710 }
711 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
712 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
713 tok, buf_setreadl))
714 return NULL;
715 }
716 }
717 if (tok->enc != NULL) {
718 assert(utf8 == NULL);
719 utf8 = translate_into_utf8(str, tok->enc);
720 if (utf8 == NULL)
721 return error_ret(tok);
722 str = PyBytes_AS_STRING(utf8);
723 }
724 assert(tok->decoding_buffer == NULL);
725 tok->decoding_buffer = utf8; /* CAUTION */
726 return str;
727 }
728
729 /* Set up tokenizer for string */
730
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    char *decoded;

    if (tok == NULL)
        return NULL;
    /* Normalize newlines and apply any BOM / coding-spec declaration. */
    decoded = decode_str(str, exec_input, tok);
    if (decoded == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* The whole decoded string is the buffer; tok_underflow_string
       advances inp through it line by line from here. */
    tok->buf = tok->cur = tok->inp = decoded;
    tok->end = decoded;
    return tok;
}
749
750 /* Set up tokenizer for UTF-8 string */
751
struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    char *translated;
    if (tok == NULL)
        return NULL;
    /* Input is known UTF-8: only newline normalization is needed,
       no BOM or coding-spec detection. */
    tok->input = translated = translate_newlines(str, exec_input, tok);
    if (translated == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->decoding_state = STATE_NORMAL;
    tok->enc = NULL;
    tok->str = translated;
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    tok->buf = tok->cur = tok->inp = translated;
    tok->end = translated;
    return tok;
}
777
778 /* Set up tokenizer for file */
779
struct tok_state *
_PyTokenizer_FromFile(FILE *fp, const char* enc,
                      const char *ps1, const char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    /* ps1/ps2 are the interactive prompts (e.g. ">>> " and "... ");
       both are NULL for ordinary file input. */
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
        tok->encoding = new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            _PyTokenizer_Free(tok);
            return NULL;
        }
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}
808
809 /* Free a tok_state structure */
810
void
_PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL) {
        PyMem_Free(tok->encoding);
    }
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->filename);
    /* The line buffer is owned by the tokenizer only for file input;
       for string input tok->buf aliases tok->input (freed below). */
    if (tok->fp != NULL && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }
    if (tok->input) {
        PyMem_Free(tok->input);
    }
    if (tok->interactive_src_start != NULL) {
        PyMem_Free(tok->interactive_src_start);
    }
    PyMem_Free(tok);
}
831
/* Read raw bytes from tok->fp into the buffer until a complete line
   (ending in '\n') or EOF has been accumulated, growing the buffer
   as needed.  Returns 1 on success or EOF, 0 on error. */
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
        size_t line_size = 0;
        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
        if (line == NULL) {
            /* EOF (or read error): nothing more to append. */
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp += line_size;
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');     /* loop if line was truncated */
    return 1;
}
856
/* Advance the string-based tokenizer to the next line: move tok->inp
   past the next '\n' (or to the end of the input).  Returns 0 at end
   of input (tok->done = E_EOF), 1 otherwise. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        /* Include the newline in the current line. */
        end++;
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        /* No token in progress: logically drop everything before the
           current position by moving the buffer origin up to cur. */
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    tok->lineno++;
    tok->inp = end;
    return 1;
}
878
/* Fetch the next line in interactive mode via PyOS_Readline, recode
   it to UTF-8 if a non-default encoding is set, and splice it into
   the tokenizer buffer.  Returns 1 on success, 0 on EOF, interrupt
   or error (tok->done set accordingly). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        /* The caller asked us to stop asking for lines. */
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* After the first line, switch to the continuation prompt. */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        /* Readline returned NULL: treat as a keyboard interrupt. */
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        /* Empty string: end of interactive input. */
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress (multi-line construct): append the
           new line to the existing buffer instead of replacing it. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        /* Re-anchor after the possible realloc in tok_reserve_buf. */
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        /* No token in progress: the new line becomes the buffer. */
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}
962
/* Refill the buffer from a non-interactive file: detect a BOM and/or
   coding spec on the first lines, read one line (recoding through a
   codec when one is attached), then guarantee the line ends in '\n'
   and is valid UTF-8.  Returns 1 on success, 0 on EOF/error. */
static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL) {
        /* No token in progress: recycle the buffer from the top. */
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        /* Nothing was read: end of file. */
        tok->done = E_EOF;
        return 0;
    }
    if (tok->inp[-1] != '\n') {
        assert(tok->inp + 1 < tok->end);
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }

    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            /* A coding spec may only appear on lines 1-2 (PEP 263). */
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
1022
1023 #if defined(Py_DEBUG)
1024 static void
print_escape(FILE * f,const char * s,Py_ssize_t size)1025 print_escape(FILE *f, const char *s, Py_ssize_t size)
1026 {
1027 if (s == NULL) {
1028 fputs("NULL", f);
1029 return;
1030 }
1031 putc('"', f);
1032 while (size-- > 0) {
1033 unsigned char c = *s++;
1034 switch (c) {
1035 case '\n': fputs("\\n", f); break;
1036 case '\r': fputs("\\r", f); break;
1037 case '\t': fputs("\\t", f); break;
1038 case '\f': fputs("\\f", f); break;
1039 case '\'': fputs("\\'", f); break;
1040 case '"': fputs("\\\"", f); break;
1041 default:
1042 if (0x20 <= c && c <= 0x7f)
1043 putc(c, f);
1044 else
1045 fprintf(f, "\\x%02x", c);
1046 }
1047 }
1048 putc('"', f);
1049 }
1050 #endif
1051
1052 /* Get next char, updating state; error code goes into tok->done */
1053
static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
            return EOF;
        }
        /* Buffer exhausted: refill from the appropriate source. */
        if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
            rc = tok_underflow_interactive(tok);
        }
        else {
            rc = tok_underflow_file(tok);
        }
#if defined(Py_DEBUG)
        if (Py_DebugFlag) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, " tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            /* Underflow failed (EOF or error): drain the buffer. */
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;

        /* NUL bytes would silently truncate C-string processing
           later, so reject them up front. */
        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
            syntaxerror(tok, "source code cannot contain null bytes");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    Py_UNREACHABLE();
}
1095
1096 /* Back-up one character */
1097
static void
tok_backup(struct tok_state *tok, int c)
{
    /* Pushing EOF back is a no-op (EOF was never in the buffer). */
    if (c != EOF) {
        if (--tok->cur < tok->buf) {
            Py_FatalError("tokenizer beginning of buffer");
        }
        /* The pushed-back character must be the one actually read;
           anything else is a tokenizer logic error. */
        if ((int)(unsigned char)*tok->cur != c) {
            Py_FatalError("tok_backup: wrong character");
        }
    }
}
1110
/* Common implementation for syntaxerror() and
   syntaxerror_known_range(): format the message, attach the current
   source line as error text, and set a SyntaxError carrying the
   (lineno, col_offset .. end_col_offset) range.  -1 offsets default
   to the current column.  Always sets tok->done = E_ERROR and
   returns ERRORTOKEN. */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        /* Default the column to the current position. */
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    /* Prefer the whole physical line as the error text when it
       extends beyond the current position. */
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    /* The "N" code transfers ownership of errtext to args. */
    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
1157
1158 static int
syntaxerror(struct tok_state * tok,const char * format,...)1159 syntaxerror(struct tok_state *tok, const char *format, ...)
1160 {
1161 va_list vargs;
1162 #ifdef HAVE_STDARG_PROTOTYPES
1163 va_start(vargs, format);
1164 #else
1165 va_start(vargs);
1166 #endif
1167 int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1168 va_end(vargs);
1169 return ret;
1170 }
1171
1172 static int
syntaxerror_known_range(struct tok_state * tok,int col_offset,int end_col_offset,const char * format,...)1173 syntaxerror_known_range(struct tok_state *tok,
1174 int col_offset, int end_col_offset,
1175 const char *format, ...)
1176 {
1177 va_list vargs;
1178 #ifdef HAVE_STDARG_PROTOTYPES
1179 va_start(vargs, format);
1180 #else
1181 va_start(vargs);
1182 #endif
1183 int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1184 va_end(vargs);
1185 return ret;
1186 }
1187
1188
1189
1190 static int
indenterror(struct tok_state * tok)1191 indenterror(struct tok_state *tok)
1192 {
1193 tok->done = E_TABSPACE;
1194 tok->cur = tok->inp;
1195 return ERRORTOKEN;
1196 }
1197
/* Issue a warning of class `category` attributed to the current file/line.

   Returns 0 on success, or when warnings are suppressed via
   tok->report_warnings.  Returns -1 on failure with tok->done = E_ERROR;
   if the warning was configured to raise (warnings filter turned it into
   an exception of `category`), it is converted into a SyntaxError first. */
static int
parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
{
    if (!tok->report_warnings) {
        return 0;
    }

    PyObject *errmsg;
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(category)) {
            /* Replace the DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            syntaxerror(tok, "%U", errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    /* errmsg may be NULL here (formatting failure) — Py_XDECREF handles it. */
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}
1236
/* Check whether the upcoming input matches the string `test` and is NOT
   followed by an identifier character.  Consumes nothing: every character
   read is pushed back before returning.  Returns 1 on a full-word match,
   0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *p = test;
    int matched = 0;
    for (;;) {
        int c = tok_nextc(tok);
        if (*p == '\0') {
            /* Whole string seen; it matches only if the next character
               cannot continue an identifier. */
            matched = !is_potential_identifier_char(c);
        }
        else if (c == *p) {
            p++;
            continue;
        }

        /* Mismatch or end of test string: undo everything we consumed. */
        tok_backup(tok, c);
        while (p != test) {
            tok_backup(tok, *--p);
        }
        return matched;
    }
}
1259
1260 static int
verify_end_of_number(struct tok_state * tok,int c,const char * kind)1261 verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1262 {
1263 /* Emit a deprecation warning only if the numeric literal is immediately
1264 * followed by one of keywords which can occur after a numeric literal
1265 * in valid code: "and", "else", "for", "if", "in", "is" and "or".
1266 * It allows to gradually deprecate existing valid code without adding
1267 * warning before error in most cases of invalid numeric literal (which
1268 * would be confusing and break existing tests).
1269 * Raise a syntax error with slightly better message than plain
1270 * "invalid syntax" if the numeric literal is immediately followed by
1271 * other keyword or identifier.
1272 */
1273 int r = 0;
1274 if (c == 'a') {
1275 r = lookahead(tok, "nd");
1276 }
1277 else if (c == 'e') {
1278 r = lookahead(tok, "lse");
1279 }
1280 else if (c == 'f') {
1281 r = lookahead(tok, "or");
1282 }
1283 else if (c == 'i') {
1284 int c2 = tok_nextc(tok);
1285 if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1286 r = 1;
1287 }
1288 tok_backup(tok, c2);
1289 }
1290 else if (c == 'o') {
1291 r = lookahead(tok, "r");
1292 }
1293 else if (c == 'n') {
1294 r = lookahead(tok, "ot");
1295 }
1296 if (r) {
1297 tok_backup(tok, c);
1298 if (parser_warn(tok, PyExc_SyntaxWarning,
1299 "invalid %s literal", kind))
1300 {
1301 return 0;
1302 }
1303 tok_nextc(tok);
1304 }
1305 else /* In future releases, only error will remain. */
1306 if (is_potential_identifier_char(c)) {
1307 tok_backup(tok, c);
1308 syntaxerror(tok, "invalid %s literal", kind);
1309 return 0;
1310 }
1311 return 1;
1312 }
1313
/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.

   The candidate identifier is the byte range tok->start..tok->cur.
   Returns 1 if it is a valid identifier; 0 otherwise, with tok->done set
   (E_DECODE for bad UTF-8, E_ERROR otherwise) and, for an invalid
   character, a SyntaxError raised pointing at the offending code point.
 */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    /* Index of the first invalid character, or >= length if all valid. */
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input: re-encode the
               prefix up to and including the bad character so tok->cur
               points just past it for error reporting. */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        // PyUnicode_FromFormatV() does not support %X
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
        }
        else {
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}
1369
/* Consume the digit tail of a decimal literal, allowing single '_'
   separators between digit groups.  Returns the first character after the
   literal (left unconsumed by the caller's convention: the caller backs
   it up), or 0 after raising a syntax error for a trailing/doubled '_'. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c;
        do {
            c = tok_nextc(tok);
        } while (isdigit(c));
        if (c != '_') {
            return c;
        }
        /* An underscore must be followed by at least one more digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1391
1392 /* Get next token, after space stripping etc. */
1393
1394 static inline int
tok_continuation_line(struct tok_state * tok)1395 tok_continuation_line(struct tok_state *tok) {
1396 int c = tok_nextc(tok);
1397 if (c != '\n') {
1398 tok->done = E_LINECONT;
1399 return -1;
1400 }
1401 c = tok_nextc(tok);
1402 if (c == EOF) {
1403 tok->done = E_EOF;
1404 tok->cur = tok->inp;
1405 return -1;
1406 } else {
1407 tok_backup(tok, c);
1408 }
1409 return c;
1410 }
1411
/* Fetch the next token from the input.

   Returns the token type; on success *p_start and *p_end delimit the
   token's text inside the tokenizer buffer.  This single function handles
   indentation tracking (INDENT/DEDENT), blank-line and comment skipping,
   type comments, the async/await compatibility hacks, identifiers,
   numeric literals, string prefixes and quoting, explicit ('\\') and
   implicit (inside brackets) line joining, and operator tokens. */
static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    /* Restart point after a blank/ignored line (see the NEWLINE case). */
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;        /* column with tabs expanded to TABSIZE */
        int altcol = 0;     /* column with tabs expanded to ALTTABSIZE,
                               used to detect ambiguous tab/space mixes */
        tok->atbol = 0;
        int cont_line_col = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else if (c == '\\') {
                // Indentation cannot be split over multiple physical lines
                // using backslashes. This means that if we found a backslash
                // preceded by whitespace, **the first one we find** determines
                // the level of indentation of whatever comes next.
                cont_line_col = cont_line_col ? cont_line_col : col;
                if ((c = tok_continuation_line(tok)) == -1) {
                    return ERRORTOKEN;
                }
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            col = cont_line_col ? cont_line_col : col;
            altcol = cont_line_col ? cont_line_col : altcol;
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);
    /* Check if we are closing an async function */
    if (tok->async_def
        && !blankline
        /* Due to some implementation artifacts of type comments,
         * a TYPE_COMMENT at the start of a function won't set an
         * indentation level and it will produce a NEWLINE after it.
         * To avoid spuriously ending an async function due to this,
         * wait until we have some non-newline char in front of us. */
        && c != '\n'
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;

        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }

        if (tok->type_comments) {
            p = tok->start;
            prefix = type_comment_prefix;
            /* Match the comment against "# type: ", where a space in the
               prefix matches zero or more spaces/tabs in the input. */
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                    }
                } else if (*prefix == *p) {
                    p++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                const char *ignore_end = p + 6;
                tok_backup(tok, c);  /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    *p_start = ignore_end;
                    *p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return TYPE_IGNORE;
                } else {
                    *p_start = type_start;  /* after type_comment_prefix */
                    *p_end = tok->cur;
                    return TYPE_COMMENT;
                }
            }
        }
    }

    if (tok->done == E_INTERACT_STOP) {
        return ENDMARKER;
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        if (tok->level) {
            return ERRORTOKEN;
        }
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers must be checked against PEP 3131. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }

        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
            /* May be an 'async' or 'await' token.  For Python 3.7 or
               later we recognize them unconditionally.  For Python
               3.5 or 3.6 we recognize 'async' in front of 'def', and
               either one inside of 'async def'.  (Technically we
               shouldn't recognize these at all for 3.4 or earlier,
               but there's no *valid* Python 3.4 code that would be
               rejected, and async functions will be rejected in a
               later phase.) */
            if (!tok->async_hacks || tok->async_def) {
                /* Always recognize the keywords. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token to see if that is 'def'. */

                struct tok_state ahead_tok;
                const char *ahead_tok_start = NULL;
                const char *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* Tokenize ahead on a copy of the state so nothing is
                   consumed from the real tokenizer. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning a plain NAME token, return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok_backup(tok, c);
                        return syntaxerror(tok, "invalid hexadecimal literal");
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in octal literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid octal literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in octal literal", c);
                }
                if (!verify_end_of_number(tok, c, "octal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in binary literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid binary literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in binary literal", c);
                }
                if (!verify_end_of_number(tok, c, "binary")) {
                    return ERRORTOKEN;
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                char* zeros_end = tok->cur;
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return syntaxerror_known_range(
                            tok, (int)(tok->start + 1 - tok->line_start),
                            (int)(zeros_end - tok->line_start),
                            "leading zeros in decimal integer "
                            "literals are not permitted; "
                            "use an 0o prefix for octal integers");
                }
                if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all ("10e"+X): emit the
                           number seen so far and put back 'e'/'E'. */
                        tok_backup(tok, c);
                        if (!verify_end_of_number(tok, e, "decimal")) {
                            return ERRORTOKEN;
                        }
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                    if (!verify_end_of_number(tok, c, "imaginary")) {
                        return ERRORTOKEN;
                    }
                }
                else if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (tok->done == E_ERROR) {
                return ERRORTOKEN;
            }
            if (tok->done == E_DECODE) {
                break;
            }
            if (c == EOF || (quote_size == 1 && c == '\n')) {
                assert(tok->multi_line_start != NULL);
                // shift the tok_state's location into
                // the start of string, and report the error
                // from the initial quote character
                tok->cur = (char *)tok->start;
                tok->cur++;
                tok->line_start = tok->multi_line_start;
                int start = tok->lineno;
                tok->lineno = tok->first_lineno;
                if (quote_size == 3) {
                    syntaxerror(tok, "unterminated triple-quoted string literal"
                                " (detected at line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOFS;
                    }
                    return ERRORTOKEN;
                }
                else {
                    syntaxerror(tok, "unterminated string literal (detected at"
                                " line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOLS;
                    }
                    return ERRORTOKEN;
                }
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok);  /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        if ((c = tok_continuation_line(tok)) == -1) {
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return syntaxerror(tok, "too many nested parentheses");
        }
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        if (!tok->level) {
            return syntaxerror(tok, "unmatched '%c'", c);
        }
        tok->level--;
        int opening = tok->parenstack[tok->level];
        if (!((opening == '(' && c == ')') ||
              (opening == '[' && c == ']') ||
              (opening == '{' && c == '}')))
        {
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c' on line %d",
                        c, opening, tok->parenlinenostack[tok->level]);
            }
            else {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c'",
                        c, opening);
            }
        }
        break;
    }

    if (!Py_UNICODE_ISPRINTABLE(c)) {
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
        return syntaxerror(tok, "invalid non-printable character U+%s", hex);
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
2131
2132 int
_PyTokenizer_Get(struct tok_state * tok,const char ** p_start,const char ** p_end)2133 _PyTokenizer_Get(struct tok_state *tok,
2134 const char **p_start, const char **p_end)
2135 {
2136 int result = tok_get(tok, p_start, p_end);
2137 if (tok->decoding_erred) {
2138 result = ERRORTOKEN;
2139 tok->done = E_DECODE;
2140 }
2141 return result;
2142 }
2143
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
// The union lets the raw fd travel through fopencookie()'s void* cookie
// without an allocation.
typedef union {
    void *cookie;
    int fd;
} borrowed;

/* read(2) shim used as the fopencookie() read callback. */
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

/* Wrap the borrowed fd in a read-only FILE* without duplicating it. */
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
/* Portable variant: dup() the fd so fclose() on the returned FILE* does
   not close the caller's descriptor.  Returns NULL on failure. */
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif
2176
2177 /* Get the encoding of a Python file. Check for the coding cookie and check if
2178 the file starts with a BOM.
2179
2180 _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2181 encoding in the first or second line of the file (in which case the encoding
2182 should be assumed to be UTF-8).
2183
2184 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2185 by the caller. */
2186
2187 char *
_PyTokenizer_FindEncodingFilename(int fd,PyObject * filename)2188 _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2189 {
2190 struct tok_state *tok;
2191 FILE *fp;
2192 const char *p_start = NULL;
2193 const char *p_end = NULL;
2194 char *encoding = NULL;
2195
2196 fp = fdopen_borrow(fd);
2197 if (fp == NULL) {
2198 return NULL;
2199 }
2200 tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2201 if (tok == NULL) {
2202 fclose(fp);
2203 return NULL;
2204 }
2205 if (filename != NULL) {
2206 Py_INCREF(filename);
2207 tok->filename = filename;
2208 }
2209 else {
2210 tok->filename = PyUnicode_FromString("<string>");
2211 if (tok->filename == NULL) {
2212 fclose(fp);
2213 _PyTokenizer_Free(tok);
2214 return encoding;
2215 }
2216 }
2217 // We don't want to report warnings here because it could cause infinite recursion
2218 // if fetching the encoding shows a warning.
2219 tok->report_warnings = 0;
2220 while (tok->lineno < 2 && tok->done == E_OK) {
2221 _PyTokenizer_Get(tok, &p_start, &p_end);
2222 }
2223 fclose(fp);
2224 if (tok->encoding) {
2225 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2226 if (encoding) {
2227 strcpy(encoding, tok->encoding);
2228 }
2229 }
2230 _PyTokenizer_Free(tok);
2231 return encoding;
2232 }
2233
#ifdef Py_DEBUG
/* Debug helper: print a token's name and, for value-bearing token types,
   its source text, to stderr. */
void
tok_dump(int type, char *start, char *end)
{
    fprintf(stderr, "%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP) {
        fprintf(stderr, "(%.*s)", (int)(end - start), start);
    }
}
#endif  // Py_DEBUG
2243