1 
2 /* Tokenizer implementation */
3 
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 #include "pycore_call.h"          // _PyObject_CallNoArgs()
7 
8 #include <ctype.h>
9 #include <assert.h>
10 
11 #include "tokenizer.h"
12 #include "errcode.h"
13 
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "abstract.h"
18 
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
21 
/* True if C may begin an identifier: an ASCII letter, '_', or any value
   >= 128.  The argument is fully parenthesized so that an expression
   argument keeps its meaning; note that C is still evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
27 
/* True if C may appear inside an identifier: an ASCII letter or digit,
   '_', or any value >= 128.  The argument is fully parenthesized so that
   an expression argument keeps its meaning; note that C is still
   evaluated several times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
34 
35 
36 /* Don't ever change this -- it would break the portability of Python code */
37 #define TABSIZE 8
38 
39 /* Forward */
40 static struct tok_state *tok_new(void);
41 static int tok_nextc(struct tok_state *tok);
42 static void tok_backup(struct tok_state *tok, int c);
43 static int syntaxerror(struct tok_state *tok, const char *format, ...);
44 
45 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46    tokenizing. */
47 static const char* type_comment_prefix = "# type: ";
48 
49 /* Create and initialize a new tok_state structure */
50 
51 static struct tok_state *
tok_new(void)52 tok_new(void)
53 {
54     struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55                                             sizeof(struct tok_state));
56     if (tok == NULL)
57         return NULL;
58     tok->buf = tok->cur = tok->inp = NULL;
59     tok->fp_interactive = 0;
60     tok->interactive_src_start = NULL;
61     tok->interactive_src_end = NULL;
62     tok->start = NULL;
63     tok->end = NULL;
64     tok->done = E_OK;
65     tok->fp = NULL;
66     tok->input = NULL;
67     tok->tabsize = TABSIZE;
68     tok->indent = 0;
69     tok->indstack[0] = 0;
70     tok->atbol = 1;
71     tok->pendin = 0;
72     tok->prompt = tok->nextprompt = NULL;
73     tok->lineno = 0;
74     tok->level = 0;
75     tok->altindstack[0] = 0;
76     tok->decoding_state = STATE_INIT;
77     tok->decoding_erred = 0;
78     tok->enc = NULL;
79     tok->encoding = NULL;
80     tok->cont_line = 0;
81     tok->filename = NULL;
82     tok->decoding_readline = NULL;
83     tok->decoding_buffer = NULL;
84     tok->type_comments = 0;
85     tok->async_hacks = 0;
86     tok->async_def = 0;
87     tok->async_def_indent = 0;
88     tok->async_def_nl = 0;
89     tok->interactive_underflow = IUNDERFLOW_NORMAL;
90     tok->str = NULL;
91     tok->report_warnings = 1;
92     return tok;
93 }
94 
95 static char *
new_string(const char * s,Py_ssize_t len,struct tok_state * tok)96 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
97 {
98     char* result = (char *)PyMem_Malloc(len + 1);
99     if (!result) {
100         tok->done = E_NOMEM;
101         return NULL;
102     }
103     memcpy(result, s, len);
104     result[len] = '\0';
105     return result;
106 }
107 
108 static char *
error_ret(struct tok_state * tok)109 error_ret(struct tok_state *tok) /* XXX */
110 {
111     tok->decoding_erred = 1;
112     if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
113         PyMem_Free(tok->buf);
114     tok->buf = tok->cur = tok->inp = NULL;
115     tok->start = NULL;
116     tok->end = NULL;
117     tok->done = E_DECODE;
118     return NULL;                /* as if it were EOF */
119 }
120 
121 
/* Normalize the spelling of the encodings that get special treatment
   (utf-8 and latin-1).

   Only the first 12 bytes of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the static string "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"); otherwise returns S itself, so a caller can detect
   "unchanged" by comparing the result pointer with S. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behavior. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
150 
151 /* Return the coding spec in S, or NULL if none is found.  */
152 
153 static int
get_coding_spec(const char * s,char ** spec,Py_ssize_t size,struct tok_state * tok)154 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
155 {
156     Py_ssize_t i;
157     *spec = NULL;
158     /* Coding spec must be in a comment, and that comment must be
159      * the only statement on the source code line. */
160     for (i = 0; i < size - 6; i++) {
161         if (s[i] == '#')
162             break;
163         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
164             return 1;
165     }
166     for (; i < size - 6; i++) { /* XXX inefficient search */
167         const char* t = s + i;
168         if (memcmp(t, "coding", 6) == 0) {
169             const char* begin = NULL;
170             t += 6;
171             if (t[0] != ':' && t[0] != '=')
172                 continue;
173             do {
174                 t++;
175             } while (t[0] == ' ' || t[0] == '\t');
176 
177             begin = t;
178             while (Py_ISALNUM(t[0]) ||
179                    t[0] == '-' || t[0] == '_' || t[0] == '.')
180                 t++;
181 
182             if (begin < t) {
183                 char* r = new_string(begin, t - begin, tok);
184                 const char* q;
185                 if (!r)
186                     return 0;
187                 q = get_normal_name(r);
188                 if (r != q) {
189                     PyMem_Free(r);
190                     r = new_string(q, strlen(q), tok);
191                     if (!r)
192                         return 0;
193                 }
194                 *spec = r;
195                 break;
196             }
197         }
198     }
199     return 1;
200 }
201 
202 /* Check whether the line contains a coding spec. If it does,
203    invoke the set_readline function for the new encoding.
204    This function receives the tok_state and the new encoding.
205    Return 1 on success, 0 on failure.  */
206 
207 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))208 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
209                   int set_readline(struct tok_state *, const char *))
210 {
211     char *cs;
212     if (tok->cont_line) {
213         /* It's a continuation line, so it can't be a coding spec. */
214         tok->decoding_state = STATE_NORMAL;
215         return 1;
216     }
217     if (!get_coding_spec(line, &cs, size, tok)) {
218         return 0;
219     }
220     if (!cs) {
221         Py_ssize_t i;
222         for (i = 0; i < size; i++) {
223             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
224                 break;
225             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
226                 /* Stop checking coding spec after a line containing
227                  * anything except a comment. */
228                 tok->decoding_state = STATE_NORMAL;
229                 break;
230             }
231         }
232         return 1;
233     }
234     tok->decoding_state = STATE_NORMAL;
235     if (tok->encoding == NULL) {
236         assert(tok->decoding_readline == NULL);
237         if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
238             error_ret(tok);
239             PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
240             PyMem_Free(cs);
241             return 0;
242         }
243         tok->encoding = cs;
244     } else {                /* then, compare cs with BOM */
245         if (strcmp(tok->encoding, cs) != 0) {
246             error_ret(tok);
247             PyErr_Format(PyExc_SyntaxError,
248                          "encoding problem: %s with BOM", cs);
249             PyMem_Free(cs);
250             return 0;
251         }
252         PyMem_Free(cs);
253     }
254     return 1;
255 }
256 
257 /* See whether the file starts with a BOM. If it does,
258    invoke the set_readline function with the new encoding.
259    Return 1 on success, 0 on failure.  */
260 
261 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)262 check_bom(int get_char(struct tok_state *),
263           void unget_char(int, struct tok_state *),
264           int set_readline(struct tok_state *, const char *),
265           struct tok_state *tok)
266 {
267     int ch1, ch2, ch3;
268     ch1 = get_char(tok);
269     tok->decoding_state = STATE_SEEK_CODING;
270     if (ch1 == EOF) {
271         return 1;
272     } else if (ch1 == 0xEF) {
273         ch2 = get_char(tok);
274         if (ch2 != 0xBB) {
275             unget_char(ch2, tok);
276             unget_char(ch1, tok);
277             return 1;
278         }
279         ch3 = get_char(tok);
280         if (ch3 != 0xBF) {
281             unget_char(ch3, tok);
282             unget_char(ch2, tok);
283             unget_char(ch1, tok);
284             return 1;
285         }
286     } else {
287         unget_char(ch1, tok);
288         return 1;
289     }
290     if (tok->encoding != NULL)
291         PyMem_Free(tok->encoding);
292     tok->encoding = new_string("utf-8", 5, tok);
293     if (!tok->encoding)
294         return 0;
295     /* No need to set_readline: input is already utf-8 */
296     return 1;
297 }
298 
299 static int
tok_concatenate_interactive_new_line(struct tok_state * tok,const char * line)300 tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
301     assert(tok->fp_interactive);
302 
303     if (!line) {
304         return 0;
305     }
306 
307     Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
308     Py_ssize_t line_size = strlen(line);
309     char last_char = line[line_size > 0 ? line_size - 1 : line_size];
310     if (last_char != '\n') {
311         line_size += 1;
312     }
313     char* new_str = tok->interactive_src_start;
314 
315     new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
316     if (!new_str) {
317         if (tok->interactive_src_start) {
318             PyMem_Free(tok->interactive_src_start);
319         }
320         tok->interactive_src_start = NULL;
321         tok->interactive_src_end = NULL;
322         tok->done = E_NOMEM;
323         return -1;
324     }
325     strcpy(new_str + current_size, line);
326     if (last_char != '\n') {
327         /* Last line does not end in \n, fake one */
328         new_str[current_size + line_size - 1] = '\n';
329         new_str[current_size + line_size] = '\0';
330     }
331     tok->interactive_src_start = new_str;
332     tok->interactive_src_end = new_str + current_size + line_size;
333     return 0;
334 }
335 
336 
337 /* Read a line of text from TOK into S, using the stream in TOK.
338    Return NULL on failure, else S.
339 
340    On entry, tok->decoding_buffer will be one of:
341      1) NULL: need to call tok->decoding_readline to get a new line
342      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
343        stored the result in tok->decoding_buffer
344      3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
345        (in the s buffer) to copy entire contents of the line read
346        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
347        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
348        until the buffer ends with a '\n' (or until the end of the file is
349        reached): see tok_nextc and its calls to tok_reserve_buf.
350 */
351 
352 static int
tok_reserve_buf(struct tok_state * tok,Py_ssize_t size)353 tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
354 {
355     Py_ssize_t cur = tok->cur - tok->buf;
356     Py_ssize_t oldsize = tok->inp - tok->buf;
357     Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
358     if (newsize > tok->end - tok->buf) {
359         char *newbuf = tok->buf;
360         Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
361         Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
362         Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
363         newbuf = (char *)PyMem_Realloc(newbuf, newsize);
364         if (newbuf == NULL) {
365             tok->done = E_NOMEM;
366             return 0;
367         }
368         tok->buf = newbuf;
369         tok->cur = tok->buf + cur;
370         tok->inp = tok->buf + oldsize;
371         tok->end = tok->buf + newsize;
372         tok->start = start < 0 ? NULL : tok->buf + start;
373         tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
374         tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
375     }
376     return 1;
377 }
378 
/* Return 1 if any of the first SIZE bytes of STR is a NUL byte,
   0 otherwise. */
static inline int
contains_null_bytes(const char* str, size_t size)
{
    const void *hit = memchr(str, '\0', size);
    return hit != NULL ? 1 : 0;
}
383 
384 static int
tok_readline_recode(struct tok_state * tok)385 tok_readline_recode(struct tok_state *tok) {
386     PyObject *line;
387     const  char *buf;
388     Py_ssize_t buflen;
389     line = tok->decoding_buffer;
390     if (line == NULL) {
391         line = PyObject_CallNoArgs(tok->decoding_readline);
392         if (line == NULL) {
393             error_ret(tok);
394             goto error;
395         }
396     }
397     else {
398         tok->decoding_buffer = NULL;
399     }
400     buf = PyUnicode_AsUTF8AndSize(line, &buflen);
401     if (buf == NULL) {
402         error_ret(tok);
403         goto error;
404     }
405     // Make room for the null terminator *and* potentially
406     // an extra newline character that we may need to artificially
407     // add.
408     size_t buffer_size = buflen + 2;
409     if (!tok_reserve_buf(tok, buffer_size)) {
410         goto error;
411     }
412     memcpy(tok->inp, buf, buflen);
413     tok->inp += buflen;
414     *tok->inp = '\0';
415     if (tok->fp_interactive &&
416         tok_concatenate_interactive_new_line(tok, buf) == -1) {
417         goto error;
418     }
419     Py_DECREF(line);
420     return 1;
421 error:
422     Py_XDECREF(line);
423     return 0;
424 }
425 
426 /* Set the readline function for TOK to a StreamReader's
427    readline function. The StreamReader is named ENC.
428 
429    This function is called from check_bom and check_coding_spec.
430 
431    ENC is usually identical to the future value of tok->encoding,
432    except for the (currently unsupported) case of UTF-16.
433 
434    Return 1 on success, 0 on failure. */
435 
436 static int
fp_setreadl(struct tok_state * tok,const char * enc)437 fp_setreadl(struct tok_state *tok, const char* enc)
438 {
439     PyObject *readline, *io, *stream;
440     int fd;
441     long pos;
442 
443     fd = fileno(tok->fp);
444     /* Due to buffering the file offset for fd can be different from the file
445      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
446      * its file position counts CRLF as one char and can't be directly mapped
447      * to the file offset for fd.  Instead we step back one byte and read to
448      * the end of line.*/
449     pos = ftell(tok->fp);
450     if (pos == -1 ||
451         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
452         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
453         return 0;
454     }
455 
456     io = PyImport_ImportModule("io");
457     if (io == NULL) {
458         return 0;
459     }
460     stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO",
461                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
462     Py_DECREF(io);
463     if (stream == NULL) {
464         return 0;
465     }
466 
467     readline = PyObject_GetAttr(stream, &_Py_ID(readline));
468     Py_DECREF(stream);
469     if (readline == NULL) {
470         return 0;
471     }
472     Py_XSETREF(tok->decoding_readline, readline);
473 
474     if (pos > 0) {
475         PyObject *bufobj = _PyObject_CallNoArgs(readline);
476         if (bufobj == NULL) {
477             return 0;
478         }
479         Py_DECREF(bufobj);
480     }
481 
482     return 1;
483 }
484 
485 /* Fetch the next byte from TOK. */
486 
fp_getc(struct tok_state * tok)487 static int fp_getc(struct tok_state *tok) {
488     return getc(tok->fp);
489 }
490 
491 /* Unfetch the last byte back into TOK.  */
492 
fp_ungetc(int c,struct tok_state * tok)493 static void fp_ungetc(int c, struct tok_state *tok) {
494     ungetc(c, tok->fp);
495 }
496 
/* Check whether the bytes at S start a valid UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.  S must be NUL-terminated; no byte after the terminator
   is read.  The special cases match those in
   stringlib/codecs.h:utf8_decode.
*/
static int
valid_utf8(const unsigned char* s)
{
    int expected = 0;   /* number of continuation bytes required */
    int i;
    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    else if (*s < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        if (*s < 0xC2) {
            /* invalid sequence
               \x80-\xBF -- continuation byte
               \xC0-\xC1 -- fake 0000-007F */
            return 0;
        }
        expected = 1;
    }
    else if (*s < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (*s == 0xE0 && *(s + 1) < 0xA0) {
            /* invalid sequence
               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
            return 0;
        }
        else if (*s == 0xED && *(s + 1) >= 0xA0) {
            /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
               will result in surrogates in range D800-DFFF. Surrogates are
               not valid UTF-8 so they are rejected.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        expected = 2;
    }
    else if (*s < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
            /* invalid sequence -- one of:
               \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
               \xF4\x90\x80\x80- -- 110000- overflow */
            return 0;
        }
        expected = 3;
    }
    else {
        /* invalid start byte */
        return 0;
    }
    /* Check the continuation bytes front to back: a truncated sequence
       then stops at the NUL terminator (which is not a continuation
       byte) instead of first reading bytes that lie beyond it. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}
558 
559 static int
ensure_utf8(char * line,struct tok_state * tok)560 ensure_utf8(char *line, struct tok_state *tok)
561 {
562     int badchar = 0;
563     unsigned char *c;
564     int length;
565     for (c = (unsigned char *)line; *c; c += length) {
566         if (!(length = valid_utf8(c))) {
567             badchar = *c;
568             break;
569         }
570     }
571     if (badchar) {
572         PyErr_Format(PyExc_SyntaxError,
573                      "Non-UTF-8 code starting with '\\x%.2x' "
574                      "in file %U on line %i, "
575                      "but no encoding declared; "
576                      "see https://peps.python.org/pep-0263/ for details",
577                      badchar, tok->filename, tok->lineno);
578         return 0;
579     }
580     return 1;
581 }
582 
583 /* Fetch a byte from TOK, using the string buffer. */
584 
585 static int
buf_getc(struct tok_state * tok)586 buf_getc(struct tok_state *tok) {
587     return Py_CHARMASK(*tok->str++);
588 }
589 
590 /* Unfetch a byte from TOK, using the string buffer. */
591 
592 static void
buf_ungetc(int c,struct tok_state * tok)593 buf_ungetc(int c, struct tok_state *tok) {
594     tok->str--;
595     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
596 }
597 
598 /* Set the readline function for TOK to ENC. For the string-based
599    tokenizer, this means to just record the encoding. */
600 
601 static int
buf_setreadl(struct tok_state * tok,const char * enc)602 buf_setreadl(struct tok_state *tok, const char* enc) {
603     tok->enc = enc;
604     return 1;
605 }
606 
607 /* Return a UTF-8 encoding Python string object from the
608    C byte string STR, which is encoded with ENC. */
609 
610 static PyObject *
translate_into_utf8(const char * str,const char * enc)611 translate_into_utf8(const char* str, const char* enc) {
612     PyObject *utf8;
613     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
614     if (buf == NULL)
615         return NULL;
616     utf8 = PyUnicode_AsUTF8String(buf);
617     Py_DECREF(buf);
618     return utf8;
619 }
620 
621 
622 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)623 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
624     int skip_next_lf = 0;
625     size_t needed_length = strlen(s) + 2, final_length;
626     char *buf, *current;
627     char c = '\0';
628     buf = PyMem_Malloc(needed_length);
629     if (buf == NULL) {
630         tok->done = E_NOMEM;
631         return NULL;
632     }
633     for (current = buf; *s; s++, current++) {
634         c = *s;
635         if (skip_next_lf) {
636             skip_next_lf = 0;
637             if (c == '\n') {
638                 c = *++s;
639                 if (!c)
640                     break;
641             }
642         }
643         if (c == '\r') {
644             skip_next_lf = 1;
645             c = '\n';
646         }
647         *current = c;
648     }
649     /* If this is exec input, add a newline to the end of the string if
650        there isn't one already. */
651     if (exec_input && c != '\n') {
652         *current = '\n';
653         current++;
654     }
655     *current = '\0';
656     final_length = current - buf + 1;
657     if (final_length < needed_length && final_length) {
658         /* should never fail */
659         char* result = PyMem_Realloc(buf, final_length);
660         if (result == NULL) {
661             PyMem_Free(buf);
662         }
663         buf = result;
664     }
665     return buf;
666 }
667 
668 /* Decode a byte string STR for use as the buffer of TOK.
669    Look for encoding declarations inside STR, and record them
670    inside TOK.  */
671 
672 static char *
decode_str(const char * input,int single,struct tok_state * tok)673 decode_str(const char *input, int single, struct tok_state *tok)
674 {
675     PyObject* utf8 = NULL;
676     char *str;
677     const char *s;
678     const char *newl[2] = {NULL, NULL};
679     int lineno = 0;
680     tok->input = str = translate_newlines(input, single, tok);
681     if (str == NULL)
682         return NULL;
683     tok->enc = NULL;
684     tok->str = str;
685     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
686         return error_ret(tok);
687     str = tok->str;             /* string after BOM if any */
688     assert(str);
689     if (tok->enc != NULL) {
690         utf8 = translate_into_utf8(str, tok->enc);
691         if (utf8 == NULL)
692             return error_ret(tok);
693         str = PyBytes_AsString(utf8);
694     }
695     for (s = str;; s++) {
696         if (*s == '\0') break;
697         else if (*s == '\n') {
698             assert(lineno < 2);
699             newl[lineno] = s;
700             lineno++;
701             if (lineno == 2) break;
702         }
703     }
704     tok->enc = NULL;
705     /* need to check line 1 and 2 separately since check_coding_spec
706        assumes a single line as input */
707     if (newl[0]) {
708         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
709             return NULL;
710         }
711         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
712             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
713                                    tok, buf_setreadl))
714                 return NULL;
715         }
716     }
717     if (tok->enc != NULL) {
718         assert(utf8 == NULL);
719         utf8 = translate_into_utf8(str, tok->enc);
720         if (utf8 == NULL)
721             return error_ret(tok);
722         str = PyBytes_AS_STRING(utf8);
723     }
724     assert(tok->decoding_buffer == NULL);
725     tok->decoding_buffer = utf8; /* CAUTION */
726     return str;
727 }
728 
729 /* Set up tokenizer for string */
730 
731 struct tok_state *
_PyTokenizer_FromString(const char * str,int exec_input)732 _PyTokenizer_FromString(const char *str, int exec_input)
733 {
734     struct tok_state *tok = tok_new();
735     char *decoded;
736 
737     if (tok == NULL)
738         return NULL;
739     decoded = decode_str(str, exec_input, tok);
740     if (decoded == NULL) {
741         _PyTokenizer_Free(tok);
742         return NULL;
743     }
744 
745     tok->buf = tok->cur = tok->inp = decoded;
746     tok->end = decoded;
747     return tok;
748 }
749 
750 /* Set up tokenizer for UTF-8 string */
751 
752 struct tok_state *
_PyTokenizer_FromUTF8(const char * str,int exec_input)753 _PyTokenizer_FromUTF8(const char *str, int exec_input)
754 {
755     struct tok_state *tok = tok_new();
756     char *translated;
757     if (tok == NULL)
758         return NULL;
759     tok->input = translated = translate_newlines(str, exec_input, tok);
760     if (translated == NULL) {
761         _PyTokenizer_Free(tok);
762         return NULL;
763     }
764     tok->decoding_state = STATE_NORMAL;
765     tok->enc = NULL;
766     tok->str = translated;
767     tok->encoding = new_string("utf-8", 5, tok);
768     if (!tok->encoding) {
769         _PyTokenizer_Free(tok);
770         return NULL;
771     }
772 
773     tok->buf = tok->cur = tok->inp = translated;
774     tok->end = translated;
775     return tok;
776 }
777 
778 /* Set up tokenizer for file */
779 
780 struct tok_state *
_PyTokenizer_FromFile(FILE * fp,const char * enc,const char * ps1,const char * ps2)781 _PyTokenizer_FromFile(FILE *fp, const char* enc,
782                       const char *ps1, const char *ps2)
783 {
784     struct tok_state *tok = tok_new();
785     if (tok == NULL)
786         return NULL;
787     if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
788         _PyTokenizer_Free(tok);
789         return NULL;
790     }
791     tok->cur = tok->inp = tok->buf;
792     tok->end = tok->buf + BUFSIZ;
793     tok->fp = fp;
794     tok->prompt = ps1;
795     tok->nextprompt = ps2;
796     if (enc != NULL) {
797         /* Must copy encoding declaration since it
798            gets copied into the parse tree. */
799         tok->encoding = new_string(enc, strlen(enc), tok);
800         if (!tok->encoding) {
801             _PyTokenizer_Free(tok);
802             return NULL;
803         }
804         tok->decoding_state = STATE_NORMAL;
805     }
806     return tok;
807 }
808 
809 /* Free a tok_state structure */
810 
811 void
_PyTokenizer_Free(struct tok_state * tok)812 _PyTokenizer_Free(struct tok_state *tok)
813 {
814     if (tok->encoding != NULL) {
815         PyMem_Free(tok->encoding);
816     }
817     Py_XDECREF(tok->decoding_readline);
818     Py_XDECREF(tok->decoding_buffer);
819     Py_XDECREF(tok->filename);
820     if (tok->fp != NULL && tok->buf != NULL) {
821         PyMem_Free(tok->buf);
822     }
823     if (tok->input) {
824         PyMem_Free(tok->input);
825     }
826     if (tok->interactive_src_start != NULL) {
827         PyMem_Free(tok->interactive_src_start);
828     }
829     PyMem_Free(tok);
830 }
831 
832 static int
tok_readline_raw(struct tok_state * tok)833 tok_readline_raw(struct tok_state *tok)
834 {
835     do {
836         if (!tok_reserve_buf(tok, BUFSIZ)) {
837             return 0;
838         }
839         int n_chars = (int)(tok->end - tok->inp);
840         size_t line_size = 0;
841         char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
842         if (line == NULL) {
843             return 1;
844         }
845         if (tok->fp_interactive &&
846             tok_concatenate_interactive_new_line(tok, line) == -1) {
847             return 0;
848         }
849         tok->inp += line_size;
850         if (tok->inp == tok->buf) {
851             return 0;
852         }
853     } while (tok->inp[-1] != '\n');
854     return 1;
855 }
856 
857 static int
tok_underflow_string(struct tok_state * tok)858 tok_underflow_string(struct tok_state *tok) {
859     char *end = strchr(tok->inp, '\n');
860     if (end != NULL) {
861         end++;
862     }
863     else {
864         end = strchr(tok->inp, '\0');
865         if (end == tok->inp) {
866             tok->done = E_EOF;
867             return 0;
868         }
869     }
870     if (tok->start == NULL) {
871         tok->buf = tok->cur;
872     }
873     tok->line_start = tok->cur;
874     tok->lineno++;
875     tok->inp = end;
876     return 1;
877 }
878 
/* Fetch one more line of interactive input with PyOS_Readline() and
   make it available in TOK's buffer.

   Returns 1 when a line was added to the buffer, and also when
   tok->interactive_underflow is IUNDERFLOW_STOP (tok->done is then
   E_INTERACT_STOP and nothing is read).  Returns 0 otherwise, with
   tok->done set to E_INTR (readline returned NULL), E_EOF (empty
   line), E_DECODE or E_NOMEM. */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* Normalize line endings; the raw line is replaced by the
           translated copy. */
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        /* Copy the bytes out of the Python object so the line can be
           handled as a plain PyMem buffer from here on. */
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    /* Record the line in the accumulated interactive source (a NULL
       line is accepted and ignored by the callee). */
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    /* After the first line, switch to the continuation prompt. */
    if (tok->nextprompt != NULL) {
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: keep the existing buffer contents and
           append the new line to them. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            /* NOTE(review): only tok->buf is reset here; the other
               buffer pointers still refer to the freed block. */
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        /* NOTE(review): the copy goes to tok->cur while tok->inp is
           what gets advanced -- presumably cur == inp whenever an
           underflow happens; confirm against the caller. */
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        /* No token in progress: the new line simply becomes the
           buffer, which takes ownership of newtok. */
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}
962 
963 static int
tok_underflow_file(struct tok_state * tok)964 tok_underflow_file(struct tok_state *tok) {
965     if (tok->start == NULL) {
966         tok->cur = tok->inp = tok->buf;
967     }
968     if (tok->decoding_state == STATE_INIT) {
969         /* We have not yet determined the encoding.
970            If an encoding is found, use the file-pointer
971            reader functions from now on. */
972         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
973             error_ret(tok);
974             return 0;
975         }
976         assert(tok->decoding_state != STATE_INIT);
977     }
978     /* Read until '\n' or EOF */
979     if (tok->decoding_readline != NULL) {
980         /* We already have a codec associated with this input. */
981         if (!tok_readline_recode(tok)) {
982             return 0;
983         }
984     }
985     else {
986         /* We want a 'raw' read. */
987         if (!tok_readline_raw(tok)) {
988             return 0;
989         }
990     }
991     if (tok->inp == tok->cur) {
992         tok->done = E_EOF;
993         return 0;
994     }
995     if (tok->inp[-1] != '\n') {
996         assert(tok->inp + 1 < tok->end);
997         /* Last line does not end in \n, fake one */
998         *tok->inp++ = '\n';
999         *tok->inp = '\0';
1000     }
1001 
1002     tok->lineno++;
1003     if (tok->decoding_state != STATE_NORMAL) {
1004         if (tok->lineno > 2) {
1005             tok->decoding_state = STATE_NORMAL;
1006         }
1007         else if (!check_coding_spec(tok->cur, strlen(tok->cur),
1008                                     tok, fp_setreadl))
1009         {
1010             return 0;
1011         }
1012     }
1013     /* The default encoding is UTF-8, so make sure we don't have any
1014        non-UTF-8 sequences in it. */
1015     if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
1016         error_ret(tok);
1017         return 0;
1018     }
1019     assert(tok->done == E_OK);
1020     return tok->done == E_OK;
1021 }
1022 
1023 #if defined(Py_DEBUG)
1024 static void
print_escape(FILE * f,const char * s,Py_ssize_t size)1025 print_escape(FILE *f, const char *s, Py_ssize_t size)
1026 {
1027     if (s == NULL) {
1028         fputs("NULL", f);
1029         return;
1030     }
1031     putc('"', f);
1032     while (size-- > 0) {
1033         unsigned char c = *s++;
1034         switch (c) {
1035             case '\n': fputs("\\n", f); break;
1036             case '\r': fputs("\\r", f); break;
1037             case '\t': fputs("\\t", f); break;
1038             case '\f': fputs("\\f", f); break;
1039             case '\'': fputs("\\'", f); break;
1040             case '"': fputs("\\\"", f); break;
1041             default:
1042                 if (0x20 <= c && c <= 0x7f)
1043                     putc(c, f);
1044                 else
1045                     fprintf(f, "\\x%02x", c);
1046         }
1047     }
1048     putc('"', f);
1049 }
1050 #endif
1051 
1052 /* Get next char, updating state; error code goes into tok->done */
1053 
1054 static int
tok_nextc(struct tok_state * tok)1055 tok_nextc(struct tok_state *tok)
1056 {
1057     int rc;
1058     for (;;) {
1059         if (tok->cur != tok->inp) {
1060             return Py_CHARMASK(*tok->cur++); /* Fast path */
1061         }
1062         if (tok->done != E_OK) {
1063            return EOF;
1064         }
1065         if (tok->fp == NULL) {
1066             rc = tok_underflow_string(tok);
1067         }
1068         else if (tok->prompt != NULL) {
1069             rc = tok_underflow_interactive(tok);
1070         }
1071         else {
1072             rc = tok_underflow_file(tok);
1073         }
1074 #if defined(Py_DEBUG)
1075         if (Py_DebugFlag) {
1076             fprintf(stderr, "line[%d] = ", tok->lineno);
1077             print_escape(stderr, tok->cur, tok->inp - tok->cur);
1078             fprintf(stderr, "  tok->done = %d\n", tok->done);
1079         }
1080 #endif
1081         if (!rc) {
1082             tok->cur = tok->inp;
1083             return EOF;
1084         }
1085         tok->line_start = tok->cur;
1086 
1087         if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
1088             syntaxerror(tok, "source code cannot contain null bytes");
1089             tok->cur = tok->inp;
1090             return EOF;
1091         }
1092     }
1093     Py_UNREACHABLE();
1094 }
1095 
1096 /* Back-up one character */
1097 
1098 static void
tok_backup(struct tok_state * tok,int c)1099 tok_backup(struct tok_state *tok, int c)
1100 {
1101     if (c != EOF) {
1102         if (--tok->cur < tok->buf) {
1103             Py_FatalError("tokenizer beginning of buffer");
1104         }
1105         if ((int)(unsigned char)*tok->cur != c) {
1106             Py_FatalError("tok_backup: wrong character");
1107         }
1108     }
1109 }
1110 
/* Build a SyntaxError describing the current line of `tok` and set it as
   the current Python exception.

   `format` and `vargs` are handed to PyUnicode_FromFormatV() to produce the
   message.  A `col_offset` of -1 is replaced by the length of the text
   decoded between tok->line_start and tok->cur; an `end_col_offset` of -1
   is replaced by the resulting `col_offset`.

   Always sets tok->done to E_ERROR and returns ERRORTOKEN, including when
   one of the objects needed for the exception could not be created. */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    /* Decode the part of the line consumed so far; its length provides the
       default column. */
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    /* The text attached to the exception is the line up to, but not
       including, the first '\n'.  Decode again unless the bytes consumed so
       far already cover exactly that span. */
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    /* (msg, (filename, lineno, col_offset, text, end_lineno, end_col_offset)).
       The "N" unit passes our reference to errtext on to the new tuple,
       which is why errtext is not released below. */
    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    /* Shared exit for both the success and the failure paths. */
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
1157 
1158 static int
syntaxerror(struct tok_state * tok,const char * format,...)1159 syntaxerror(struct tok_state *tok, const char *format, ...)
1160 {
1161     va_list vargs;
1162 #ifdef HAVE_STDARG_PROTOTYPES
1163     va_start(vargs, format);
1164 #else
1165     va_start(vargs);
1166 #endif
1167     int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1168     va_end(vargs);
1169     return ret;
1170 }
1171 
1172 static int
syntaxerror_known_range(struct tok_state * tok,int col_offset,int end_col_offset,const char * format,...)1173 syntaxerror_known_range(struct tok_state *tok,
1174                         int col_offset, int end_col_offset,
1175                         const char *format, ...)
1176 {
1177     va_list vargs;
1178 #ifdef HAVE_STDARG_PROTOTYPES
1179     va_start(vargs, format);
1180 #else
1181     va_start(vargs);
1182 #endif
1183     int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1184     va_end(vargs);
1185     return ret;
1186 }
1187 
1188 
1189 
1190 static int
indenterror(struct tok_state * tok)1191 indenterror(struct tok_state *tok)
1192 {
1193     tok->done = E_TABSPACE;
1194     tok->cur = tok->inp;
1195     return ERRORTOKEN;
1196 }
1197 
1198 static int
parser_warn(struct tok_state * tok,PyObject * category,const char * format,...)1199 parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
1200 {
1201     if (!tok->report_warnings) {
1202         return 0;
1203     }
1204 
1205     PyObject *errmsg;
1206     va_list vargs;
1207 #ifdef HAVE_STDARG_PROTOTYPES
1208     va_start(vargs, format);
1209 #else
1210     va_start(vargs);
1211 #endif
1212     errmsg = PyUnicode_FromFormatV(format, vargs);
1213     va_end(vargs);
1214     if (!errmsg) {
1215         goto error;
1216     }
1217 
1218     if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
1219                                  tok->lineno, NULL, NULL) < 0) {
1220         if (PyErr_ExceptionMatches(category)) {
1221             /* Replace the DeprecationWarning exception with a SyntaxError
1222                to get a more accurate error report */
1223             PyErr_Clear();
1224             syntaxerror(tok, "%U", errmsg);
1225         }
1226         goto error;
1227     }
1228     Py_DECREF(errmsg);
1229     return 0;
1230 
1231 error:
1232     Py_XDECREF(errmsg);
1233     tok->done = E_ERROR;
1234     return -1;
1235 }
1236 
/* Check whether the upcoming input is exactly the string `test` and is
   not followed by a character that could continue an identifier.

   Every character read is pushed back before returning, so the position
   of `tok` is left unchanged.  Returns 1 on a match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *expected = test;
    int matched = 0;
    int c = tok_nextc(tok);

    /* Consume input for as long as it agrees with `test`. */
    while (*expected != 0 && c == *expected) {
        expected++;
        c = tok_nextc(tok);
    }

    if (*expected == 0) {
        /* All of `test` was seen; `c` is the character right after it. */
        matched = !is_potential_identifier_char(c);
    }

    /* Undo the reads: first the character that ended the scan, then the
       matched prefix in reverse order. */
    tok_backup(tok, c);
    while (expected != test) {
        tok_backup(tok, *--expected);
    }
    return matched;
}
1259 
1260 static int
verify_end_of_number(struct tok_state * tok,int c,const char * kind)1261 verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1262 {
1263     /* Emit a deprecation warning only if the numeric literal is immediately
1264      * followed by one of keywords which can occur after a numeric literal
1265      * in valid code: "and", "else", "for", "if", "in", "is" and "or".
1266      * It allows to gradually deprecate existing valid code without adding
1267      * warning before error in most cases of invalid numeric literal (which
1268      * would be confusing and break existing tests).
1269      * Raise a syntax error with slightly better message than plain
1270      * "invalid syntax" if the numeric literal is immediately followed by
1271      * other keyword or identifier.
1272      */
1273     int r = 0;
1274     if (c == 'a') {
1275         r = lookahead(tok, "nd");
1276     }
1277     else if (c == 'e') {
1278         r = lookahead(tok, "lse");
1279     }
1280     else if (c == 'f') {
1281         r = lookahead(tok, "or");
1282     }
1283     else if (c == 'i') {
1284         int c2 = tok_nextc(tok);
1285         if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1286             r = 1;
1287         }
1288         tok_backup(tok, c2);
1289     }
1290     else if (c == 'o') {
1291         r = lookahead(tok, "r");
1292     }
1293     else if (c == 'n') {
1294         r = lookahead(tok, "ot");
1295     }
1296     if (r) {
1297         tok_backup(tok, c);
1298         if (parser_warn(tok, PyExc_SyntaxWarning,
1299                 "invalid %s literal", kind))
1300         {
1301             return 0;
1302         }
1303         tok_nextc(tok);
1304     }
1305     else /* In future releases, only error will remain. */
1306     if (is_potential_identifier_char(c)) {
1307         tok_backup(tok, c);
1308         syntaxerror(tok, "invalid %s literal", kind);
1309         return 0;
1310     }
1311     return 1;
1312 }
1313 
/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.

   The identifier is the byte range from tok->start to tok->cur.
   Returns 1 if it is valid.  Returns 0 otherwise: immediately if
   tok->decoding_erred is already set, with tok->done set to E_DECODE or
   E_ERROR if the bytes cannot be processed, or after reporting a syntax
   error that names the offending character.
 */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    /* NOTE(review): the result is used below as the index of the first
       character that cannot be part of an identifier (the string length
       when there is none), negative on failure -- confirm against
       _PyUnicode_ScanIdentifier(). */
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            /* Re-encode everything up to and including the offending
               character; the size of the result is used to move tok->cur
               to just past that character.  Py_SETREF releases the
               previous value of s at each step. */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        // PyUnicode_FromFormatV() does not support %X
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
        }
        else {
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}
1369 
/* Consume the rest of a run of decimal digits in which single underscores
   may separate groups of digits.

   Returns the first character read that is neither a digit nor an
   underscore followed by a digit; that character is not pushed back.
   If an underscore is not followed by a digit, the offending character is
   pushed back, a syntax error is reported and 0 is returned. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);

        /* Skip over one group of digits. */
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by another digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1391 
1392 /* Get next token, after space stripping etc. */
1393 
1394 static inline int
tok_continuation_line(struct tok_state * tok)1395 tok_continuation_line(struct tok_state *tok) {
1396     int c = tok_nextc(tok);
1397     if (c != '\n') {
1398         tok->done = E_LINECONT;
1399         return -1;
1400     }
1401     c = tok_nextc(tok);
1402     if (c == EOF) {
1403         tok->done = E_EOF;
1404         tok->cur = tok->inp;
1405         return -1;
1406     } else {
1407         tok_backup(tok, c);
1408     }
1409     return c;
1410 }
1411 
1412 static int
tok_get(struct tok_state * tok,const char ** p_start,const char ** p_end)1413 tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1414 {
1415     int c;
1416     int blankline, nonascii;
1417 
1418     *p_start = *p_end = NULL;
1419   nextline:
1420     tok->start = NULL;
1421     blankline = 0;
1422 
1423     /* Get indentation level */
1424     if (tok->atbol) {
1425         int col = 0;
1426         int altcol = 0;
1427         tok->atbol = 0;
1428         int cont_line_col = 0;
1429         for (;;) {
1430             c = tok_nextc(tok);
1431             if (c == ' ') {
1432                 col++, altcol++;
1433             }
1434             else if (c == '\t') {
1435                 col = (col / tok->tabsize + 1) * tok->tabsize;
1436                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1437             }
1438             else if (c == '\014')  {/* Control-L (formfeed) */
1439                 col = altcol = 0; /* For Emacs users */
1440             }
1441             else if (c == '\\') {
1442                 // Indentation cannot be split over multiple physical lines
1443                 // using backslashes. This means that if we found a backslash
1444                 // preceded by whitespace, **the first one we find** determines
1445                 // the level of indentation of whatever comes next.
1446                 cont_line_col = cont_line_col ? cont_line_col : col;
1447                 if ((c = tok_continuation_line(tok)) == -1) {
1448                     return ERRORTOKEN;
1449                 }
1450             }
1451             else {
1452                 break;
1453             }
1454         }
1455         tok_backup(tok, c);
1456         if (c == '#' || c == '\n') {
1457             /* Lines with only whitespace and/or comments
1458                shouldn't affect the indentation and are
1459                not passed to the parser as NEWLINE tokens,
1460                except *totally* empty lines in interactive
1461                mode, which signal the end of a command group. */
1462             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1463                 blankline = 0; /* Let it through */
1464             }
1465             else if (tok->prompt != NULL && tok->lineno == 1) {
1466                 /* In interactive mode, if the first line contains
1467                    only spaces and/or a comment, let it through. */
1468                 blankline = 0;
1469                 col = altcol = 0;
1470             }
1471             else {
1472                 blankline = 1; /* Ignore completely */
1473             }
1474             /* We can't jump back right here since we still
1475                may need to skip to the end of a comment */
1476         }
1477         if (!blankline && tok->level == 0) {
1478             col = cont_line_col ? cont_line_col : col;
1479             altcol = cont_line_col ? cont_line_col : altcol;
1480             if (col == tok->indstack[tok->indent]) {
1481                 /* No change */
1482                 if (altcol != tok->altindstack[tok->indent]) {
1483                     return indenterror(tok);
1484                 }
1485             }
1486             else if (col > tok->indstack[tok->indent]) {
1487                 /* Indent -- always one */
1488                 if (tok->indent+1 >= MAXINDENT) {
1489                     tok->done = E_TOODEEP;
1490                     tok->cur = tok->inp;
1491                     return ERRORTOKEN;
1492                 }
1493                 if (altcol <= tok->altindstack[tok->indent]) {
1494                     return indenterror(tok);
1495                 }
1496                 tok->pendin++;
1497                 tok->indstack[++tok->indent] = col;
1498                 tok->altindstack[tok->indent] = altcol;
1499             }
1500             else /* col < tok->indstack[tok->indent] */ {
1501                 /* Dedent -- any number, must be consistent */
1502                 while (tok->indent > 0 &&
1503                     col < tok->indstack[tok->indent]) {
1504                     tok->pendin--;
1505                     tok->indent--;
1506                 }
1507                 if (col != tok->indstack[tok->indent]) {
1508                     tok->done = E_DEDENT;
1509                     tok->cur = tok->inp;
1510                     return ERRORTOKEN;
1511                 }
1512                 if (altcol != tok->altindstack[tok->indent]) {
1513                     return indenterror(tok);
1514                 }
1515             }
1516         }
1517     }
1518 
1519     tok->start = tok->cur;
1520 
1521     /* Return pending indents/dedents */
1522     if (tok->pendin != 0) {
1523         if (tok->pendin < 0) {
1524             tok->pendin++;
1525             return DEDENT;
1526         }
1527         else {
1528             tok->pendin--;
1529             return INDENT;
1530         }
1531     }
1532 
1533     /* Peek ahead at the next character */
1534     c = tok_nextc(tok);
1535     tok_backup(tok, c);
1536     /* Check if we are closing an async function */
1537     if (tok->async_def
1538         && !blankline
1539         /* Due to some implementation artifacts of type comments,
1540          * a TYPE_COMMENT at the start of a function won't set an
1541          * indentation level and it will produce a NEWLINE after it.
1542          * To avoid spuriously ending an async function due to this,
1543          * wait until we have some non-newline char in front of us. */
1544         && c != '\n'
1545         && tok->level == 0
1546         /* There was a NEWLINE after ASYNC DEF,
1547            so we're past the signature. */
1548         && tok->async_def_nl
1549         /* Current indentation level is less than where
1550            the async function was defined */
1551         && tok->async_def_indent >= tok->indent)
1552     {
1553         tok->async_def = 0;
1554         tok->async_def_indent = 0;
1555         tok->async_def_nl = 0;
1556     }
1557 
1558  again:
1559     tok->start = NULL;
1560     /* Skip spaces */
1561     do {
1562         c = tok_nextc(tok);
1563     } while (c == ' ' || c == '\t' || c == '\014');
1564 
1565     /* Set start of current token */
1566     tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
1567 
1568     /* Skip comment, unless it's a type comment */
1569     if (c == '#') {
1570         const char *prefix, *p, *type_start;
1571 
1572         while (c != EOF && c != '\n') {
1573             c = tok_nextc(tok);
1574         }
1575 
1576         if (tok->type_comments) {
1577             p = tok->start;
1578             prefix = type_comment_prefix;
1579             while (*prefix && p < tok->cur) {
1580                 if (*prefix == ' ') {
1581                     while (*p == ' ' || *p == '\t') {
1582                         p++;
1583                     }
1584                 } else if (*prefix == *p) {
1585                     p++;
1586                 } else {
1587                     break;
1588                 }
1589 
1590                 prefix++;
1591             }
1592 
1593             /* This is a type comment if we matched all of type_comment_prefix. */
1594             if (!*prefix) {
1595                 int is_type_ignore = 1;
1596                 const char *ignore_end = p + 6;
1597                 tok_backup(tok, c);  /* don't eat the newline or EOF */
1598 
1599                 type_start = p;
1600 
1601                 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1602                  * or anything ASCII and non-alphanumeric. */
1603                 is_type_ignore = (
1604                     tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1605                     && !(tok->cur > ignore_end
1606                          && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1607 
1608                 if (is_type_ignore) {
1609                     *p_start = ignore_end;
1610                     *p_end = tok->cur;
1611 
1612                     /* If this type ignore is the only thing on the line, consume the newline also. */
1613                     if (blankline) {
1614                         tok_nextc(tok);
1615                         tok->atbol = 1;
1616                     }
1617                     return TYPE_IGNORE;
1618                 } else {
1619                     *p_start = type_start;  /* after type_comment_prefix */
1620                     *p_end = tok->cur;
1621                     return TYPE_COMMENT;
1622                 }
1623             }
1624         }
1625     }
1626 
1627     if (tok->done == E_INTERACT_STOP) {
1628         return ENDMARKER;
1629     }
1630 
1631     /* Check for EOF and errors now */
1632     if (c == EOF) {
1633         if (tok->level) {
1634             return ERRORTOKEN;
1635         }
1636         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1637     }
1638 
1639     /* Identifier (most frequent token!) */
1640     nonascii = 0;
1641     if (is_potential_identifier_start(c)) {
1642         /* Process the various legal combinations of b"", r"", u"", and f"". */
1643         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1644         while (1) {
1645             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1646                 saw_b = 1;
1647             /* Since this is a backwards compatibility support literal we don't
1648                want to support it in arbitrary order like byte literals. */
1649             else if (!(saw_b || saw_u || saw_r || saw_f)
1650                      && (c == 'u'|| c == 'U')) {
1651                 saw_u = 1;
1652             }
1653             /* ur"" and ru"" are not supported */
1654             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1655                 saw_r = 1;
1656             }
1657             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1658                 saw_f = 1;
1659             }
1660             else {
1661                 break;
1662             }
1663             c = tok_nextc(tok);
1664             if (c == '"' || c == '\'') {
1665                 goto letter_quote;
1666             }
1667         }
1668         while (is_potential_identifier_char(c)) {
1669             if (c >= 128) {
1670                 nonascii = 1;
1671             }
1672             c = tok_nextc(tok);
1673         }
1674         tok_backup(tok, c);
1675         if (nonascii && !verify_identifier(tok)) {
1676             return ERRORTOKEN;
1677         }
1678 
1679         *p_start = tok->start;
1680         *p_end = tok->cur;
1681 
1682         /* async/await parsing block. */
1683         if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1684             /* May be an 'async' or 'await' token.  For Python 3.7 or
1685                later we recognize them unconditionally.  For Python
1686                3.5 or 3.6 we recognize 'async' in front of 'def', and
1687                either one inside of 'async def'.  (Technically we
1688                shouldn't recognize these at all for 3.4 or earlier,
1689                but there's no *valid* Python 3.4 code that would be
1690                rejected, and async functions will be rejected in a
1691                later phase.) */
1692             if (!tok->async_hacks || tok->async_def) {
1693                 /* Always recognize the keywords. */
1694                 if (memcmp(tok->start, "async", 5) == 0) {
1695                     return ASYNC;
1696                 }
1697                 if (memcmp(tok->start, "await", 5) == 0) {
1698                     return AWAIT;
1699                 }
1700             }
1701             else if (memcmp(tok->start, "async", 5) == 0) {
1702                 /* The current token is 'async'.
1703                    Look ahead one token to see if that is 'def'. */
1704 
1705                 struct tok_state ahead_tok;
1706                 const char *ahead_tok_start = NULL;
1707                 const char *ahead_tok_end = NULL;
1708                 int ahead_tok_kind;
1709 
1710                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1711                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1712                                          &ahead_tok_end);
1713 
1714                 if (ahead_tok_kind == NAME
1715                     && ahead_tok.cur - ahead_tok.start == 3
1716                     && memcmp(ahead_tok.start, "def", 3) == 0)
1717                 {
1718                     /* The next token is going to be 'def', so instead of
1719                        returning a plain NAME token, return ASYNC. */
1720                     tok->async_def_indent = tok->indent;
1721                     tok->async_def = 1;
1722                     return ASYNC;
1723                 }
1724             }
1725         }
1726 
1727         return NAME;
1728     }
1729 
1730     /* Newline */
1731     if (c == '\n') {
1732         tok->atbol = 1;
1733         if (blankline || tok->level > 0) {
1734             goto nextline;
1735         }
1736         *p_start = tok->start;
1737         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1738         tok->cont_line = 0;
1739         if (tok->async_def) {
1740             /* We're somewhere inside an 'async def' function, and
1741                we've encountered a NEWLINE after its signature. */
1742             tok->async_def_nl = 1;
1743         }
1744         return NEWLINE;
1745     }
1746 
1747     /* Period or number starting with period? */
1748     if (c == '.') {
1749         c = tok_nextc(tok);
1750         if (isdigit(c)) {
1751             goto fraction;
1752         } else if (c == '.') {
1753             c = tok_nextc(tok);
1754             if (c == '.') {
1755                 *p_start = tok->start;
1756                 *p_end = tok->cur;
1757                 return ELLIPSIS;
1758             }
1759             else {
1760                 tok_backup(tok, c);
1761             }
1762             tok_backup(tok, '.');
1763         }
1764         else {
1765             tok_backup(tok, c);
1766         }
1767         *p_start = tok->start;
1768         *p_end = tok->cur;
1769         return DOT;
1770     }
1771 
1772     /* Number */
1773     if (isdigit(c)) {
1774         if (c == '0') {
1775             /* Hex, octal or binary -- maybe. */
1776             c = tok_nextc(tok);
1777             if (c == 'x' || c == 'X') {
1778                 /* Hex */
1779                 c = tok_nextc(tok);
1780                 do {
1781                     if (c == '_') {
1782                         c = tok_nextc(tok);
1783                     }
1784                     if (!isxdigit(c)) {
1785                         tok_backup(tok, c);
1786                         return syntaxerror(tok, "invalid hexadecimal literal");
1787                     }
1788                     do {
1789                         c = tok_nextc(tok);
1790                     } while (isxdigit(c));
1791                 } while (c == '_');
1792                 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1793                     return ERRORTOKEN;
1794                 }
1795             }
1796             else if (c == 'o' || c == 'O') {
1797                 /* Octal */
1798                 c = tok_nextc(tok);
1799                 do {
1800                     if (c == '_') {
1801                         c = tok_nextc(tok);
1802                     }
1803                     if (c < '0' || c >= '8') {
1804                         if (isdigit(c)) {
1805                             return syntaxerror(tok,
1806                                     "invalid digit '%c' in octal literal", c);
1807                         }
1808                         else {
1809                             tok_backup(tok, c);
1810                             return syntaxerror(tok, "invalid octal literal");
1811                         }
1812                     }
1813                     do {
1814                         c = tok_nextc(tok);
1815                     } while ('0' <= c && c < '8');
1816                 } while (c == '_');
1817                 if (isdigit(c)) {
1818                     return syntaxerror(tok,
1819                             "invalid digit '%c' in octal literal", c);
1820                 }
1821                 if (!verify_end_of_number(tok, c, "octal")) {
1822                     return ERRORTOKEN;
1823                 }
1824             }
1825             else if (c == 'b' || c == 'B') {
1826                 /* Binary */
1827                 c = tok_nextc(tok);
1828                 do {
1829                     if (c == '_') {
1830                         c = tok_nextc(tok);
1831                     }
1832                     if (c != '0' && c != '1') {
1833                         if (isdigit(c)) {
1834                             return syntaxerror(tok,
1835                                     "invalid digit '%c' in binary literal", c);
1836                         }
1837                         else {
1838                             tok_backup(tok, c);
1839                             return syntaxerror(tok, "invalid binary literal");
1840                         }
1841                     }
1842                     do {
1843                         c = tok_nextc(tok);
1844                     } while (c == '0' || c == '1');
1845                 } while (c == '_');
1846                 if (isdigit(c)) {
1847                     return syntaxerror(tok,
1848                             "invalid digit '%c' in binary literal", c);
1849                 }
1850                 if (!verify_end_of_number(tok, c, "binary")) {
1851                     return ERRORTOKEN;
1852                 }
1853             }
1854             else {
1855                 int nonzero = 0;
1856                 /* maybe old-style octal; c is first char of it */
1857                 /* in any case, allow '0' as a literal */
1858                 while (1) {
1859                     if (c == '_') {
1860                         c = tok_nextc(tok);
1861                         if (!isdigit(c)) {
1862                             tok_backup(tok, c);
1863                             return syntaxerror(tok, "invalid decimal literal");
1864                         }
1865                     }
1866                     if (c != '0') {
1867                         break;
1868                     }
1869                     c = tok_nextc(tok);
1870                 }
1871                 char* zeros_end = tok->cur;
1872                 if (isdigit(c)) {
1873                     nonzero = 1;
1874                     c = tok_decimal_tail(tok);
1875                     if (c == 0) {
1876                         return ERRORTOKEN;
1877                     }
1878                 }
1879                 if (c == '.') {
1880                     c = tok_nextc(tok);
1881                     goto fraction;
1882                 }
1883                 else if (c == 'e' || c == 'E') {
1884                     goto exponent;
1885                 }
1886                 else if (c == 'j' || c == 'J') {
1887                     goto imaginary;
1888                 }
1889                 else if (nonzero) {
1890                     /* Old-style octal: now disallowed. */
1891                     tok_backup(tok, c);
1892                     return syntaxerror_known_range(
1893                             tok, (int)(tok->start + 1 - tok->line_start),
1894                             (int)(zeros_end - tok->line_start),
1895                             "leading zeros in decimal integer "
1896                             "literals are not permitted; "
1897                             "use an 0o prefix for octal integers");
1898                 }
1899                 if (!verify_end_of_number(tok, c, "decimal")) {
1900                     return ERRORTOKEN;
1901                 }
1902             }
1903         }
1904         else {
1905             /* Decimal */
1906             c = tok_decimal_tail(tok);
1907             if (c == 0) {
1908                 return ERRORTOKEN;
1909             }
1910             {
1911                 /* Accept floating point numbers. */
1912                 if (c == '.') {
1913                     c = tok_nextc(tok);
1914         fraction:
1915                     /* Fraction */
1916                     if (isdigit(c)) {
1917                         c = tok_decimal_tail(tok);
1918                         if (c == 0) {
1919                             return ERRORTOKEN;
1920                         }
1921                     }
1922                 }
1923                 if (c == 'e' || c == 'E') {
1924                     int e;
1925                   exponent:
1926                     e = c;
1927                     /* Exponent part */
1928                     c = tok_nextc(tok);
1929                     if (c == '+' || c == '-') {
1930                         c = tok_nextc(tok);
1931                         if (!isdigit(c)) {
1932                             tok_backup(tok, c);
1933                             return syntaxerror(tok, "invalid decimal literal");
1934                         }
1935                     } else if (!isdigit(c)) {
1936                         tok_backup(tok, c);
1937                         if (!verify_end_of_number(tok, e, "decimal")) {
1938                             return ERRORTOKEN;
1939                         }
1940                         tok_backup(tok, e);
1941                         *p_start = tok->start;
1942                         *p_end = tok->cur;
1943                         return NUMBER;
1944                     }
1945                     c = tok_decimal_tail(tok);
1946                     if (c == 0) {
1947                         return ERRORTOKEN;
1948                     }
1949                 }
1950                 if (c == 'j' || c == 'J') {
1951                     /* Imaginary part */
1952         imaginary:
1953                     c = tok_nextc(tok);
1954                     if (!verify_end_of_number(tok, c, "imaginary")) {
1955                         return ERRORTOKEN;
1956                     }
1957                 }
1958                 else if (!verify_end_of_number(tok, c, "decimal")) {
1959                     return ERRORTOKEN;
1960                 }
1961             }
1962         }
1963         tok_backup(tok, c);
1964         *p_start = tok->start;
1965         *p_end = tok->cur;
1966         return NUMBER;
1967     }
1968 
1969   letter_quote:
1970     /* String */
1971     if (c == '\'' || c == '"') {
1972         int quote = c;
1973         int quote_size = 1;             /* 1 or 3 */
1974         int end_quote_size = 0;
1975 
1976         /* Nodes of type STRING, especially multi line strings
1977            must be handled differently in order to get both
1978            the starting line number and the column offset right.
1979            (cf. issue 16806) */
1980         tok->first_lineno = tok->lineno;
1981         tok->multi_line_start = tok->line_start;
1982 
1983         /* Find the quote size and start of string */
1984         c = tok_nextc(tok);
1985         if (c == quote) {
1986             c = tok_nextc(tok);
1987             if (c == quote) {
1988                 quote_size = 3;
1989             }
1990             else {
1991                 end_quote_size = 1;     /* empty string found */
1992             }
1993         }
1994         if (c != quote) {
1995             tok_backup(tok, c);
1996         }
1997 
1998         /* Get rest of string */
1999         while (end_quote_size != quote_size) {
2000             c = tok_nextc(tok);
2001             if (tok->done == E_ERROR) {
2002                 return ERRORTOKEN;
2003             }
2004             if (tok->done == E_DECODE) {
2005                 break;
2006             }
2007             if (c == EOF || (quote_size == 1 && c == '\n')) {
2008                 assert(tok->multi_line_start != NULL);
2009                 // shift the tok_state's location into
2010                 // the start of string, and report the error
2011                 // from the initial quote character
2012                 tok->cur = (char *)tok->start;
2013                 tok->cur++;
2014                 tok->line_start = tok->multi_line_start;
2015                 int start = tok->lineno;
2016                 tok->lineno = tok->first_lineno;
2017                 if (quote_size == 3) {
2018                     syntaxerror(tok, "unterminated triple-quoted string literal"
2019                                      " (detected at line %d)", start);
2020                     if (c != '\n') {
2021                         tok->done = E_EOFS;
2022                     }
2023                     return ERRORTOKEN;
2024                 }
2025                 else {
2026                     syntaxerror(tok, "unterminated string literal (detected at"
2027                                      " line %d)", start);
2028                     if (c != '\n') {
2029                         tok->done = E_EOLS;
2030                     }
2031                     return ERRORTOKEN;
2032                 }
2033             }
2034             if (c == quote) {
2035                 end_quote_size += 1;
2036             }
2037             else {
2038                 end_quote_size = 0;
2039                 if (c == '\\') {
2040                     tok_nextc(tok);  /* skip escaped char */
2041                 }
2042             }
2043         }
2044 
2045         *p_start = tok->start;
2046         *p_end = tok->cur;
2047         return STRING;
2048     }
2049 
2050     /* Line continuation */
2051     if (c == '\\') {
2052         if ((c = tok_continuation_line(tok)) == -1) {
2053             return ERRORTOKEN;
2054         }
2055         tok->cont_line = 1;
2056         goto again; /* Read next line */
2057     }
2058 
2059     /* Check for two-character token */
2060     {
2061         int c2 = tok_nextc(tok);
2062         int token = PyToken_TwoChars(c, c2);
2063         if (token != OP) {
2064             int c3 = tok_nextc(tok);
2065             int token3 = PyToken_ThreeChars(c, c2, c3);
2066             if (token3 != OP) {
2067                 token = token3;
2068             }
2069             else {
2070                 tok_backup(tok, c3);
2071             }
2072             *p_start = tok->start;
2073             *p_end = tok->cur;
2074             return token;
2075         }
2076         tok_backup(tok, c2);
2077     }
2078 
2079     /* Keep track of parentheses nesting level */
2080     switch (c) {
2081     case '(':
2082     case '[':
2083     case '{':
2084         if (tok->level >= MAXLEVEL) {
2085             return syntaxerror(tok, "too many nested parentheses");
2086         }
2087         tok->parenstack[tok->level] = c;
2088         tok->parenlinenostack[tok->level] = tok->lineno;
2089         tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2090         tok->level++;
2091         break;
2092     case ')':
2093     case ']':
2094     case '}':
2095         if (!tok->level) {
2096             return syntaxerror(tok, "unmatched '%c'", c);
2097         }
2098         tok->level--;
2099         int opening = tok->parenstack[tok->level];
2100         if (!((opening == '(' && c == ')') ||
2101               (opening == '[' && c == ']') ||
2102               (opening == '{' && c == '}')))
2103         {
2104             if (tok->parenlinenostack[tok->level] != tok->lineno) {
2105                 return syntaxerror(tok,
2106                         "closing parenthesis '%c' does not match "
2107                         "opening parenthesis '%c' on line %d",
2108                         c, opening, tok->parenlinenostack[tok->level]);
2109             }
2110             else {
2111                 return syntaxerror(tok,
2112                         "closing parenthesis '%c' does not match "
2113                         "opening parenthesis '%c'",
2114                         c, opening);
2115             }
2116         }
2117         break;
2118     }
2119 
2120     if (!Py_UNICODE_ISPRINTABLE(c)) {
2121         char hex[9];
2122         (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
2123         return syntaxerror(tok, "invalid non-printable character U+%s", hex);
2124     }
2125 
2126     /* Punctuation character */
2127     *p_start = tok->start;
2128     *p_end = tok->cur;
2129     return PyToken_OneChar(c);
2130 }
2131 
2132 int
_PyTokenizer_Get(struct tok_state * tok,const char ** p_start,const char ** p_end)2133 _PyTokenizer_Get(struct tok_state *tok,
2134                  const char **p_start, const char **p_end)
2135 {
2136     int result = tok_get(tok, p_start, p_end);
2137     if (tok->decoding_erred) {
2138         result = ERRORTOKEN;
2139         tok->done = E_DECODE;
2140     }
2141     return result;
2142 }
2143 
2144 #if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
2145 // fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
2146 // dup() emulation with open() is slow.
/* Carries an int file descriptor through the void* cookie argument of
   fopencookie(): fdopen_borrow() fills in `fd` and passes `cookie`, while
   borrow_read() fills in `cookie` and reads `fd` back out. */
typedef union {
    void *cookie;   /* view handed to / received from the cookie I/O layer */
    int fd;         /* the borrowed file descriptor */
} borrowed;
2151 
2152 static ssize_t
borrow_read(void * cookie,char * buf,size_t size)2153 borrow_read(void *cookie, char *buf, size_t size)
2154 {
2155     borrowed b = {.cookie = cookie};
2156     return read(b.fd, (void *)buf, size);
2157 }
2158 
2159 static FILE *
fdopen_borrow(int fd)2160 fdopen_borrow(int fd) {
2161     // supports only reading. seek fails. close and write are no-ops.
2162     cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
2163     borrowed b = {.fd = fd};
2164     return fopencookie(b.cookie, "r", io_cb);
2165 }
2166 #else
/* Open a read-only FILE* on a duplicate of `fd`, so that the stream owns the
   duplicate rather than the caller's descriptor.

   Returns NULL if _Py_dup() fails or if fdopen() fails.  In the latter case
   the duplicated descriptor is closed before returning so it is not leaked. */
static FILE *
fdopen_borrow(int fd) {
    int dup_fd = _Py_dup(fd);
    if (dup_fd < 0) {
        return NULL;
    }

    FILE *fp = fdopen(dup_fd, "r");
    if (fp == NULL) {
        /* fdopen() leaves the descriptor open on failure; release the
           duplicate here since nobody else holds a reference to it.
           NOTE(review): close() is expected to be declared through Python.h,
           like read() in the WASI variant -- confirm on all platforms. */
        close(dup_fd);
    }
    return fp;
}
2175 #endif
2176 
2177 /* Get the encoding of a Python file. Check for the coding cookie and check if
2178    the file starts with a BOM.
2179 
2180    _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2181    encoding in the first or second line of the file (in which case the encoding
2182    should be assumed to be UTF-8).
2183 
2184    The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2185    by the caller. */
2186 
2187 char *
_PyTokenizer_FindEncodingFilename(int fd,PyObject * filename)2188 _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2189 {
2190     struct tok_state *tok;
2191     FILE *fp;
2192     const char *p_start = NULL;
2193     const char *p_end = NULL;
2194     char *encoding = NULL;
2195 
2196     fp = fdopen_borrow(fd);
2197     if (fp == NULL) {
2198         return NULL;
2199     }
2200     tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2201     if (tok == NULL) {
2202         fclose(fp);
2203         return NULL;
2204     }
2205     if (filename != NULL) {
2206         Py_INCREF(filename);
2207         tok->filename = filename;
2208     }
2209     else {
2210         tok->filename = PyUnicode_FromString("<string>");
2211         if (tok->filename == NULL) {
2212             fclose(fp);
2213             _PyTokenizer_Free(tok);
2214             return encoding;
2215         }
2216     }
2217     // We don't want to report warnings here because it could cause infinite recursion
2218     // if fetching the encoding shows a warning.
2219     tok->report_warnings = 0;
2220     while (tok->lineno < 2 && tok->done == E_OK) {
2221         _PyTokenizer_Get(tok, &p_start, &p_end);
2222     }
2223     fclose(fp);
2224     if (tok->encoding) {
2225         encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2226         if (encoding) {
2227             strcpy(encoding, tok->encoding);
2228         }
2229     }
2230     _PyTokenizer_Free(tok);
2231     return encoding;
2232 }
2233 
2234 #ifdef Py_DEBUG
2235 void
tok_dump(int type,char * start,char * end)2236 tok_dump(int type, char *start, char *end)
2237 {
2238     fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2239     if (type == NAME || type == NUMBER || type == STRING || type == OP)
2240         fprintf(stderr, "(%.*s)", (int)(end - start), start);
2241 }
2242 #endif  // Py_DEBUG
2243