/*
    unicode_format.h -- implementation of str.format().
*/
4 
5 #include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
6 
/************************************************************************/
/***********   Global data structures and forward declarations  *********/
/************************************************************************/
10 
11 /*
12    A SubString consists of the characters between two string or
13    unicode pointers.
14 */
15 typedef struct {
16     PyObject *str; /* borrowed reference */
17     Py_ssize_t start, end;
18 } SubString;
19 
20 
/* Keep track if we're auto-numbering fields */
typedef enum {
    ANS_INIT,    /* no numerically indexed field has been seen yet */
    ANS_AUTO,    /* fields are being numbered automatically ("{}") */
    ANS_MANUAL   /* fields carry explicit numbers ("{0}") */
} AutoNumberState;
26 
27 /* Keeps track of our auto-numbering state, and which number field we're on */
28 typedef struct {
29     AutoNumberState an_state;
30     int an_field_number;
31 } AutoNumber;
32 
33 
34 /* forward declaration for recursion */
35 static PyObject *
36 build_string(SubString *input, PyObject *args, PyObject *kwargs,
37              int recursion_depth, AutoNumber *auto_number);
38 
39 
40 
/************************************************************************/
/**************************  Utility  functions  ************************/
/************************************************************************/
44 
45 static void
AutoNumber_Init(AutoNumber * auto_number)46 AutoNumber_Init(AutoNumber *auto_number)
47 {
48     auto_number->an_state = ANS_INIT;
49     auto_number->an_field_number = 0;
50 }
51 
52 /* fill in a SubString from a pointer and length */
53 Py_LOCAL_INLINE(void)
SubString_init(SubString * str,PyObject * s,Py_ssize_t start,Py_ssize_t end)54 SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
55 {
56     str->str = s;
57     str->start = start;
58     str->end = end;
59 }
60 
61 /* return a new string.  if str->str is NULL, return None */
62 Py_LOCAL_INLINE(PyObject *)
SubString_new_object(SubString * str)63 SubString_new_object(SubString *str)
64 {
65     if (str->str == NULL)
66         Py_RETURN_NONE;
67     return PyUnicode_Substring(str->str, str->start, str->end);
68 }
69 
70 /* return a new string.  if str->str is NULL, return a new empty string */
71 Py_LOCAL_INLINE(PyObject *)
SubString_new_object_or_empty(SubString * str)72 SubString_new_object_or_empty(SubString *str)
73 {
74     if (str->str == NULL) {
75         return PyUnicode_New(0, 0);
76     }
77     return SubString_new_object(str);
78 }
79 
80 /* Return 1 if an error has been detected switching between automatic
81    field numbering and manual field specification, else return 0. Set
82    ValueError on error. */
83 static int
autonumber_state_error(AutoNumberState state,int field_name_is_empty)84 autonumber_state_error(AutoNumberState state, int field_name_is_empty)
85 {
86     if (state == ANS_MANUAL) {
87         if (field_name_is_empty) {
88             PyErr_SetString(PyExc_ValueError, "cannot switch from "
89                             "manual field specification to "
90                             "automatic field numbering");
91             return 1;
92         }
93     }
94     else {
95         if (!field_name_is_empty) {
96             PyErr_SetString(PyExc_ValueError, "cannot switch from "
97                             "automatic field numbering to "
98                             "manual field specification");
99             return 1;
100         }
101     }
102     return 0;
103 }
104 
105 
/************************************************************************/
/***********  Format string parsing -- integers and identifiers *********/
/************************************************************************/
109 
110 static Py_ssize_t
get_integer(const SubString * str)111 get_integer(const SubString *str)
112 {
113     Py_ssize_t accumulator = 0;
114     Py_ssize_t digitval;
115     Py_ssize_t i;
116 
117     /* empty string is an error */
118     if (str->start >= str->end)
119         return -1;
120 
121     for (i = str->start; i < str->end; i++) {
122         digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
123         if (digitval < 0)
124             return -1;
125         /*
126            Detect possible overflow before it happens:
127 
128               accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
129               accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
130         */
131         if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
132             PyErr_Format(PyExc_ValueError,
133                          "Too many decimal digits in format string");
134             return -1;
135         }
136         accumulator = accumulator * 10 + digitval;
137     }
138     return accumulator;
139 }
140 
/************************************************************************/
/******** Functions to get field objects and specification strings ******/
/************************************************************************/
144 
145 /* do the equivalent of obj.name */
146 static PyObject *
getattr(PyObject * obj,SubString * name)147 getattr(PyObject *obj, SubString *name)
148 {
149     PyObject *newobj;
150     PyObject *str = SubString_new_object(name);
151     if (str == NULL)
152         return NULL;
153     newobj = PyObject_GetAttr(obj, str);
154     Py_DECREF(str);
155     return newobj;
156 }
157 
158 /* do the equivalent of obj[idx], where obj is a sequence */
159 static PyObject *
getitem_sequence(PyObject * obj,Py_ssize_t idx)160 getitem_sequence(PyObject *obj, Py_ssize_t idx)
161 {
162     return PySequence_GetItem(obj, idx);
163 }
164 
165 /* do the equivalent of obj[idx], where obj is not a sequence */
166 static PyObject *
getitem_idx(PyObject * obj,Py_ssize_t idx)167 getitem_idx(PyObject *obj, Py_ssize_t idx)
168 {
169     PyObject *newobj;
170     PyObject *idx_obj = PyLong_FromSsize_t(idx);
171     if (idx_obj == NULL)
172         return NULL;
173     newobj = PyObject_GetItem(obj, idx_obj);
174     Py_DECREF(idx_obj);
175     return newobj;
176 }
177 
178 /* do the equivalent of obj[name] */
179 static PyObject *
getitem_str(PyObject * obj,SubString * name)180 getitem_str(PyObject *obj, SubString *name)
181 {
182     PyObject *newobj;
183     PyObject *str = SubString_new_object(name);
184     if (str == NULL)
185         return NULL;
186     newobj = PyObject_GetItem(obj, str);
187     Py_DECREF(str);
188     return newobj;
189 }
190 
191 typedef struct {
192     /* the entire string we're parsing.  we assume that someone else
193        is managing its lifetime, and that it will exist for the
194        lifetime of the iterator.  can be empty */
195     SubString str;
196 
197     /* index to where we are inside field_name */
198     Py_ssize_t index;
199 } FieldNameIterator;
200 
201 
202 static int
FieldNameIterator_init(FieldNameIterator * self,PyObject * s,Py_ssize_t start,Py_ssize_t end)203 FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
204                        Py_ssize_t start, Py_ssize_t end)
205 {
206     SubString_init(&self->str, s, start, end);
207     self->index = start;
208     return 1;
209 }
210 
211 static int
_FieldNameIterator_attr(FieldNameIterator * self,SubString * name)212 _FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
213 {
214     Py_UCS4 c;
215 
216     name->str = self->str.str;
217     name->start = self->index;
218 
219     /* return everything until '.' or '[' */
220     while (self->index < self->str.end) {
221         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
222         switch (c) {
223         case '[':
224         case '.':
225             /* backup so that we this character will be seen next time */
226             self->index--;
227             break;
228         default:
229             continue;
230         }
231         break;
232     }
233     /* end of string is okay */
234     name->end = self->index;
235     return 1;
236 }
237 
238 static int
_FieldNameIterator_item(FieldNameIterator * self,SubString * name)239 _FieldNameIterator_item(FieldNameIterator *self, SubString *name)
240 {
241     int bracket_seen = 0;
242     Py_UCS4 c;
243 
244     name->str = self->str.str;
245     name->start = self->index;
246 
247     /* return everything until ']' */
248     while (self->index < self->str.end) {
249         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
250         switch (c) {
251         case ']':
252             bracket_seen = 1;
253             break;
254         default:
255             continue;
256         }
257         break;
258     }
259     /* make sure we ended with a ']' */
260     if (!bracket_seen) {
261         PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
262         return 0;
263     }
264 
265     /* end of string is okay */
266     /* don't include the ']' */
267     name->end = self->index-1;
268     return 1;
269 }
270 
271 /* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
272 static int
FieldNameIterator_next(FieldNameIterator * self,int * is_attribute,Py_ssize_t * name_idx,SubString * name)273 FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
274                        Py_ssize_t *name_idx, SubString *name)
275 {
276     /* check at end of input */
277     if (self->index >= self->str.end)
278         return 1;
279 
280     switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
281     case '.':
282         *is_attribute = 1;
283         if (_FieldNameIterator_attr(self, name) == 0)
284             return 0;
285         *name_idx = -1;
286         break;
287     case '[':
288         *is_attribute = 0;
289         if (_FieldNameIterator_item(self, name) == 0)
290             return 0;
291         *name_idx = get_integer(name);
292         if (*name_idx == -1 && PyErr_Occurred())
293             return 0;
294         break;
295     default:
296         /* Invalid character follows ']' */
297         PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
298                         "follow ']' in format field specifier");
299         return 0;
300     }
301 
302     /* empty string is an error */
303     if (name->start == name->end) {
304         PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
305         return 0;
306     }
307 
308     return 2;
309 }
310 
311 
312 /* input: field_name
313    output: 'first' points to the part before the first '[' or '.'
314            'first_idx' is -1 if 'first' is not an integer, otherwise
315                        it's the value of first converted to an integer
316            'rest' is an iterator to return the rest
317 */
318 static int
field_name_split(PyObject * str,Py_ssize_t start,Py_ssize_t end,SubString * first,Py_ssize_t * first_idx,FieldNameIterator * rest,AutoNumber * auto_number)319 field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
320                  Py_ssize_t *first_idx, FieldNameIterator *rest,
321                  AutoNumber *auto_number)
322 {
323     Py_UCS4 c;
324     Py_ssize_t i = start;
325     int field_name_is_empty;
326     int using_numeric_index;
327 
328     /* find the part up until the first '.' or '[' */
329     while (i < end) {
330         switch (c = PyUnicode_READ_CHAR(str, i++)) {
331         case '[':
332         case '.':
333             /* backup so that we this character is available to the
334                "rest" iterator */
335             i--;
336             break;
337         default:
338             continue;
339         }
340         break;
341     }
342 
343     /* set up the return values */
344     SubString_init(first, str, start, i);
345     FieldNameIterator_init(rest, str, i, end);
346 
347     /* see if "first" is an integer, in which case it's used as an index */
348     *first_idx = get_integer(first);
349     if (*first_idx == -1 && PyErr_Occurred())
350         return 0;
351 
352     field_name_is_empty = first->start >= first->end;
353 
354     /* If the field name is omitted or if we have a numeric index
355        specified, then we're doing numeric indexing into args. */
356     using_numeric_index = field_name_is_empty || *first_idx != -1;
357 
358     /* We always get here exactly one time for each field we're
359        processing. And we get here in field order (counting by left
360        braces). So this is the perfect place to handle automatic field
361        numbering if the field name is omitted. */
362 
363     /* Check if we need to do the auto-numbering. It's not needed if
364        we're called from string.Format routines, because it's handled
365        in that class by itself. */
366     if (auto_number) {
367         /* Initialize our auto numbering state if this is the first
368            time we're either auto-numbering or manually numbering. */
369         if (auto_number->an_state == ANS_INIT && using_numeric_index)
370             auto_number->an_state = field_name_is_empty ?
371                 ANS_AUTO : ANS_MANUAL;
372 
373         /* Make sure our state is consistent with what we're doing
374            this time through. Only check if we're using a numeric
375            index. */
376         if (using_numeric_index)
377             if (autonumber_state_error(auto_number->an_state,
378                                        field_name_is_empty))
379                 return 0;
380         /* Zero length field means we want to do auto-numbering of the
381            fields. */
382         if (field_name_is_empty)
383             *first_idx = (auto_number->an_field_number)++;
384     }
385 
386     return 1;
387 }
388 
389 
390 /*
391     get_field_object returns the object inside {}, before the
392     format_spec.  It handles getindex and getattr lookups and consumes
393     the entire input string.
394 */
395 static PyObject *
get_field_object(SubString * input,PyObject * args,PyObject * kwargs,AutoNumber * auto_number)396 get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
397                  AutoNumber *auto_number)
398 {
399     PyObject *obj = NULL;
400     int ok;
401     int is_attribute;
402     SubString name;
403     SubString first;
404     Py_ssize_t index;
405     FieldNameIterator rest;
406 
407     if (!field_name_split(input->str, input->start, input->end, &first,
408                           &index, &rest, auto_number)) {
409         goto error;
410     }
411 
412     if (index == -1) {
413         /* look up in kwargs */
414         PyObject *key = SubString_new_object(&first);
415         if (key == NULL) {
416             goto error;
417         }
418         if (kwargs == NULL) {
419             PyErr_SetObject(PyExc_KeyError, key);
420             Py_DECREF(key);
421             goto error;
422         }
423         /* Use PyObject_GetItem instead of PyDict_GetItem because this
424            code is no longer just used with kwargs. It might be passed
425            a non-dict when called through format_map. */
426         obj = PyObject_GetItem(kwargs, key);
427         Py_DECREF(key);
428         if (obj == NULL) {
429             goto error;
430         }
431     }
432     else {
433         /* If args is NULL, we have a format string with a positional field
434            with only kwargs to retrieve it from. This can only happen when
435            used with format_map(), where positional arguments are not
436            allowed. */
437         if (args == NULL) {
438             PyErr_SetString(PyExc_ValueError, "Format string contains "
439                             "positional fields");
440             goto error;
441         }
442 
443         /* look up in args */
444         obj = PySequence_GetItem(args, index);
445         if (obj == NULL) {
446             PyErr_Format(PyExc_IndexError,
447                          "Replacement index %zd out of range for positional "
448                          "args tuple",
449                          index);
450              goto error;
451         }
452     }
453 
454     /* iterate over the rest of the field_name */
455     while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
456                                         &name)) == 2) {
457         PyObject *tmp;
458 
459         if (is_attribute)
460             /* getattr lookup "." */
461             tmp = getattr(obj, &name);
462         else
463             /* getitem lookup "[]" */
464             if (index == -1)
465                 tmp = getitem_str(obj, &name);
466             else
467                 if (PySequence_Check(obj))
468                     tmp = getitem_sequence(obj, index);
469                 else
470                     /* not a sequence */
471                     tmp = getitem_idx(obj, index);
472         if (tmp == NULL)
473             goto error;
474 
475         /* assign to obj */
476         Py_DECREF(obj);
477         obj = tmp;
478     }
479     /* end of iterator, this is the non-error case */
480     if (ok == 1)
481         return obj;
482 error:
483     Py_XDECREF(obj);
484     return NULL;
485 }
486 
/************************************************************************/
/*****************  Field rendering functions  **************************/
/************************************************************************/
490 
491 /*
492     render_field() is the main function in this section.  It takes the
493     field object and field specification string generated by
494     get_field_and_spec, and renders the field into the output string.
495 
496     render_field calls fieldobj.__format__(format_spec) method, and
497     appends to the output.
498 */
499 static int
render_field(PyObject * fieldobj,SubString * format_spec,_PyUnicodeWriter * writer)500 render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
501 {
502     int ok = 0;
503     PyObject *result = NULL;
504     PyObject *format_spec_object = NULL;
505     int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
506     int err;
507 
508     /* If we know the type exactly, skip the lookup of __format__ and just
509        call the formatter directly. */
510     if (PyUnicode_CheckExact(fieldobj))
511         formatter = _PyUnicode_FormatAdvancedWriter;
512     else if (PyLong_CheckExact(fieldobj))
513         formatter = _PyLong_FormatAdvancedWriter;
514     else if (PyFloat_CheckExact(fieldobj))
515         formatter = _PyFloat_FormatAdvancedWriter;
516     else if (PyComplex_CheckExact(fieldobj))
517         formatter = _PyComplex_FormatAdvancedWriter;
518 
519     if (formatter) {
520         /* we know exactly which formatter will be called when __format__ is
521            looked up, so call it directly, instead. */
522         err = formatter(writer, fieldobj, format_spec->str,
523                         format_spec->start, format_spec->end);
524         return (err == 0);
525     }
526     else {
527         /* We need to create an object out of the pointers we have, because
528            __format__ takes a string/unicode object for format_spec. */
529         if (format_spec->str)
530             format_spec_object = PyUnicode_Substring(format_spec->str,
531                                                      format_spec->start,
532                                                      format_spec->end);
533         else
534             format_spec_object = PyUnicode_New(0, 0);
535         if (format_spec_object == NULL)
536             goto done;
537 
538         result = PyObject_Format(fieldobj, format_spec_object);
539     }
540     if (result == NULL)
541         goto done;
542 
543     if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
544         goto done;
545     ok = 1;
546 
547 done:
548     Py_XDECREF(format_spec_object);
549     Py_XDECREF(result);
550     return ok;
551 }
552 
553 static int
parse_field(SubString * str,SubString * field_name,SubString * format_spec,int * format_spec_needs_expanding,Py_UCS4 * conversion)554 parse_field(SubString *str, SubString *field_name, SubString *format_spec,
555             int *format_spec_needs_expanding, Py_UCS4 *conversion)
556 {
557     /* Note this function works if the field name is zero length,
558        which is good.  Zero length field names are handled later, in
559        field_name_split. */
560 
561     Py_UCS4 c = 0;
562 
563     /* initialize these, as they may be empty */
564     *conversion = '\0';
565     SubString_init(format_spec, NULL, 0, 0);
566 
567     /* Search for the field name.  it's terminated by the end of
568        the string, or a ':' or '!' */
569     field_name->str = str->str;
570     field_name->start = str->start;
571     while (str->start < str->end) {
572         switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
573         case '{':
574             PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
575             return 0;
576         case '[':
577             for (; str->start < str->end; str->start++)
578                 if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
579                     break;
580             continue;
581         case '}':
582         case ':':
583         case '!':
584             break;
585         default:
586             continue;
587         }
588         break;
589     }
590 
591     field_name->end = str->start - 1;
592     if (c == '!' || c == ':') {
593         Py_ssize_t count;
594         /* we have a format specifier and/or a conversion */
595         /* don't include the last character */
596 
597         /* see if there's a conversion specifier */
598         if (c == '!') {
599             /* there must be another character present */
600             if (str->start >= str->end) {
601                 PyErr_SetString(PyExc_ValueError,
602                                 "end of string while looking for conversion "
603                                 "specifier");
604                 return 0;
605             }
606             *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
607 
608             if (str->start < str->end) {
609                 c = PyUnicode_READ_CHAR(str->str, str->start++);
610                 if (c == '}')
611                     return 1;
612                 if (c != ':') {
613                     PyErr_SetString(PyExc_ValueError,
614                                     "expected ':' after conversion specifier");
615                     return 0;
616                 }
617             }
618         }
619         format_spec->str = str->str;
620         format_spec->start = str->start;
621         count = 1;
622         while (str->start < str->end) {
623             switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
624             case '{':
625                 *format_spec_needs_expanding = 1;
626                 count++;
627                 break;
628             case '}':
629                 count--;
630                 if (count == 0) {
631                     format_spec->end = str->start - 1;
632                     return 1;
633                 }
634                 break;
635             default:
636                 break;
637             }
638         }
639 
640         PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
641         return 0;
642     }
643     else if (c != '}') {
644         PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
645         return 0;
646     }
647 
648     return 1;
649 }
650 
/************************************************************************/
/******* Output string allocation and escape-to-markup processing  ******/
/************************************************************************/
654 
655 /* MarkupIterator breaks the string into pieces of either literal
656    text, or things inside {} that need to be marked up.  it is
657    designed to make it easy to wrap a Python iterator around it, for
658    use with the Formatter class */
659 
660 typedef struct {
661     SubString str;
662 } MarkupIterator;
663 
664 static int
MarkupIterator_init(MarkupIterator * self,PyObject * str,Py_ssize_t start,Py_ssize_t end)665 MarkupIterator_init(MarkupIterator *self, PyObject *str,
666                     Py_ssize_t start, Py_ssize_t end)
667 {
668     SubString_init(&self->str, str, start, end);
669     return 1;
670 }
671 
672 /* returns 0 on error, 1 on non-error termination, and 2 if it got a
673    string (or something to be expanded) */
674 static int
MarkupIterator_next(MarkupIterator * self,SubString * literal,int * field_present,SubString * field_name,SubString * format_spec,Py_UCS4 * conversion,int * format_spec_needs_expanding)675 MarkupIterator_next(MarkupIterator *self, SubString *literal,
676                     int *field_present, SubString *field_name,
677                     SubString *format_spec, Py_UCS4 *conversion,
678                     int *format_spec_needs_expanding)
679 {
680     int at_end;
681     Py_UCS4 c = 0;
682     Py_ssize_t start;
683     Py_ssize_t len;
684     int markup_follows = 0;
685 
686     /* initialize all of the output variables */
687     SubString_init(literal, NULL, 0, 0);
688     SubString_init(field_name, NULL, 0, 0);
689     SubString_init(format_spec, NULL, 0, 0);
690     *conversion = '\0';
691     *format_spec_needs_expanding = 0;
692     *field_present = 0;
693 
694     /* No more input, end of iterator.  This is the normal exit
695        path. */
696     if (self->str.start >= self->str.end)
697         return 1;
698 
699     start = self->str.start;
700 
701     /* First read any literal text. Read until the end of string, an
702        escaped '{' or '}', or an unescaped '{'.  In order to never
703        allocate memory and so I can just pass pointers around, if
704        there's an escaped '{' or '}' then we'll return the literal
705        including the brace, but no format object.  The next time
706        through, we'll return the rest of the literal, skipping past
707        the second consecutive brace. */
708     while (self->str.start < self->str.end) {
709         switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
710         case '{':
711         case '}':
712             markup_follows = 1;
713             break;
714         default:
715             continue;
716         }
717         break;
718     }
719 
720     at_end = self->str.start >= self->str.end;
721     len = self->str.start - start;
722 
723     if ((c == '}') && (at_end ||
724                        (c != PyUnicode_READ_CHAR(self->str.str,
725                                                  self->str.start)))) {
726         PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
727                         "in format string");
728         return 0;
729     }
730     if (at_end && c == '{') {
731         PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
732                         "in format string");
733         return 0;
734     }
735     if (!at_end) {
736         if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
737             /* escaped } or {, skip it in the input.  there is no
738                markup object following us, just this literal text */
739             self->str.start++;
740             markup_follows = 0;
741         }
742         else
743             len--;
744     }
745 
746     /* record the literal text */
747     literal->str = self->str.str;
748     literal->start = start;
749     literal->end = start + len;
750 
751     if (!markup_follows)
752         return 2;
753 
754     /* this is markup; parse the field */
755     *field_present = 1;
756     if (!parse_field(&self->str, field_name, format_spec,
757                      format_spec_needs_expanding, conversion))
758         return 0;
759     return 2;
760 }
761 
762 
763 /* do the !r or !s conversion on obj */
764 static PyObject *
do_conversion(PyObject * obj,Py_UCS4 conversion)765 do_conversion(PyObject *obj, Py_UCS4 conversion)
766 {
767     /* XXX in pre-3.0, do we need to convert this to unicode, since it
768        might have returned a string? */
769     switch (conversion) {
770     case 'r':
771         return PyObject_Repr(obj);
772     case 's':
773         return PyObject_Str(obj);
774     case 'a':
775         return PyObject_ASCII(obj);
776     default:
777         if (conversion > 32 && conversion < 127) {
778                 /* It's the ASCII subrange; casting to char is safe
779                    (assuming the execution character set is an ASCII
780                    superset). */
781                 PyErr_Format(PyExc_ValueError,
782                      "Unknown conversion specifier %c",
783                      (char)conversion);
784         } else
785                 PyErr_Format(PyExc_ValueError,
786                      "Unknown conversion specifier \\x%x",
787                      (unsigned int)conversion);
788         return NULL;
789     }
790 }
791 
792 /* given:
793 
794    {field_name!conversion:format_spec}
795 
796    compute the result and write it to output.
797    format_spec_needs_expanding is an optimization.  if it's false,
798    just output the string directly, otherwise recursively expand the
799    format_spec string.
800 
801    field_name is allowed to be zero length, in which case we
802    are doing auto field numbering.
803 */
804 
805 static int
output_markup(SubString * field_name,SubString * format_spec,int format_spec_needs_expanding,Py_UCS4 conversion,_PyUnicodeWriter * writer,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)806 output_markup(SubString *field_name, SubString *format_spec,
807               int format_spec_needs_expanding, Py_UCS4 conversion,
808               _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
809               int recursion_depth, AutoNumber *auto_number)
810 {
811     PyObject *tmp = NULL;
812     PyObject *fieldobj = NULL;
813     SubString expanded_format_spec;
814     SubString *actual_format_spec;
815     int result = 0;
816 
817     /* convert field_name to an object */
818     fieldobj = get_field_object(field_name, args, kwargs, auto_number);
819     if (fieldobj == NULL)
820         goto done;
821 
822     if (conversion != '\0') {
823         tmp = do_conversion(fieldobj, conversion);
824         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
825             goto done;
826 
827         /* do the assignment, transferring ownership: fieldobj = tmp */
828         Py_DECREF(fieldobj);
829         fieldobj = tmp;
830         tmp = NULL;
831     }
832 
833     /* if needed, recursively compute the format_spec */
834     if (format_spec_needs_expanding) {
835         tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
836                            auto_number);
837         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
838             goto done;
839 
840         /* note that in the case we're expanding the format string,
841            tmp must be kept around until after the call to
842            render_field. */
843         SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
844         actual_format_spec = &expanded_format_spec;
845     }
846     else
847         actual_format_spec = format_spec;
848 
849     if (render_field(fieldobj, actual_format_spec, writer) == 0)
850         goto done;
851 
852     result = 1;
853 
854 done:
855     Py_XDECREF(fieldobj);
856     Py_XDECREF(tmp);
857 
858     return result;
859 }
860 
861 /*
862     do_markup is the top-level loop for the format() method.  It
863     searches through the format string for escapes to markup codes, and
864     calls other functions to move non-markup text to the output,
865     and to perform the markup to the output.
866 */
867 static int
do_markup(SubString * input,PyObject * args,PyObject * kwargs,_PyUnicodeWriter * writer,int recursion_depth,AutoNumber * auto_number)868 do_markup(SubString *input, PyObject *args, PyObject *kwargs,
869           _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
870 {
871     MarkupIterator iter;
872     int format_spec_needs_expanding;
873     int result;
874     int field_present;
875     SubString literal;
876     SubString field_name;
877     SubString format_spec;
878     Py_UCS4 conversion;
879 
880     MarkupIterator_init(&iter, input->str, input->start, input->end);
881     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
882                                          &field_name, &format_spec,
883                                          &conversion,
884                                          &format_spec_needs_expanding)) == 2) {
885         if (literal.end != literal.start) {
886             if (!field_present && iter.str.start == iter.str.end)
887                 writer->overallocate = 0;
888             if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
889                                                 literal.start, literal.end) < 0)
890                 return 0;
891         }
892 
893         if (field_present) {
894             if (iter.str.start == iter.str.end)
895                 writer->overallocate = 0;
896             if (!output_markup(&field_name, &format_spec,
897                                format_spec_needs_expanding, conversion, writer,
898                                args, kwargs, recursion_depth, auto_number))
899                 return 0;
900         }
901     }
902     return result;
903 }
904 
905 
906 /*
907     build_string allocates the output string and then
908     calls do_markup to do the heavy lifting.
909 */
910 static PyObject *
build_string(SubString * input,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)911 build_string(SubString *input, PyObject *args, PyObject *kwargs,
912              int recursion_depth, AutoNumber *auto_number)
913 {
914     _PyUnicodeWriter writer;
915 
916     /* check the recursion level */
917     if (recursion_depth <= 0) {
918         PyErr_SetString(PyExc_ValueError,
919                         "Max string recursion exceeded");
920         return NULL;
921     }
922 
923     _PyUnicodeWriter_Init(&writer);
924     writer.overallocate = 1;
925     writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
926 
927     if (!do_markup(input, args, kwargs, &writer, recursion_depth,
928                    auto_number)) {
929         _PyUnicodeWriter_Dealloc(&writer);
930         return NULL;
931     }
932 
933     return _PyUnicodeWriter_Finish(&writer);
934 }
935 
936 /************************************************************************/
937 /*********** main routine ***********************************************/
938 /************************************************************************/
939 
940 /* this is the main entry point */
941 static PyObject *
do_string_format(PyObject * self,PyObject * args,PyObject * kwargs)942 do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
943 {
944     SubString input;
945 
946     /* PEP 3101 says only 2 levels, so that
947        "{0:{1}}".format('abc', 's')            # works
948        "{0:{1:{2}}}".format('abc', 's', '')    # fails
949     */
950     int recursion_depth = 2;
951 
952     AutoNumber auto_number;
953 
954     if (PyUnicode_READY(self) == -1)
955         return NULL;
956 
957     AutoNumber_Init(&auto_number);
958     SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
959     return build_string(&input, args, kwargs, recursion_depth, &auto_number);
960 }
961 
962 static PyObject *
do_string_format_map(PyObject * self,PyObject * obj)963 do_string_format_map(PyObject *self, PyObject *obj)
964 {
965     return do_string_format(self, NULL, obj);
966 }
967 
968 
969 /************************************************************************/
970 /*********** formatteriterator ******************************************/
971 /************************************************************************/
972 
/* This is used to implement string.Formatter.parse().  It exists so
   Formatter can share code with the built-in str.format() method.
   It's really just a wrapper around MarkupIterator that is callable
   from Python. */
977 
/* Python-level iterator object wrapping a MarkupIterator. */
typedef struct {
    PyObject_HEAD
    /* the string being parsed; formatter_parser() takes a reference
       to it and formatteriter_dealloc() releases that reference */
    PyObject *str;
    /* parser state, initialized over all of `str` by
       formatter_parser() */
    MarkupIterator it_markup;
} formatteriterobject;
983 
984 static void
formatteriter_dealloc(formatteriterobject * it)985 formatteriter_dealloc(formatteriterobject *it)
986 {
987     Py_XDECREF(it->str);
988     PyObject_Free(it);
989 }
990 
991 /* returns a tuple:
992    (literal, field_name, format_spec, conversion)
993 
994    literal is any literal text to output.  might be zero length
995    field_name is the string before the ':'.  might be None
996    format_spec is the string after the ':'.  mibht be None
997    conversion is either None, or the string after the '!'
998 */
999 static PyObject *
formatteriter_next(formatteriterobject * it)1000 formatteriter_next(formatteriterobject *it)
1001 {
1002     SubString literal;
1003     SubString field_name;
1004     SubString format_spec;
1005     Py_UCS4 conversion;
1006     int format_spec_needs_expanding;
1007     int field_present;
1008     int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1009                                      &field_name, &format_spec, &conversion,
1010                                      &format_spec_needs_expanding);
1011 
1012     /* all of the SubString objects point into it->str, so no
1013        memory management needs to be done on them */
1014     assert(0 <= result && result <= 2);
1015     if (result == 0 || result == 1)
1016         /* if 0, error has already been set, if 1, iterator is empty */
1017         return NULL;
1018     else {
1019         PyObject *literal_str = NULL;
1020         PyObject *field_name_str = NULL;
1021         PyObject *format_spec_str = NULL;
1022         PyObject *conversion_str = NULL;
1023         PyObject *tuple = NULL;
1024 
1025         literal_str = SubString_new_object(&literal);
1026         if (literal_str == NULL)
1027             goto done;
1028 
1029         field_name_str = SubString_new_object(&field_name);
1030         if (field_name_str == NULL)
1031             goto done;
1032 
1033         /* if field_name is non-zero length, return a string for
1034            format_spec (even if zero length), else return None */
1035         format_spec_str = (field_present ?
1036                            SubString_new_object_or_empty :
1037                            SubString_new_object)(&format_spec);
1038         if (format_spec_str == NULL)
1039             goto done;
1040 
1041         /* if the conversion is not specified, return a None,
1042            otherwise create a one length string with the conversion
1043            character */
1044         if (conversion == '\0') {
1045             conversion_str = Py_None;
1046             Py_INCREF(conversion_str);
1047         }
1048         else
1049             conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1050                                                        &conversion, 1);
1051         if (conversion_str == NULL)
1052             goto done;
1053 
1054         tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1055                              conversion_str);
1056     done:
1057         Py_XDECREF(literal_str);
1058         Py_XDECREF(field_name_str);
1059         Py_XDECREF(format_spec_str);
1060         Py_XDECREF(conversion_str);
1061         return tuple;
1062     }
1063 }
1064 
1065 static PyMethodDef formatteriter_methods[] = {
1066     {NULL,              NULL}           /* sentinel */
1067 };
1068 
1069 static PyTypeObject PyFormatterIter_Type = {
1070     PyVarObject_HEAD_INIT(&PyType_Type, 0)
1071     "formatteriterator",                /* tp_name */
1072     sizeof(formatteriterobject),        /* tp_basicsize */
1073     0,                                  /* tp_itemsize */
1074     /* methods */
1075     (destructor)formatteriter_dealloc,  /* tp_dealloc */
1076     0,                                  /* tp_vectorcall_offset */
1077     0,                                  /* tp_getattr */
1078     0,                                  /* tp_setattr */
1079     0,                                  /* tp_as_async */
1080     0,                                  /* tp_repr */
1081     0,                                  /* tp_as_number */
1082     0,                                  /* tp_as_sequence */
1083     0,                                  /* tp_as_mapping */
1084     0,                                  /* tp_hash */
1085     0,                                  /* tp_call */
1086     0,                                  /* tp_str */
1087     PyObject_GenericGetAttr,            /* tp_getattro */
1088     0,                                  /* tp_setattro */
1089     0,                                  /* tp_as_buffer */
1090     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1091     0,                                  /* tp_doc */
1092     0,                                  /* tp_traverse */
1093     0,                                  /* tp_clear */
1094     0,                                  /* tp_richcompare */
1095     0,                                  /* tp_weaklistoffset */
1096     PyObject_SelfIter,                  /* tp_iter */
1097     (iternextfunc)formatteriter_next,   /* tp_iternext */
1098     formatteriter_methods,              /* tp_methods */
1099     0,
1100 };
1101 
1102 /* unicode_formatter_parser is used to implement
1103    string.Formatter.vformat.  it parses a string and returns tuples
1104    describing the parsed elements.  It's a wrapper around
1105    stringlib/string_format.h's MarkupIterator */
1106 static PyObject *
formatter_parser(PyObject * ignored,PyObject * self)1107 formatter_parser(PyObject *ignored, PyObject *self)
1108 {
1109     formatteriterobject *it;
1110 
1111     if (!PyUnicode_Check(self)) {
1112         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1113         return NULL;
1114     }
1115 
1116     if (PyUnicode_READY(self) == -1)
1117         return NULL;
1118 
1119     it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1120     if (it == NULL)
1121         return NULL;
1122 
1123     /* take ownership, give the object to the iterator */
1124     Py_INCREF(self);
1125     it->str = self;
1126 
1127     /* initialize the contained MarkupIterator */
1128     MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1129     return (PyObject *)it;
1130 }
1131 
1132 
1133 /************************************************************************/
1134 /*********** fieldnameiterator ******************************************/
1135 /************************************************************************/
1136 
1137 
/* This is used to implement string.Formatter.get_field().  It parses the
   field name into attribute and item values.  It's a Python-callable
   wrapper around FieldNameIterator */
1141 
/* Python-level iterator object wrapping a FieldNameIterator. */
typedef struct {
    PyObject_HEAD
    /* the field name being parsed; formatter_field_name_split() takes
       a reference to keep it alive and fieldnameiter_dealloc()
       releases that reference */
    PyObject *str;
    /* parser state, filled in by field_name_split() */
    FieldNameIterator it_field;
} fieldnameiterobject;
1147 
1148 static void
fieldnameiter_dealloc(fieldnameiterobject * it)1149 fieldnameiter_dealloc(fieldnameiterobject *it)
1150 {
1151     Py_XDECREF(it->str);
1152     PyObject_Free(it);
1153 }
1154 
1155 /* returns a tuple:
1156    (is_attr, value)
1157    is_attr is true if we used attribute syntax (e.g., '.foo')
1158               false if we used index syntax (e.g., '[foo]')
1159    value is an integer or string
1160 */
1161 static PyObject *
fieldnameiter_next(fieldnameiterobject * it)1162 fieldnameiter_next(fieldnameiterobject *it)
1163 {
1164     int result;
1165     int is_attr;
1166     Py_ssize_t idx;
1167     SubString name;
1168 
1169     result = FieldNameIterator_next(&it->it_field, &is_attr,
1170                                     &idx, &name);
1171     if (result == 0 || result == 1)
1172         /* if 0, error has already been set, if 1, iterator is empty */
1173         return NULL;
1174     else {
1175         PyObject* result = NULL;
1176         PyObject* is_attr_obj = NULL;
1177         PyObject* obj = NULL;
1178 
1179         is_attr_obj = PyBool_FromLong(is_attr);
1180         if (is_attr_obj == NULL)
1181             goto done;
1182 
1183         /* either an integer or a string */
1184         if (idx != -1)
1185             obj = PyLong_FromSsize_t(idx);
1186         else
1187             obj = SubString_new_object(&name);
1188         if (obj == NULL)
1189             goto done;
1190 
1191         /* return a tuple of values */
1192         result = PyTuple_Pack(2, is_attr_obj, obj);
1193 
1194     done:
1195         Py_XDECREF(is_attr_obj);
1196         Py_XDECREF(obj);
1197         return result;
1198     }
1199 }
1200 
1201 static PyMethodDef fieldnameiter_methods[] = {
1202     {NULL,              NULL}           /* sentinel */
1203 };
1204 
1205 static PyTypeObject PyFieldNameIter_Type = {
1206     PyVarObject_HEAD_INIT(&PyType_Type, 0)
1207     "fieldnameiterator",                /* tp_name */
1208     sizeof(fieldnameiterobject),        /* tp_basicsize */
1209     0,                                  /* tp_itemsize */
1210     /* methods */
1211     (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
1212     0,                                  /* tp_vectorcall_offset */
1213     0,                                  /* tp_getattr */
1214     0,                                  /* tp_setattr */
1215     0,                                  /* tp_as_async */
1216     0,                                  /* tp_repr */
1217     0,                                  /* tp_as_number */
1218     0,                                  /* tp_as_sequence */
1219     0,                                  /* tp_as_mapping */
1220     0,                                  /* tp_hash */
1221     0,                                  /* tp_call */
1222     0,                                  /* tp_str */
1223     PyObject_GenericGetAttr,            /* tp_getattro */
1224     0,                                  /* tp_setattro */
1225     0,                                  /* tp_as_buffer */
1226     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1227     0,                                  /* tp_doc */
1228     0,                                  /* tp_traverse */
1229     0,                                  /* tp_clear */
1230     0,                                  /* tp_richcompare */
1231     0,                                  /* tp_weaklistoffset */
1232     PyObject_SelfIter,                  /* tp_iter */
1233     (iternextfunc)fieldnameiter_next,   /* tp_iternext */
1234     fieldnameiter_methods,              /* tp_methods */
1235     0};
1236 
1237 /* unicode_formatter_field_name_split is used to implement
1238    string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1239    returns a tuple of (first, rest): "first", the part before the
1240    first '.' or '['; and "rest", an iterator for the rest of the field
1241    name.  it's a wrapper around stringlib/string_format.h's
1242    field_name_split.  The iterator it returns is a
1243    FieldNameIterator */
1244 static PyObject *
formatter_field_name_split(PyObject * ignored,PyObject * self)1245 formatter_field_name_split(PyObject *ignored, PyObject *self)
1246 {
1247     SubString first;
1248     Py_ssize_t first_idx;
1249     fieldnameiterobject *it;
1250 
1251     PyObject *first_obj = NULL;
1252     PyObject *result = NULL;
1253 
1254     if (!PyUnicode_Check(self)) {
1255         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1256         return NULL;
1257     }
1258 
1259     if (PyUnicode_READY(self) == -1)
1260         return NULL;
1261 
1262     it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1263     if (it == NULL)
1264         return NULL;
1265 
1266     /* take ownership, give the object to the iterator.  this is
1267        just to keep the field_name alive */
1268     Py_INCREF(self);
1269     it->str = self;
1270 
1271     /* Pass in auto_number = NULL. We'll return an empty string for
1272        first_obj in that case. */
1273     if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1274                           &first, &first_idx, &it->it_field, NULL))
1275         goto done;
1276 
1277     /* first becomes an integer, if possible; else a string */
1278     if (first_idx != -1)
1279         first_obj = PyLong_FromSsize_t(first_idx);
1280     else
1281         /* convert "first" into a string object */
1282         first_obj = SubString_new_object(&first);
1283     if (first_obj == NULL)
1284         goto done;
1285 
1286     /* return a tuple of values */
1287     result = PyTuple_Pack(2, first_obj, it);
1288 
1289 done:
1290     Py_XDECREF(it);
1291     Py_XDECREF(first_obj);
1292     return result;
1293 }
1294