# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""LR(1) parser generator.

The primary class in this module, Grammar, takes a list of context-free grammar
productions, and produces the corresponding LR(1) shift-reduce parser.  This is
an implementation of the algorithm on pages 221 and 261-265 of "Compilers:
Principles, Techniques, & Tools" (Second Edition) by Aho, Lam, Sethi, and
Ullman, also known as "The Dragon Book," hereafter referred to as "ALSU."

This module only implements the LR(1) algorithms; unlike tools such as yacc, it
does not implement the various bits of glue necessary to actually use a parser.
Clients are expected to provide their own tokenizers and handle turning a raw
parse tree into an intermediate representation on their own.
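
A minimal usage sketch (illustrative only; it assumes a grammar with the
single production "S -> a b", and uses this module's Symbol tuple as a
stand-in for a client-supplied token type with a .symbol attribute):

  grammar = Grammar("S", [parser_types.Production("S", ("a", "b"))])
  parser = grammar.parser()
  result = parser.parse([Symbol("a"), Symbol("b")])
  # On success, result.parse_tree is a Reduction and result.error is None.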
"""

import collections

from compiler.util import parser_types


class Item(collections.namedtuple("Item", ["production", "dot", "terminal",
                                           "next_symbol"])):
  """An Item is an LR(1) Item: a production, a cursor location, and a terminal.

  An Item represents a partially-parsed production, and a lookahead symbol.  The
  position of the dot indicates what portion of the production has been parsed.
  Generally, Items are an internal implementation detail, but they can be useful
  elsewhere, particularly for debugging.

  Attributes:
    production: The Production this Item covers.
    dot: The index of the "dot" in production's rhs.
    terminal: The terminal lookahead symbol that follows the production in the
        input stream.
    next_symbol: The symbol immediately after the dot in production's rhs, or
        None if the dot is at the end of the rhs (computed by make_item).
  """

  def __str__(self):
    """__str__ generates ALSU notation."""
    return (str(self.production.lhs) + " -> " + " ".join(
        [str(r) for r in self.production.rhs[0:self.dot] + (".",) +
         self.production.rhs[self.dot:]]) + ", " + str(self.terminal))

  @staticmethod
  def parse(text):
    """Parses an Item in ALSU notation.

    Parses an Item from notation like:

       symbol -> foo . bar baz, qux

    where "symbol -> foo bar baz" will be taken as the production, the position
    of the "." is taken as "dot" (in this case 1), and the symbol after "," is
    taken as the "terminal".  The following are also valid items:

       sym -> ., foo
       sym -> . foo bar, baz
       sym -> foo bar ., baz

    Symbols on the right-hand side of the production should be separated by
    whitespace.

    Arguments:
      text: The text to parse into an Item.

    Returns:
      An Item.
    """
    production, terminal = text.split(",")
    terminal = terminal.strip()
    if terminal == "$":
      terminal = END_OF_INPUT
    lhs, rhs = production.split("->")
    lhs = lhs.strip()
    if lhs == "S'":
      lhs = START_PRIME
    before_dot, after_dot = rhs.split(".")
    handle = before_dot.split()
    tail = after_dot.split()
    return make_item(parser_types.Production(lhs, tuple(handle + tail)),
                     len(handle), terminal)


def make_item(production, dot, symbol):
  """Constructs an Item, precomputing its next_symbol field."""
  return Item(production, dot, symbol,
              None if dot >= len(production.rhs) else production.rhs[dot])


class Conflict(
    collections.namedtuple("Conflict", ["state", "symbol", "actions"])
):
  """Conflict represents a parse conflict."""

  def __str__(self):
    return "Conflict for {} in state {}: ".format(
        self.symbol, self.state) + " vs ".join([str(a) for a in self.actions])


Shift = collections.namedtuple("Shift", ["state", "items"])
Reduce = collections.namedtuple("Reduce", ["rule"])
Accept = collections.namedtuple("Accept", [])
Error = collections.namedtuple("Error", ["code"])

Symbol = collections.namedtuple("Symbol", ["symbol"])

# START_PRIME is the implicit 'real' root symbol for the grammar.
START_PRIME = "S'"

# END_OF_INPUT is the implicit symbol at the end of input.
END_OF_INPUT = "$"

# ANY_TOKEN is used by mark_error as a "wildcard" token that should be replaced
# by every other token.
ANY_TOKEN = parser_types.Token(object(), "*",
                               parser_types.parse_location("0:0-0:0"))


class Reduction(collections.namedtuple("Reduction",
                                       ["symbol", "children", "production",
                                        "source_location"])):
  """A Reduction is a non-leaf node in a parse tree.

  Attributes:
    symbol: The name of this element in the parse.
    children: The child elements of this parse.
    production: The grammar production to which this reduction corresponds.
    source_location: If known, the range in the source text corresponding to the
      tokens from which this reduction was parsed.  May be 'None' if this
      reduction was produced from no symbols, or if the tokens fed to `parse`
      did not include source_location.
  """


class Grammar(object):
  """Grammar is an LR(1) context-free grammar.

  Attributes:
    start: The start symbol for the grammar.
    productions: A list of productions in the grammar, including the S' -> start
      production.
    symbols: A set of all symbols in the grammar, including $ and S'.
    nonterminals: A set of all nonterminal symbols in the grammar, including S'.
    terminals: A set of all terminal symbols in the grammar, including $.
  """

  def __init__(self, start_symbol, productions):
    """Constructs a Grammar object.

    Arguments:
      start_symbol: The start symbol for the grammar.
      productions: A list of productions (not including the "S' -> start_symbol"
          production).
    """
    object.__init__(self)
    self.start = start_symbol
    self._seed_production = parser_types.Production(START_PRIME, (self.start,))
    self.productions = productions + [self._seed_production]

    self._single_level_closure_of_item_cache = {}
    self._closure_of_item_cache = {}
    self._compute_symbols()
    self._compute_seed_firsts()
    self._set_productions_by_lhs()
    self._populate_item_cache()

  def _set_productions_by_lhs(self):
    # Prepopulating _productions_by_lhs speeds up _closure_of_item by about 30%,
    # which is significant on medium-to-large grammars.
    self._productions_by_lhs = {}
    for production in self.productions:
      self._productions_by_lhs.setdefault(production.lhs, list()).append(
          production)

  def _populate_item_cache(self):
    # There are a relatively small number of possible Items for a grammar, and
    # the algorithm needs to get Items from their constituent components very
    # frequently.  As it turns out, pre-caching all possible Items results in a
    # ~35% overall speedup to Grammar.parser().
    self._item_cache = {}
    for symbol in self.terminals:
      for production in self.productions:
        for dot in range(len(production.rhs) + 1):
          self._item_cache[production, dot, symbol] = make_item(
              production, dot, symbol)

  def _compute_symbols(self):
    """Finds all grammar symbols, and sorts them into terminal and non-terminal.

    Nonterminal symbols are those which appear on the left side of any
    production.  Terminal symbols are those which do not.

    _compute_symbols is used during __init__.
    """
    self.symbols = {END_OF_INPUT}
    self.nonterminals = set()
    for production in self.productions:
      self.symbols.add(production.lhs)
      self.nonterminals.add(production.lhs)
      for symbol in production.rhs:
        self.symbols.add(symbol)
    self.terminals = self.symbols - self.nonterminals

  def _compute_seed_firsts(self):
    """Computes FIRST (ALSU p221) for all terminal and nonterminal symbols.

    The algorithm for computing FIRST is an iterative one that terminates when
    it reaches a fixed point (that is, when further iterations stop changing
    state).  _compute_seed_firsts computes the fixed point for all single-symbol
    strings, by repeatedly calling _first and updating the internal firsts
    table with the results.

    Once _compute_seed_firsts has completed, _first will return correct results
    for both single- and multi-symbol strings.

    _compute_seed_firsts is used during __init__.
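
    For example (an illustrative sketch, not from the original text): given
    the productions "S -> A x", "A -> y", and "A ->" (an empty right-hand
    side), the fixed point yields firsts["A"] == {"y", None} (None standing
    in for epsilon) and firsts["S"] == {"y", "x"}.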
    """
    self.firsts = {}
    # FIRST for a terminal symbol is always just that terminal symbol.
    for terminal in self.terminals:
      self.firsts[terminal] = set([terminal])
    for nonterminal in self.nonterminals:
      self.firsts[nonterminal] = set()
    while True:
      # The first iteration picks up all the productions that start with
      # terminal symbols.  The second iteration picks up productions that start
      # with nonterminals that the first iteration picked up.  The third
      # iteration picks up nonterminals that the first and second picked up, and
      # so on.
      #
      # This is guaranteed to end, in the worst case, when every terminal
      # symbol and epsilon has been added to the firsts set for every
      # nonterminal symbol.  This would be slow, but requires a pathological
      # grammar; useful grammars should complete in only a few iterations.
      firsts_to_add = {}
      for production in self.productions:
        for first in self._first(production.rhs):
          if first not in self.firsts[production.lhs]:
            if production.lhs not in firsts_to_add:
              firsts_to_add[production.lhs] = set()
            firsts_to_add[production.lhs].add(first)
      if not firsts_to_add:
        break
      for symbol in firsts_to_add:
        self.firsts[symbol].update(firsts_to_add[symbol])

  def _first(self, symbols):
    """The FIRST function from ALSU p221.

    _first takes a string of symbols (both terminals and nonterminals) and
    returns the set of terminal symbols which could be the first terminal symbol
    of a string produced by the given list of symbols.

    _first will not give fully-correct results until _compute_seed_firsts
    finishes, but is called by _compute_seed_firsts, and must provide partial
    results during that method's execution.

    Args:
      symbols: A list of symbols.

    Returns:
      A set of terminals which could be the first terminal in "symbols."  The
      set may also contain None (epsilon) if every symbol in "symbols" can
      produce the empty string.
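
      For example (an illustrative sketch): with productions "A -> x",
      "A -> B y", and "B -> z", _first(("A",)) would return {"x", "z"} once
      _compute_seed_firsts has finished.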
    """
    result = set()
    all_contain_epsilon = True
    for symbol in symbols:
      for first in self.firsts[symbol]:
        if first:
          result.add(first)
      if None not in self.firsts[symbol]:
        all_contain_epsilon = False
        break
    if all_contain_epsilon:
      # "None" seems like a Pythonic way of representing epsilon (no symbol).
      result.add(None)
    return result

  def _closure_of_item(self, root_item):
    """Modified implementation of CLOSURE from ALSU p261.

    _closure_of_item performs the CLOSURE function with a single seed item, with
    memoization.  In the algorithm as presented in ALSU, CLOSURE is called with
    a different set of items every time, which is unhelpful for memoization.
    Instead, we let _parallel_goto merge the sets returned by _closure_of_item,
    which results in a ~40% speedup.

    CLOSURE, roughly, computes the set of LR(1) Items which might be active when
    a "seed" set of Items is active.

    Technically, it is the epsilon-closure of the NFA states represented by
    "items," where an epsilon transition (a transition that does not consume any
    symbols) occurs from a->Z.bY,q to b->.X,p when p is in FIRST(Yq).  (a and b
    are nonterminals, X, Y, and Z are arbitrary strings of symbols, and p and q
    are terminals.)  That is, it is the set of all NFA states which can be
    reached from "items" without consuming any input.  This set corresponds to a
    single DFA state.
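
    For example (an illustrative sketch): with productions "S -> a B" and
    "B -> b", the closure of the item "S -> a . B, $" contains the seed item
    itself plus "B -> . b, $", because FIRST of the (empty) remainder of the
    rhs followed by the lookahead "$" is {"$"}.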

    Args:
      root_item: The initial LR(1) Item.

    Returns:
      A set of LR(1) items which may be active at the time when the provided
      item is active.
    """
    if root_item in self._closure_of_item_cache:
      return self._closure_of_item_cache[root_item]
    item_set = set([root_item])
    item_list = [root_item]
    i = 0
    # Each newly-added Item may trigger the addition of further Items, so
    # iterate until no new Items are added.  In the worst case, a new Item will
    # be added for each production.
    #
    # This algorithm is really looking for "next" nonterminals in the existing
    # items, and adding new items corresponding to their productions.
    while i < len(item_list):
      item = item_list[i]
      i += 1
      if not item.next_symbol:
        continue
      # If _closure_of_item_cache contains the full closure of item, then we can
      # add its full closure to the result set, and skip checking any of its
      # items: any item that would be added by any item in the cached result
      # will already be in the _closure_of_item_cache entry.
      if item in self._closure_of_item_cache:
        item_set |= self._closure_of_item_cache[item]
        continue
      # Even if we don't have the full closure of item, we may have the
      # immediate closure of item.  It turns out that memoizing just this step
      # speeds up this function by about 50%, even after the
      # _closure_of_item_cache check.
      if item not in self._single_level_closure_of_item_cache:
        new_items = set()
        for production in self._productions_by_lhs.get(item.next_symbol, []):
          for terminal in self._first(item.production.rhs[item.dot + 1:] +
                                      (item.terminal,)):
            new_items.add(self._item_cache[production, 0, terminal])
        self._single_level_closure_of_item_cache[item] = new_items
      for new_item in self._single_level_closure_of_item_cache[item]:
        if new_item not in item_set:
          item_set.add(new_item)
          item_list.append(new_item)
    self._closure_of_item_cache[root_item] = item_set
    # Typically, _closure_of_item() will be called on items whose closures
    # bring in the greatest number of additional items, then on items which
    # close over fewer and fewer other items.  Since items are not added to
    # _closure_of_item_cache unless _closure_of_item() is called directly on
    # them, this means that it is unlikely that items brought in will (without
    # intervention) have entries in _closure_of_item_cache, which slows down the
    # computation of the larger closures.
    #
    # Although it is not guaranteed, items added to item_list last will tend to
    # close over fewer items, and therefore be easier to compute.  By forcibly
    # re-calculating closures from last to first, and adding the results to
    # _closure_of_item_cache at each step, we get a modest performance
    # improvement: roughly 50% less time spent in _closure_of_item, which
    # translates to about 5% less time in parser().
    for item in item_list[::-1]:
      self._closure_of_item(item)
    return item_set

  def _parallel_goto(self, items):
    """The GOTO function from ALSU p261, executed on all symbols.

    _parallel_goto takes a set of Items, and returns a dict from every symbol in
    self.symbols to the set of Items that would be active after a shift
    operation (if symbol is a terminal) or after a reduction operation (if
    symbol is a nonterminal).

    _parallel_goto is used in lieu of the single-symbol GOTO from ALSU because
    it eliminates the outer loop over self.terminals, and thereby reduces the
    number of next_symbol calls by a factor of len(self.terminals).

    Args:
      items: The set of items representing the initial DFA state.

    Returns:
      A dict from symbols to sets of items representing the new DFA states.
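
      For example (an illustrative sketch): with productions "S -> a B" and
      "B -> b", calling _parallel_goto on the item set {"S -> . a B, $"}
      would map "a" to the closure of "S -> a . B, $" (which also contains
      "B -> . b, $"); no other symbol would be mapped.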
    """
    results = collections.defaultdict(set)
    for item in items:
      next_symbol = item.next_symbol
      if next_symbol is None:
        continue
      item = self._item_cache[item.production, item.dot + 1, item.terminal]
      # Inlining the cache check results in a ~25% speedup in this function, and
      # about 10% overall speedup to parser().
      if item in self._closure_of_item_cache:
        closure = self._closure_of_item_cache[item]
      else:
        closure = self._closure_of_item(item)
      # _closure_of_item will add newly-started Items (Items with dot=0) to the
      # result set.  After this operation, the result set will correspond to
      # the new state.
      results[next_symbol].update(closure)
    return results

  def _items(self):
    """The items function from ALSU p261.

    _items computes the set of sets of LR(1) items for a shift-reduce parser
    that matches the grammar.  Each set of LR(1) items corresponds to a single
    DFA state.

    Returns:
      A tuple.

      The first element of the tuple is a list of sets of LR(1) items (each set
      corresponding to a DFA state).

      The second element of the tuple is a dictionary from (int, symbol) pairs
      to ints, where all the ints are indexes into the list of sets of LR(1)
      items.  This dictionary is based on the results of the GOTO function
      (computed here by _parallel_goto), where
      item_sets[goto_table[i, sym]] == GOTO(item_sets[i], sym).
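
      For example (an illustrative sketch): for the single production
      "S -> a", _items produces three states: the closure of "S' -> . S, $"
      (which also contains "S -> . a, $"), the set {"S' -> S ., $"}, and the
      set {"S -> a ., $"}.  The numbering of the non-initial states depends
      on iteration order.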
    """
    # The list of states is seeded with the marker S' production.
    item_list = [
        frozenset(self._closure_of_item(
            self._item_cache[self._seed_production, 0, END_OF_INPUT]))
    ]
    items = {item_list[0]: 0}
    goto_table = {}
    i = 0
    # For each state, figure out what the new state is when each symbol is
    # added to the top of the parsing stack (see the comments in
    # Parser._parse).  See _parallel_goto for an explanation of how that is
    # actually computed.
    while i < len(item_list):
      item_set = item_list[i]
      gotos = self._parallel_goto(item_set)
      for symbol, goto in gotos.items():
        goto = frozenset(goto)
        if goto not in items:
          items[goto] = len(item_list)
          item_list.append(goto)
        goto_table[i, symbol] = items[goto]
      i += 1
    return item_list, goto_table

  def parser(self):
    """parser returns an LR(1) parser for the Grammar.

    This implements the Canonical LR(1) ("LR(1)") parser algorithm ("Algorithm
    4.56", ALSU p265), rather than the more common Lookahead LR(1) ("LALR(1)")
    algorithm.  LALR(1) produces smaller tables, but is more complex and does
    not cover all LR(1) grammars.  When the LR(1) and LALR(1) algorithms were
    invented, table sizes were an important consideration; now, the difference
    between a few hundred and a few thousand entries is unlikely to matter.

    At this time, Grammar does not handle ambiguous grammars, which are commonly
    used to handle precedence, associativity, and the "dangling else" problem.
    Formally, these can always be handled by an unambiguous grammar, though
    doing so can be cumbersome, particularly for expression languages with many
    levels of precedence.  ALSU section 4.8 (pp278-287) contains some techniques
    for handling these kinds of ambiguity.
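
    A minimal usage sketch (illustrative only):

      parser = Grammar("S", [parser_types.Production("S", ("a",))]).parser()
      assert not parser.conflicts  # This grammar is unambiguous.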

    Returns:
      A Parser.
    """
    item_sets, goto = self._items()
    action = {}
    conflicts = set()
    end_item = self._item_cache[self._seed_production, 1, END_OF_INPUT]
    for i in range(len(item_sets)):
      for item in item_sets[i]:
        new_action = None
        if (item.next_symbol is None and
            item.production != self._seed_production):
          terminal = item.terminal
          new_action = Reduce(item.production)
        elif item.next_symbol in self.terminals:
          terminal = item.next_symbol
          assert goto[i, terminal] is not None
          new_action = Shift(goto[i, terminal], item_sets[goto[i, terminal]])
        if new_action:
          if (i, terminal) in action and action[i, terminal] != new_action:
            conflicts.add(
                Conflict(i, terminal,
                         frozenset([action[i, terminal], new_action])))
          action[i, terminal] = new_action
        if item == end_item:
          new_action = Accept()
          assert (i, END_OF_INPUT
                 ) not in action or action[i, END_OF_INPUT] == new_action
          action[i, END_OF_INPUT] = new_action
    trimmed_goto = {}
    for k in goto:
      if k[1] in self.nonterminals:
        trimmed_goto[k] = goto[k]
    expected = {}
    for state, terminal in action:
      if state not in expected:
        expected[state] = set()
      expected[state].add(terminal)
    return Parser(item_sets, trimmed_goto, action, expected, conflicts,
                  self.terminals, self.nonterminals, self.productions)


ParseError = collections.namedtuple("ParseError", ["code", "index", "token",
                                                   "state", "expected_tokens"])
ParseResult = collections.namedtuple("ParseResult", ["parse_tree", "error"])


class Parser(object):
  """Parser is a shift-reduce LR(1) parser.

  Generally, clients will want to get a Parser from a Grammar, rather than
  directly instantiating one.

  Parser exposes the raw tables needed to feed into a shift-reduce parser,
  but can also be used directly for parsing.

  Attributes:
    item_sets: A list of item sets which correspond to the state numbers in
      the action and goto tables.  This is not necessary for parsing, but is
      useful for debugging parsers.
    goto: The GOTO table for this parser.
    action: The ACTION table for this parser.
    expected: A table of terminal symbols that are expected (that is, that
      have a non-Error action) for each state.  This can be used to provide
      more helpful error messages for parse errors.
    conflicts: A set of unresolved conflicts found during table generation.
    terminals: A set of terminal symbols in the grammar.
    nonterminals: A set of nonterminal symbols in the grammar.
    productions: A list of productions in the grammar.
    default_errors: A dict of states to default error codes to use when
      encountering an error in that state, when a more-specific Error for the
      state/terminal pair has not been set.
  """

  def __init__(self, item_sets, goto, action, expected, conflicts, terminals,
               nonterminals, productions):
    super(Parser, self).__init__()
    self.item_sets = item_sets
    self.goto = goto
    self.action = action
    self.expected = expected
    self.conflicts = conflicts
    self.terminals = terminals
    self.nonterminals = nonterminals
    self.productions = productions
    self.default_errors = {}

  def _parse(self, tokens):
    """_parse implements the shift-reduce parsing algorithm.

    _parse implements the standard shift-reduce algorithm outlined on ALSU
    pp236-237.

    Arguments:
      tokens: the list of token objects to parse.

    Returns:
      A ParseResult.
    """
    # The END_OF_INPUT token is added to the end so that the main loop does not
    # need explicit "cursor < len(tokens)" checks.
    tokens = list(tokens) + [Symbol(END_OF_INPUT)]

    # Each element of stack is a parse state and a (possibly partial) parse
    # tree.  The state at the top of the stack encodes which productions are
    # "active" (that is, those for which the parser has seen partial input
    # matching some prefix of the production, in a place where that production
    # might be valid), and, for each active production, how much of the
    # production has been completed.
    stack = [(0, None)]

    def state():
      return stack[-1][0]

    cursor = 0

    # On each iteration, look at the next symbol and the current state, and
    # perform the corresponding action.
    while True:
      if (state(), tokens[cursor].symbol) not in self.action:
        # Most state/symbol entries would be Errors, so rather than exhaustively
        # adding error entries, we just check here.
        if state() in self.default_errors:
          next_action = Error(self.default_errors[state()])
        else:
          next_action = Error(None)
      else:
        next_action = self.action[state(), tokens[cursor].symbol]

      if isinstance(next_action, Shift):
        # Shift means that there are no "complete" productions on the stack,
        # and so the current token should be shifted onto the stack, with a new
        # state indicating the new set of "active" productions.
        stack.append((next_action.state, tokens[cursor]))
        cursor += 1
      elif isinstance(next_action, Accept):
        # Accept means that parsing is over, successfully.
        assert len(stack) == 2, "Accepted incompletely-reduced input."
        assert tokens[cursor].symbol == END_OF_INPUT, ("Accepted parse before "
                                                       "end of input.")
        return ParseResult(stack[-1][1], None)
      elif isinstance(next_action, Reduce):
        # Reduce means that there is a complete production on the stack, and
        # that the next symbol implies that the completed production is the
        # correct production.
        #
        # Per ALSU, we would simply pop an element off the state stack for each
        # symbol on the rhs of the production, and then push a new state by
        # looking up the (post-pop) current state and the lhs of the production
        # in GOTO.  The GOTO table, in some sense, is equivalent to shift
        # actions for nonterminal symbols.
        #
        # Here, we attach a new partial parse tree, with the production lhs as
        # the "name" of the tree, and the popped trees as the "children" of the
        # new tree.
        children = [
            item[1] for item in stack[len(stack) - len(next_action.rule.rhs):]
        ]
        # Attach source_location, if known.  The source location will not be
        # known if the reduction consumes no symbols (empty rhs) or if the
        # client did not specify source_locations for tokens.
        #
        # It is necessary to loop in order to handle cases like:
        #
        # C -> c D
        # D ->
        #
        # The D child of the C reduction will not have a source location
        # (because it is not produced from any source), so it is necessary to
        # scan backwards through C's children to find the end position.  The
        # opposite is required in the case where initial children have no
        # source.
        #
        # These loops implicitly handle the case where the reduction has no
        # children, setting the source_location to None in that case.
        start_position = None
        end_position = None
        for child in children:
          if hasattr(child,
                     "source_location") and child.source_location is not None:
            start_position = child.source_location.start
            break
        for child in reversed(children):
          if hasattr(child,
                     "source_location") and child.source_location is not None:
            end_position = child.source_location.end
            break
        if start_position is None:
          source_location = None
        else:
          source_location = parser_types.make_location(start_position,
                                                       end_position)
        reduction = Reduction(next_action.rule.lhs, children, next_action.rule,
                              source_location)
        del stack[len(stack) - len(next_action.rule.rhs):]
        stack.append((self.goto[state(), next_action.rule.lhs], reduction))
      elif isinstance(next_action, Error):
        # Error means that the parse is impossible.  For typical grammars and
        # texts, this usually happens within a few tokens after the mistake in
        # the input stream, which is convenient (though imperfect) for error
        # reporting.
        return ParseResult(None,
                           ParseError(next_action.code, cursor, tokens[cursor],
                                      state(), self.expected[state()]))
      else:
        assert False, "Shouldn't be here."

  def mark_error(self, tokens, error_token, error_code):
    """Marks an error state with the given error code.

    mark_error implements the equivalent of the "Merr" system presented in
    "Generating LR Syntax Error Messages from Examples" (Jeffery, 2003).
    This system has limitations, but has the primary advantage that error
    messages can be specified by giving an example of the error and the
    message itself.

    Arguments:
      tokens: a list of tokens to parse.
      error_token: the token where the parse should fail, or None if the parse
        should fail at the implicit end-of-input token.

        If the error_token is the special ANY_TOKEN, then the error will be
        recorded as the default error for the error state.
      error_code: a value to record for the error state reached by parsing
        tokens.

    Returns:
      None if error_code was successfully recorded, or an error message if there
      was a problem.
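
    For example (an illustrative sketch): with a grammar whose only
    production is "S -> a b", the following would attach a custom error code
    to the state reached when a second "a" arrives instead of "b":

      bad = Symbol("a")
      parser.mark_error([Symbol("a"), bad], bad, "expected 'b' after 'a'")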
    """
    result = self._parse(tokens)

    # There is no error state to mark on a successful parse.
    if not result.error:
      return "Input successfully parsed."

    # Check if the error occurred at the specified token; if not, then this was
    # not the expected error.
    if error_token is None:
      error_symbol = END_OF_INPUT
      if result.error.token.symbol != END_OF_INPUT:
        return "error occurred on {} token, not end of input.".format(
            result.error.token.symbol)
    else:
      error_symbol = error_token.symbol
      if result.error.token != error_token:
        return "error occurred on {} token, not {} token.".format(
            result.error.token.symbol, error_token.symbol)

    # If the expected error was found, attempt to mark it.  It is acceptable if
    # the given error_code is already set as the error code for the given parse,
    # but not if a different code is set.
    if result.error.token == ANY_TOKEN:
      # For ANY_TOKEN, mark it as a default error.
      if result.error.state in self.default_errors:
        if self.default_errors[result.error.state] == error_code:
          return None
        else:
          return ("Attempted to overwrite existing default error code {!r} "
                  "with new error code {!r} for state {}".format(
                      self.default_errors[result.error.state], error_code,
                      result.error.state))
      else:
        self.default_errors[result.error.state] = error_code
        return None
    else:
      if (result.error.state, error_symbol) in self.action:
        existing_error = self.action[result.error.state, error_symbol]
        assert isinstance(existing_error, Error), "Bug"
        if existing_error.code == error_code:
          return None
        else:
          return ("Attempted to overwrite existing error code {!r} with new "
                  "error code {!r} for state {}, terminal {}".format(
                      existing_error.code, error_code, result.error.state,
                      error_symbol))
      else:
        self.action[result.error.state, error_symbol] = Error(error_code)
        return None
    assert False, "All other paths should lead to return."

  def parse(self, tokens):
    """Parses a list of tokens.

    Arguments:
      tokens: a list of tokens to parse.

    Returns:
      A ParseResult.
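
      For example (an illustrative sketch; the module's Symbol tuple provides
      the minimal .symbol attribute that parsing requires of tokens):

        result = parser.parse([Symbol("a"), Symbol("b")])
        if result.error is None:
          tree = result.parse_tree  # A Reduction for the start symbol.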
    """
    return self._parse(tokens)