1# Copyright 2019 Google LLC 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# https://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14 15"""LR(1) parser generator. 16 17The primary class in this module, Grammar, takes a list of context-free grammar 18productions, and produces the corresponding LR(1) shift-reduce parser. This is 19an implementation of the algorithm on pages 221 and 261-265 of "Compilers: 20Principles, Techniques, & Tools" (Second Edition) by Aho, Lam, Sethi, and 21Ullman, also known as "The Dragon Book," hereafter referred to as "ALSU." 22 23This module only implements the LR(1) algorithms; unlike tools such as yacc, it 24does not implement the various bits of glue necessary to actually use a parser. 25Clients are expected to provide their own tokenizers and handle turning a raw 26parse tree into an intermediate representation on their own. 27""" 28 29import collections 30 31from compiler.util import parser_types 32 33 34class Item(collections.namedtuple("Item", ["production", "dot", "terminal", 35 "next_symbol"])): 36 """An Item is an LR(1) Item: a production, a cursor location, and a terminal. 37 38 An Item represents a partially-parsed production, and a lookahead symbol. The 39 position of the dot indicates what portion of the production has been parsed. 40 Generally, Items are an internal implementation detail, but they can be useful 41 elsewhere, particularly for debugging. 42 43 Attributes: 44 production: The Production this Item covers. 45 dot: The index of the "dot" in production's rhs. 46 terminal: The terminal lookahead symbol that follows the production in the 47 input stream. 48 """ 49 50 def __str__(self): 51 """__str__ generates ASLU notation.""" 52 return (str(self.production.lhs) + " -> " + " ".join( 53 [str(r) for r in self.production.rhs[0:self.dot] + (".",) + 54 self.production.rhs[self.dot:]]) + ", " + str(self.terminal)) 55 56 @staticmethod 57 def parse(text): 58 """Parses an Item in ALSU notation. 59 60 Parses an Item from notation like: 61 62 symbol -> foo . bar baz, qux 63 64 where "symbol -> foo bar baz" will be taken as the production, the position 65 of the "." is taken as "dot" (in this case 1), and the symbol after "," is 66 taken as the "terminal". The following are also valid items: 67 68 sym -> ., foo 69 sym -> . foo bar, baz 70 sym -> foo bar ., baz 71 72 Symbols on the right-hand side of the production should be separated by 73 whitespace. 74 75 Arguments: 76 text: The text to parse into an Item. 77 78 Returns: 79 An Item. 80 """ 81 production, terminal = text.split(",") 82 terminal = terminal.strip() 83 if terminal == "$": 84 terminal = END_OF_INPUT 85 lhs, rhs = production.split("->") 86 lhs = lhs.strip() 87 if lhs == "S'": 88 lhs = START_PRIME 89 before_dot, after_dot = rhs.split(".") 90 handle = before_dot.split() 91 tail = after_dot.split() 92 return make_item(parser_types.Production(lhs, tuple(handle + tail)), 93 len(handle), terminal) 94 95 96def make_item(production, dot, symbol): 97 return Item(production, dot, symbol, 98 None if dot >= len(production.rhs) else production.rhs[dot]) 99 100 101class Conflict( 102 collections.namedtuple("Conflict", ["state", "symbol", "actions"]) 103): 104 """Conflict represents a parse conflict.""" 105 106 def __str__(self): 107 return "Conflict for {} in state {}: ".format( 108 self.symbol, self.state) + " vs ".join([str(a) for a in self.actions]) 109 110 111Shift = collections.namedtuple("Shift", ["state", "items"]) 112Reduce = collections.namedtuple("Reduce", ["rule"]) 113Accept = collections.namedtuple("Accept", []) 114Error = collections.namedtuple("Error", ["code"]) 115 116Symbol = collections.namedtuple("Symbol", ["symbol"]) 117 118# START_PRIME is the implicit 'real' root symbol for the grammar. 119START_PRIME = "S'" 120 121# END_OF_INPUT is the implicit symbol at the end of input. 122END_OF_INPUT = "$" 123 124# ANY_TOKEN is used by mark_error as a "wildcard" token that should be replaced 125# by every other token. 126ANY_TOKEN = parser_types.Token(object(), "*", 127 parser_types.parse_location("0:0-0:0")) 128 129 130class Reduction(collections.namedtuple("Reduction", 131 ["symbol", "children", "production", 132 "source_location"])): 133 """A Reduction is a non-leaf node in a parse tree. 134 135 Attributes: 136 symbol: The name of this element in the parse. 137 children: The child elements of this parse. 138 production: The grammar production to which this reduction corresponds. 139 source_location: If known, the range in the source text corresponding to the 140 tokens from which this reduction was parsed. May be 'None' if this 141 reduction was produced from no symbols, or if the tokens fed to `parse` 142 did not include source_location. 143 """ 144 pass 145 146 147class Grammar(object): 148 """Grammar is an LR(1) context-free grammar. 149 150 Attributes: 151 start: The start symbol for the grammar. 152 productions: A list of productions in the grammar, including the S' -> start 153 production. 154 symbols: A set of all symbols in the grammar, including $ and S'. 155 nonterminals: A set of all nonterminal symbols in the grammar, including S'. 156 terminals: A set of all terminal symbols in the grammar, including $. 157 """ 158 159 def __init__(self, start_symbol, productions): 160 """Constructs a Grammar object. 161 162 Arguments: 163 start_symbol: The start symbol for the grammar. 164 productions: A list of productions (not including the "S' -> start_symbol" 165 production). 166 """ 167 object.__init__(self) 168 self.start = start_symbol 169 self._seed_production = parser_types.Production(START_PRIME, (self.start,)) 170 self.productions = productions + [self._seed_production] 171 172 self._single_level_closure_of_item_cache = {} 173 self._closure_of_item_cache = {} 174 self._compute_symbols() 175 self._compute_seed_firsts() 176 self._set_productions_by_lhs() 177 self._populate_item_cache() 178 179 def _set_productions_by_lhs(self): 180 # Prepopulating _productions_by_lhs speeds up _closure_of_item by about 30%, 181 # which is significant on medium-to-large grammars. 182 self._productions_by_lhs = {} 183 for production in self.productions: 184 self._productions_by_lhs.setdefault(production.lhs, list()).append( 185 production) 186 187 def _populate_item_cache(self): 188 # There are a relatively small number of possible Items for a grammar, and 189 # the algorithm needs to get Items from their constituent components very 190 # frequently. As it turns out, pre-caching all possible Items results in a 191 # ~35% overall speedup to Grammar.parser(). 192 self._item_cache = {} 193 for symbol in self.terminals: 194 for production in self.productions: 195 for dot in range(len(production.rhs) + 1): 196 self._item_cache[production, dot, symbol] = make_item( 197 production, dot, symbol) 198 199 def _compute_symbols(self): 200 """Finds all grammar symbols, and sorts them into terminal and non-terminal. 201 202 Nonterminal symbols are those which appear on the left side of any 203 production. Terminal symbols are those which do not. 204 205 _compute_symbols is used during __init__. 206 """ 207 self.symbols = {END_OF_INPUT} 208 self.nonterminals = set() 209 for production in self.productions: 210 self.symbols.add(production.lhs) 211 self.nonterminals.add(production.lhs) 212 for symbol in production.rhs: 213 self.symbols.add(symbol) 214 self.terminals = self.symbols - self.nonterminals 215 216 def _compute_seed_firsts(self): 217 """Computes FIRST (ALSU p221) for all terminal and nonterminal symbols. 218 219 The algorithm for computing FIRST is an iterative one that terminates when 220 it reaches a fixed point (that is, when further iterations stop changing 221 state). _compute_seed_firsts computes the fixed point for all single-symbol 222 strings, by repeatedly calling _first and updating the internal _firsts 223 table with the results. 224 225 Once _compute_seed_firsts has completed, _first will return correct results 226 for both single- and multi-symbol strings. 227 228 _compute_seed_firsts is used during __init__. 229 """ 230 self.firsts = {} 231 # FIRST for a terminal symbol is always just that terminal symbol. 232 for terminal in self.terminals: 233 self.firsts[terminal] = set([terminal]) 234 for nonterminal in self.nonterminals: 235 self.firsts[nonterminal] = set() 236 while True: 237 # The first iteration picks up all the productions that start with 238 # terminal symbols. The second iteration picks up productions that start 239 # with nonterminals that the first iteration picked up. The third 240 # iteration picks up nonterminals that the first and second picked up, and 241 # so on. 242 # 243 # This is guaranteed to end, in the worst case, when every terminal 244 # symbol and epsilon has been added to the _firsts set for every 245 # nonterminal symbol. This would be slow, but requires a pathological 246 # grammar; useful grammars should complete in only a few iterations. 247 firsts_to_add = {} 248 for production in self.productions: 249 for first in self._first(production.rhs): 250 if first not in self.firsts[production.lhs]: 251 if production.lhs not in firsts_to_add: 252 firsts_to_add[production.lhs] = set() 253 firsts_to_add[production.lhs].add(first) 254 if not firsts_to_add: 255 break 256 for symbol in firsts_to_add: 257 self.firsts[symbol].update(firsts_to_add[symbol]) 258 259 def _first(self, symbols): 260 """The FIRST function from ALSU p221. 261 262 _first takes a string of symbols (both terminals and nonterminals) and 263 returns the set of terminal symbols which could be the first terminal symbol 264 of a string produced by the given list of symbols. 265 266 _first will not give fully-correct results until _compute_seed_firsts 267 finishes, but is called by _compute_seed_firsts, and must provide partial 268 results during that method's execution. 269 270 Args: 271 symbols: A list of symbols. 272 273 Returns: 274 A set of terminals which could be the first terminal in "symbols." 275 """ 276 result = set() 277 all_contain_epsilon = True 278 for symbol in symbols: 279 for first in self.firsts[symbol]: 280 if first: 281 result.add(first) 282 if None not in self.firsts[symbol]: 283 all_contain_epsilon = False 284 break 285 if all_contain_epsilon: 286 # "None" seems like a Pythonic way of representing epsilon (no symbol). 287 result.add(None) 288 return result 289 290 def _closure_of_item(self, root_item): 291 """Modified implementation of CLOSURE from ALSU p261. 292 293 _closure_of_item performs the CLOSURE function with a single seed item, with 294 memoization. In the algorithm as presented in ALSU, CLOSURE is called with 295 a different set of items every time, which is unhelpful for memoization. 296 Instead, we let _parallel_goto merge the sets returned by _closure_of_item, 297 which results in a ~40% speedup. 298 299 CLOSURE, roughly, computes the set of LR(1) Items which might be active when 300 a "seed" set of Items is active. 301 302 Technically, it is the epsilon-closure of the NFA states represented by 303 "items," where an epsilon transition (a transition that does not consume any 304 symbols) occurs from a->Z.bY,q to b->.X,p when p is in FIRST(Yq). (a and b 305 are nonterminals, X, Y, and Z are arbitrary strings of symbols, and p and q 306 are terminals.) That is, it is the set of all NFA states which can be 307 reached from "items" without consuming any input. This set corresponds to a 308 single DFA state. 309 310 Args: 311 root_item: The initial LR(1) Item. 312 313 Returns: 314 A set of LR(1) items which may be active at the time when the provided 315 item is active. 316 """ 317 if root_item in self._closure_of_item_cache: 318 return self._closure_of_item_cache[root_item] 319 item_set = set([root_item]) 320 item_list = [root_item] 321 i = 0 322 # Each newly-added Item may trigger the addition of further Items, so 323 # iterate until no new Items are added. In the worst case, a new Item will 324 # be added for each production. 325 # 326 # This algorithm is really looking for "next" nonterminals in the existing 327 # items, and adding new items corresponding to their productions. 328 while i < len(item_list): 329 item = item_list[i] 330 i += 1 331 if not item.next_symbol: 332 continue 333 # If _closure_of_item_cache contains the full closure of item, then we can 334 # add its full closure to the result set, and skip checking any of its 335 # items: any item that would be added by any item in the cached result 336 # will already be in the _closure_of_item_cache entry. 337 if item in self._closure_of_item_cache: 338 item_set |= self._closure_of_item_cache[item] 339 continue 340 # Even if we don't have the full closure of item, we may have the 341 # immediate closure of item. It turns out that memoizing just this step 342 # speeds up this function by about 50%, even after the 343 # _closure_of_item_cache check. 344 if item not in self._single_level_closure_of_item_cache: 345 new_items = set() 346 for production in self._productions_by_lhs.get(item.next_symbol, []): 347 for terminal in self._first(item.production.rhs[item.dot + 1:] + 348 (item.terminal,)): 349 new_items.add(self._item_cache[production, 0, terminal]) 350 self._single_level_closure_of_item_cache[item] = new_items 351 for new_item in self._single_level_closure_of_item_cache[item]: 352 if new_item not in item_set: 353 item_set.add(new_item) 354 item_list.append(new_item) 355 self._closure_of_item_cache[root_item] = item_set 356 # Typically, _closure_of_item() will be called on items whose closures 357 # bring in the greatest number of additional items, then on items which 358 # close over fewer and fewer other items. Since items are not added to 359 # _closure_of_item_cache unless _closure_of_item() is called directly on 360 # them, this means that it is unlikely that items brought in will (without 361 # intervention) have entries in _closure_of_item_cache, which slows down the 362 # computation of the larger closures. 363 # 364 # Although it is not guaranteed, items added to item_list last will tend to 365 # close over fewer items, and therefore be easier to compute. By forcibly 366 # re-calculating closures from last to first, and adding the results to 367 # _closure_of_item_cache at each step, we get a modest performance 368 # improvement: roughly 50% less time spent in _closure_of_item, which 369 # translates to about 5% less time in parser(). 370 for item in item_list[::-1]: 371 self._closure_of_item(item) 372 return item_set 373 374 def _parallel_goto(self, items): 375 """The GOTO function from ALSU p261, executed on all symbols. 376 377 _parallel_goto takes a set of Items, and returns a dict from every symbol in 378 self.symbols to the set of Items that would be active after a shift 379 operation (if symbol is a terminal) or after a reduction operation (if 380 symbol is a nonterminal). 381 382 _parallel_goto is used in lieu of the single-symbol GOTO from ALSU because 383 it eliminates the outer loop over self.terminals, and thereby reduces the 384 number of next_symbol calls by a factor of len(self.terminals). 385 386 Args: 387 items: The set of items representing the initial DFA state. 388 389 Returns: 390 A dict from symbols to sets of items representing the new DFA states. 391 """ 392 results = collections.defaultdict(set) 393 for item in items: 394 next_symbol = item.next_symbol 395 if next_symbol is None: 396 continue 397 item = self._item_cache[item.production, item.dot + 1, item.terminal] 398 # Inlining the cache check results in a ~25% speedup in this function, and 399 # about 10% overall speedup to parser(). 400 if item in self._closure_of_item_cache: 401 closure = self._closure_of_item_cache[item] 402 else: 403 closure = self._closure_of_item(item) 404 # _closure will add newly-started Items (Items with dot=0) to the result 405 # set. After this operation, the result set will correspond to the new 406 # state. 407 results[next_symbol].update(closure) 408 return results 409 410 def _items(self): 411 """The items function from ALSU p261. 412 413 _items computes the set of sets of LR(1) items for a shift-reduce parser 414 that matches the grammar. Each set of LR(1) items corresponds to a single 415 DFA state. 416 417 Returns: 418 A tuple. 419 420 The first element of the tuple is a list of sets of LR(1) items (each set 421 corresponding to a DFA state). 422 423 The second element of the tuple is a dictionary from (int, symbol) pairs 424 to ints, where all the ints are indexes into the list of sets of LR(1) 425 items. This dictionary is based on the results of the _Goto function, 426 where item_sets[dict[i, sym]] == self._Goto(item_sets[i], sym). 427 """ 428 # The list of states is seeded with the marker S' production. 429 item_list = [ 430 frozenset(self._closure_of_item( 431 self._item_cache[self._seed_production, 0, END_OF_INPUT])) 432 ] 433 items = {item_list[0]: 0} 434 goto_table = {} 435 i = 0 436 # For each state, figure out what the new state when each symbol is added to 437 # the top of the parsing stack (see the comments in parser._parse). See 438 # _Goto for an explanation of how that is actually computed. 439 while i < len(item_list): 440 item_set = item_list[i] 441 gotos = self._parallel_goto(item_set) 442 for symbol, goto in gotos.items(): 443 goto = frozenset(goto) 444 if goto not in items: 445 items[goto] = len(item_list) 446 item_list.append(goto) 447 goto_table[i, symbol] = items[goto] 448 i += 1 449 return item_list, goto_table 450 451 def parser(self): 452 """parser returns an LR(1) parser for the Grammar. 453 454 This implements the Canonical LR(1) ("LR(1)") parser algorithm ("Algorithm 455 4.56", ALSU p265), rather than the more common Lookahead LR(1) ("LALR(1)") 456 algorithm. LALR(1) produces smaller tables, but is more complex and does 457 not cover all LR(1) grammars. When the LR(1) and LALR(1) algorithms were 458 invented, table sizes were an important consideration; now, the difference 459 between a few hundred and a few thousand entries is unlikely to matter. 460 461 At this time, Grammar does not handle ambiguous grammars, which are commonly 462 used to handle precedence, associativity, and the "dangling else" problem. 463 Formally, these can always be handled by an unambiguous grammar, though 464 doing so can be cumbersome, particularly for expression languages with many 465 levels of precedence. ALSU section 4.8 (pp278-287) contains some techniques 466 for handling these kinds of ambiguity. 467 468 Returns: 469 A Parser. 470 """ 471 item_sets, goto = self._items() 472 action = {} 473 conflicts = set() 474 end_item = self._item_cache[self._seed_production, 1, END_OF_INPUT] 475 for i in range(len(item_sets)): 476 for item in item_sets[i]: 477 new_action = None 478 if (item.next_symbol is None and 479 item.production != self._seed_production): 480 terminal = item.terminal 481 new_action = Reduce(item.production) 482 elif item.next_symbol in self.terminals: 483 terminal = item.next_symbol 484 assert goto[i, terminal] is not None 485 new_action = Shift(goto[i, terminal], item_sets[goto[i, terminal]]) 486 if new_action: 487 if (i, terminal) in action and action[i, terminal] != new_action: 488 conflicts.add( 489 Conflict(i, terminal, 490 frozenset([action[i, terminal], new_action]))) 491 action[i, terminal] = new_action 492 if item == end_item: 493 new_action = Accept() 494 assert (i, END_OF_INPUT 495 ) not in action or action[i, END_OF_INPUT] == new_action 496 action[i, END_OF_INPUT] = new_action 497 trimmed_goto = {} 498 for k in goto: 499 if k[1] in self.nonterminals: 500 trimmed_goto[k] = goto[k] 501 expected = {} 502 for state, terminal in action: 503 if state not in expected: 504 expected[state] = set() 505 expected[state].add(terminal) 506 return Parser(item_sets, trimmed_goto, action, expected, conflicts, 507 self.terminals, self.nonterminals, self.productions) 508 509 510ParseError = collections.namedtuple("ParseError", ["code", "index", "token", 511 "state", "expected_tokens"]) 512ParseResult = collections.namedtuple("ParseResult", ["parse_tree", "error"]) 513 514 515class Parser(object): 516 """Parser is a shift-reduce LR(1) parser. 517 518 Generally, clients will want to get a Parser from a Grammar, rather than 519 directly instantiating one. 520 521 Parser exposes the raw tables needed to feed into a Shift-Reduce parser, 522 but can also be used directly for parsing. 523 524 Attributes: 525 item_sets: A list of item sets which correspond to the state numbers in 526 the action and goto tables. This is not necessary for parsing, but is 527 useful for debugging parsers. 528 goto: The GOTO table for this parser. 529 action: The ACTION table for this parser. 530 expected: A table of terminal symbols that are expected (that is, that 531 have a non-Error action) for each state. This can be used to provide 532 more helpful error messages for parse errors. 533 conflicts: A set of unresolved conflicts found during table generation. 534 terminals: A set of terminal symbols in the grammar. 535 nonterminals: A set of nonterminal symbols in the grammar. 536 productions: A list of productions in the grammar. 537 default_errors: A dict of states to default error codes to use when 538 encountering an error in that state, when a more-specific Error for the 539 state/terminal pair has not been set. 540 """ 541 542 def __init__(self, item_sets, goto, action, expected, conflicts, terminals, 543 nonterminals, productions): 544 super(Parser, self).__init__() 545 self.item_sets = item_sets 546 self.goto = goto 547 self.action = action 548 self.expected = expected 549 self.conflicts = conflicts 550 self.terminals = terminals 551 self.nonterminals = nonterminals 552 self.productions = productions 553 self.default_errors = {} 554 555 def _parse(self, tokens): 556 """_parse implements Shift-Reduce parsing algorithm. 557 558 _parse implements the standard shift-reduce algorithm outlined on ASLU 559 pp236-237. 560 561 Arguments: 562 tokens: the list of token objects to parse. 563 564 Returns: 565 A ParseResult. 566 """ 567 # The END_OF_INPUT token is explicitly added to avoid explicit "cursor < 568 # len(tokens)" checks. 569 tokens = list(tokens) + [Symbol(END_OF_INPUT)] 570 571 # Each element of stack is a parse state and a (possibly partial) parse 572 # tree. The state at the top of the stack encodes which productions are 573 # "active" (that is, which ones the parser has seen partial input which 574 # matches some prefix of the production, in a place where that production 575 # might be valid), and, for each active production, how much of the 576 # production has been completed. 577 stack = [(0, None)] 578 579 def state(): 580 return stack[-1][0] 581 582 cursor = 0 583 584 # On each iteration, look at the next symbol and the current state, and 585 # perform the corresponding action. 586 while True: 587 if (state(), tokens[cursor].symbol) not in self.action: 588 # Most state/symbol entries would be Errors, so rather than exhaustively 589 # adding error entries, we just check here. 590 if state() in self.default_errors: 591 next_action = Error(self.default_errors[state()]) 592 else: 593 next_action = Error(None) 594 else: 595 next_action = self.action[state(), tokens[cursor].symbol] 596 597 if isinstance(next_action, Shift): 598 # Shift means that there are no "complete" productions on the stack, 599 # and so the current token should be shifted onto the stack, with a new 600 # state indicating the new set of "active" productions. 601 stack.append((next_action.state, tokens[cursor])) 602 cursor += 1 603 elif isinstance(next_action, Accept): 604 # Accept means that parsing is over, successfully. 605 assert len(stack) == 2, "Accepted incompletely-reduced input." 606 assert tokens[cursor].symbol == END_OF_INPUT, ("Accepted parse before " 607 "end of input.") 608 return ParseResult(stack[-1][1], None) 609 elif isinstance(next_action, Reduce): 610 # Reduce means that there is a complete production on the stack, and 611 # that the next symbol implies that the completed production is the 612 # correct production. 613 # 614 # Per ALSU, we would simply pop an element off the state stack for each 615 # symbol on the rhs of the production, and then push a new state by 616 # looking up the (post-pop) current state and the lhs of the production 617 # in GOTO. The GOTO table, in some sense, is equivalent to shift 618 # actions for nonterminal symbols. 619 # 620 # Here, we attach a new partial parse tree, with the production lhs as 621 # the "name" of the tree, and the popped trees as the "children" of the 622 # new tree. 623 children = [ 624 item[1] for item in stack[len(stack) - len(next_action.rule.rhs):] 625 ] 626 # Attach source_location, if known. The source location will not be 627 # known if the reduction consumes no symbols (empty rhs) or if the 628 # client did not specify source_locations for tokens. 629 # 630 # It is necessary to loop in order to handle cases like: 631 # 632 # C -> c D 633 # D -> 634 # 635 # The D child of the C reduction will not have a source location 636 # (because it is not produced from any source), so it is necessary to 637 # scan backwards through C's children to find the end position. The 638 # opposite is required in the case where initial children have no 639 # source. 640 # 641 # These loops implicitly handle the case where the reduction has no 642 # children, setting the source_location to None in that case. 643 start_position = None 644 end_position = None 645 for child in children: 646 if hasattr(child, 647 "source_location") and child.source_location is not None: 648 start_position = child.source_location.start 649 break 650 for child in reversed(children): 651 if hasattr(child, 652 "source_location") and child.source_location is not None: 653 end_position = child.source_location.end 654 break 655 if start_position is None: 656 source_location = None 657 else: 658 source_location = parser_types.make_location(start_position, 659 end_position) 660 reduction = Reduction(next_action.rule.lhs, children, next_action.rule, 661 source_location) 662 del stack[len(stack) - len(next_action.rule.rhs):] 663 stack.append((self.goto[state(), next_action.rule.lhs], reduction)) 664 elif isinstance(next_action, Error): 665 # Error means that the parse is impossible. For typical grammars and 666 # texts, this usually happens within a few tokens after the mistake in 667 # the input stream, which is convenient (though imperfect) for error 668 # reporting. 669 return ParseResult(None, 670 ParseError(next_action.code, cursor, tokens[cursor], 671 state(), self.expected[state()])) 672 else: 673 assert False, "Shouldn't be here." 674 675 def mark_error(self, tokens, error_token, error_code): 676 """Marks an error state with the given error code. 677 678 mark_error implements the equivalent of the "Merr" system presented in 679 "Generating LR Syntax error Messages from Examples" (Jeffery, 2003). 680 This system has limitations, but has the primary advantage that error 681 messages can be specified by giving an example of the error and the 682 message itself. 683 684 Arguments: 685 tokens: a list of tokens to parse. 686 error_token: the token where the parse should fail, or None if the parse 687 should fail at the implicit end-of-input token. 688 689 If the error_token is the special ANY_TOKEN, then the error will be 690 recorded as the default error for the error state. 691 error_code: a value to record for the error state reached by parsing 692 tokens. 693 694 Returns: 695 None if error_code was successfully recorded, or an error message if there 696 was a problem. 697 """ 698 result = self._parse(tokens) 699 700 # There is no error state to mark on a successful parse. 701 if not result.error: 702 return "Input successfully parsed." 703 704 # Check if the error occurred at the specified token; if not, then this was 705 # not the expected error. 706 if error_token is None: 707 error_symbol = END_OF_INPUT 708 if result.error.token.symbol != END_OF_INPUT: 709 return "error occurred on {} token, not end of input.".format( 710 result.error.token.symbol) 711 else: 712 error_symbol = error_token.symbol 713 if result.error.token != error_token: 714 return "error occurred on {} token, not {} token.".format( 715 result.error.token.symbol, error_token.symbol) 716 717 # If the expected error was found, attempt to mark it. It is acceptable if 718 # the given error_code is already set as the error code for the given parse, 719 # but not if a different code is set. 720 if result.error.token == ANY_TOKEN: 721 # For ANY_TOKEN, mark it as a default error. 722 if result.error.state in self.default_errors: 723 if self.default_errors[result.error.state] == error_code: 724 return None 725 else: 726 return ("Attempted to overwrite existing default error code {!r} " 727 "with new error code {!r} for state {}".format( 728 self.default_errors[result.error.state], error_code, 729 result.error.state)) 730 else: 731 self.default_errors[result.error.state] = error_code 732 return None 733 else: 734 if (result.error.state, error_symbol) in self.action: 735 existing_error = self.action[result.error.state, error_symbol] 736 assert isinstance(existing_error, Error), "Bug" 737 if existing_error.code == error_code: 738 return None 739 else: 740 return ("Attempted to overwrite existing error code {!r} with new " 741 "error code {!r} for state {}, terminal {}".format( 742 existing_error.code, error_code, result.error.state, 743 error_symbol)) 744 else: 745 self.action[result.error.state, error_symbol] = Error(error_code) 746 return None 747 assert False, "All other paths should lead to return." 748 749 def parse(self, tokens): 750 """Parses a list of tokens. 751 752 Arguments: 753 tokens: a list of tokens to parse. 754 755 Returns: 756 A ParseResult. 757 """ 758 result = self._parse(tokens) 759 return result 760