# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generates a Markdown file documenting the raw Emboss grammar."""

from __future__ import print_function

import re
import sys

from compiler.front_end import constraints
from compiler.front_end import module_ir
from compiler.front_end import tokenizer

# Keep the output to less than 80 columns, so that the preformatted sections
# are not cut off.
_MAX_OUTPUT_WIDTH = 80

_HEADER = """
This is the context-free grammar for Emboss. Terminal symbols are in `"quotes"`
or are named in `CamelCase`; nonterminal symbols are named in `snake_case`. The
term `<empty>` to the right of the `->` indicates an empty production (a rule
where the left-hand side may be parsed from an empty string).

This listing is auto-generated from the grammar defined in `module_ir.py`.

Note that, unlike in many languages, comments are included in the grammar. This
is so that comments can be handled more easily by the autoformatter; comments
are ignored by the compiler. This is distinct from *documentation*, which is
included in the IR for use by documentation generators.

""".lstrip()

_BOILERPLATE_PRODUCTION_HEADER = """
The following productions are automatically generated to handle zero-or-more,
one-or-more, and zero-or-one repeated lists (`foo*`, `foo+`, and `foo?`
nonterminals) in LR(1). They are included for completeness, but may be ignored
if you just want to understand the grammar.

"""

_TOKENIZER_RULE_HEADER = """
The following regexes are used to tokenize input into the corresponding
symbols. Note that the `Indent`, `Dedent`, and `EndOfLine` symbols are
generated using separate logic.

"""

_KEYWORDS_HEADER = """
The following {} keywords are reserved, but not used, by Emboss. They may not
be used as field, type, or enum value names.

"""


def _sort_productions(productions, start_symbol):
  """Sorts the given productions in a human-friendly order."""
  productions_by_lhs = {}
  for p in productions:
    if p.lhs not in productions_by_lhs:
      productions_by_lhs[p.lhs] = set()
    productions_by_lhs[p.lhs].add(p)

  queue = [start_symbol]
  previously_queued_symbols = set(queue)
  main_production_list = []
  # This sorts productions depth-first. I'm not sure if it is better to sort
  # them breadth-first or depth-first, or with some hybrid.
  while queue:
    symbol = queue.pop(-1)
    if symbol not in productions_by_lhs:
      continue
    for production in sorted(productions_by_lhs[symbol]):
      main_production_list.append(production)
      for symbol in production.rhs:
        # Skip boilerplate productions for now, but include their base
        # production.
        if symbol and symbol[-1] in "*+?":
          symbol = symbol[0:-1]
        if symbol not in previously_queued_symbols:
          queue.append(symbol)
          previously_queued_symbols.add(symbol)

  # It's not particularly important to put boilerplate productions in any
  # particular order.
  boilerplate_production_list = sorted(
      set(productions) - set(main_production_list))
  for production in boilerplate_production_list:
    assert production.lhs[-1] in "*+?", "Found orphaned production {}".format(
        production.lhs)
  assert set(productions) == set(
      main_production_list + boilerplate_production_list)
  assert len(productions) == len(main_production_list) + len(
      boilerplate_production_list)
  return main_production_list, boilerplate_production_list
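

# A rough illustration of how _sort_productions splits its input, using
# hypothetical productions rather than the real Emboss grammar: given the
# start symbol "module" and the productions
#
#     module -> attr* type
#     attr*  -> attr attr*
#     attr*  -> <empty>
#     attr   -> "[" SnakeWord "]"
#     type   -> "struct"
#
# the main list holds the "module", "type", and "attr" productions, discovered
# depth-first from "module", while the two "attr*" productions land in the
# boilerplate list because their left-hand side ends in "*".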


def _word_wrap_at_column(words, width):
  """Wraps words to the specified width; returns a list of wrapped lines."""
  result = []
  in_progress = []
  for word in words:
    if len(" ".join(in_progress + [word])) > width:
      result.append(" ".join(in_progress))
      assert len(result[-1]) <= width
      in_progress = []
    in_progress.append(word)
  result.append(" ".join(in_progress))
  assert len(result[-1]) <= width
  return result


def _format_productions(productions):
  """Formats a list of productions for inclusion in a Markdown document."""
  max_lhs_len = max([len(production.lhs) for production in productions])

  # TODO(bolms): This highlighting is close for now, but not actually right.
  result = ["```shell\n"]
  last_lhs = None
  for production in productions:
    if last_lhs == production.lhs:
      lhs = ""
      delimiter = " |"
    else:
      lhs = production.lhs
      delimiter = "->"
    leader = "{lhs:{width}} {delimiter}".format(
        lhs=lhs,
        width=max_lhs_len,
        delimiter=delimiter)
    for rhs_block in _word_wrap_at_column(
        production.rhs or ["<empty>"], _MAX_OUTPUT_WIDTH - len(leader)):
      result.append("{leader} {rhs}\n".format(leader=leader, rhs=rhs_block))
      leader = " " * len(leader)
    last_lhs = production.lhs
  result.append("```\n")
  return "".join(result)


def _normalize_literal_patterns(literals):
  """Normalizes a list of strings to a list of (regex, symbol) pairs."""
  return [(re.sub(r"(\W)", r"\\\1", literal), '"' + literal + '"')
          for literal in literals]


def _normalize_regex_patterns(regexes):
  """Normalizes a list of tokenizer regexes to a list of (regex, symbol)."""
  # g3doc breaks up patterns containing '|' when they are inserted into a
  # table, unless they're preceded by '\'. Note that other special characters,
  # including '\', should *not* be escaped with '\'.
  return [(re.sub(r"\|", r"\\|", r.regex.pattern), r.symbol) for r in regexes]


def _normalize_reserved_word_list(reserved_words):
  """Returns words that would be allowed as names if they were not reserved."""
  interesting_reserved_words = []
  for word in reserved_words:
    tokens, errors = tokenizer.tokenize(word, "")
    assert tokens and not errors, "Failed to tokenize " + word
    if tokens[0].symbol in ["SnakeWord", "CamelWord", "ShoutyWord"]:
      interesting_reserved_words.append(word)
  return sorted(interesting_reserved_words)


def _format_token_rules(token_rules):
  """Formats a list of (pattern, symbol) pairs as a table."""
  pattern_width = max([len(rule[0]) for rule in token_rules])
  pattern_width += 2  # For the `` characters.
  result = ["{pat_header:{width}} | Symbol\n"
            "{empty:-<{width}} | {empty:-<30}\n".format(pat_header="Pattern",
                                                        width=pattern_width,
                                                        empty="")]
  for rule in token_rules:
    if rule[1]:
      symbol_name = "`" + rule[1] + "`"
    else:
      symbol_name = "*no symbol emitted*"
    result.append(
        "{pattern:{width}} | {symbol}\n".format(pattern="`" + rule[0] + "`",
                                                width=pattern_width,
                                                symbol=symbol_name))
  return "".join(result)
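

# For illustration, _format_token_rules emits a Markdown table shaped roughly
# like the following (hypothetical rows, not the real Emboss token list):
#
#     Pattern   | Symbol
#     --------- | ------------------------------
#     `\[`      | `"["`
#     `\s+`     | *no symbol emitted*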


def _format_keyword_list(reserved_words):
  """Formats a list of reserved words."""
  lines = []
  current_line = ""
  for word in reserved_words:
    if len(current_line) + len(word) + 2 > _MAX_OUTPUT_WIDTH:
      lines.append(current_line)
      current_line = ""
    current_line += "`{}` ".format(word)
  if current_line:
    # Flush the final, partially-filled line; otherwise the last few keywords
    # would be silently dropped from the output.
    lines.append(current_line)
  return "".join([line[:-1] + "\n" for line in lines])


def generate_grammar_md():
  """Generates up-to-date text for grammar.md."""
  main_productions, boilerplate_productions = _sort_productions(
      module_ir.PRODUCTIONS, module_ir.START_SYMBOL)
  result = [_HEADER, _format_productions(main_productions),
            _BOILERPLATE_PRODUCTION_HEADER,
            _format_productions(boilerplate_productions)]

  main_tokens = _normalize_literal_patterns(tokenizer.LITERAL_TOKEN_PATTERNS)
  main_tokens += _normalize_regex_patterns(tokenizer.REGEX_TOKEN_PATTERNS)
  result.append(_TOKENIZER_RULE_HEADER)
  result.append(_format_token_rules(main_tokens))

  reserved_words = _normalize_reserved_word_list(
      constraints.get_reserved_word_list())
  result.append(_KEYWORDS_HEADER.format(len(reserved_words)))
  result.append(_format_keyword_list(reserved_words))

  return "".join(result)


def main(argv):
  del argv  # Unused.
  print(generate_grammar_md(), end="")
  return 0


if __name__ == "__main__":
  sys.exit(main(sys.argv))
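

# Typical invocation (assumed; the exact module path and output location may
# differ in a given checkout):
#
#     python3 compiler/front_end/generate_grammar_md.py > doc/grammar.md
#
# The script writes the generated Markdown to stdout, so it is normally
# redirected into grammar.md.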