# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15"""Generates a Markdown file documenting the raw Emboss grammar."""
16
17from __future__ import print_function
18
19import re
20import sys
21
22from compiler.front_end import constraints
23from compiler.front_end import module_ir
24from compiler.front_end import tokenizer
25
# Keep the output to at most 80 columns, so that the preformatted sections are
# not cut off.
_MAX_OUTPUT_WIDTH = 80

_HEADER = """
This is the context-free grammar for Emboss.  Terminal symbols are in `"quotes"`
or are named in `CamelCase`; nonterminal symbols are named in `snake_case`.  The
term `<empty>` to the right of the `->` indicates an empty production (a rule
where the left-hand side may be parsed from an empty string).

This listing is auto-generated from the grammar defined in `module_ir.py`.

Note that, unlike in many languages, comments are included in the grammar.  This
is so that comments can be handled more easily by the autoformatter; comments
are ignored by the compiler.  This is distinct from *documentation*, which is
included in the IR for use by documentation generators.

""".lstrip()

_BOILERPLATE_PRODUCTION_HEADER = """
The following productions are automatically generated to handle zero-or-more,
one-or-more, and zero-or-one repeated lists (`foo*`, `foo+`, and `foo?`
nonterminals) in LR(1).  They are included for completeness, but may be ignored
if you just want to understand the grammar.

"""

_TOKENIZER_RULE_HEADER = """
The following regexes are used to tokenize input into the corresponding symbols.
Note that the `Indent`, `Dedent`, and `EndOfLine` symbols are generated using
separate logic.

"""

_KEYWORDS_HEADER = """
The following {} keywords are reserved, but not used, by Emboss.  They may not
be used as field, type, or enum value names.

"""


def _sort_productions(productions, start_symbol):
  """Sorts the given productions in a human-friendly order."""
  productions_by_lhs = {}
  for p in productions:
    if p.lhs not in productions_by_lhs:
      productions_by_lhs[p.lhs] = set()
    productions_by_lhs[p.lhs].add(p)

  queue = [start_symbol]
  previously_queued_symbols = set(queue)
  main_production_list = []
  # This sorts productions depth-first.  I'm not sure if it is better to sort
  # them breadth-first or depth-first, or with some hybrid.
  while queue:
    symbol = queue.pop(-1)
    if symbol not in productions_by_lhs:
      continue
    for production in sorted(productions_by_lhs[symbol]):
      main_production_list.append(production)
      for rhs_symbol in production.rhs:
        # Skip boilerplate productions for now, but include their base
        # symbol's productions.
        if rhs_symbol and rhs_symbol[-1] in "*+?":
          rhs_symbol = rhs_symbol[0:-1]
        if rhs_symbol not in previously_queued_symbols:
          queue.append(rhs_symbol)
          previously_queued_symbols.add(rhs_symbol)

  # It's not particularly important to put boilerplate productions in any
  # particular order.
  boilerplate_production_list = sorted(
      set(productions) - set(main_production_list))
  for production in boilerplate_production_list:
    assert production.lhs[-1] in "*+?", "Found orphaned production {}".format(
        production.lhs)
  assert set(productions) == set(
      main_production_list + boilerplate_production_list)
  assert len(productions) == len(main_production_list) + len(
      boilerplate_production_list)
  return main_production_list, boilerplate_production_list


def _word_wrap_at_column(words, width):
  """Wraps words to the specified width, and returns a list of wrapped lines."""
  result = []
  in_progress = []
  for word in words:
    if len(" ".join(in_progress + [word])) > width:
      result.append(" ".join(in_progress))
      assert len(result[-1]) <= width
      in_progress = []
    in_progress.append(word)
  result.append(" ".join(in_progress))
  assert len(result[-1]) <= width
  return result


def _format_productions(productions):
  """Formats a list of productions for inclusion in a Markdown document."""
  max_lhs_len = max([len(production.lhs) for production in productions])

  # TODO(bolms): This highlighting is close for now, but not actually right.
  result = ["```shell\n"]
  last_lhs = None
  for production in productions:
    if last_lhs == production.lhs:
      lhs = ""
      delimiter = " |"
    else:
      lhs = production.lhs
      delimiter = "->"
    leader = "{lhs:{width}} {delimiter}".format(
        lhs=lhs,
        width=max_lhs_len,
        delimiter=delimiter)
    for rhs_block in _word_wrap_at_column(
        production.rhs or ["<empty>"], _MAX_OUTPUT_WIDTH - len(leader)):
      result.append("{leader} {rhs}\n".format(leader=leader, rhs=rhs_block))
      leader = " " * len(leader)
    last_lhs = production.lhs
  result.append("```\n")
  return "".join(result)


def _normalize_literal_patterns(literals):
  """Normalizes a list of strings to a list of (regex, symbol) pairs."""
  return [(re.sub(r"(\W)", r"\\\1", literal), '"' + literal + '"')
          for literal in literals]


def _normalize_regex_patterns(regexes):
  """Normalizes a list of tokenizer regexes to a list of (regex, symbol)."""
  # g3doc breaks up patterns containing '|' when they are inserted into a table,
  # unless they're preceded by '\'.  Note that other special characters,
  # including '\', should *not* be escaped with '\'.
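  # For example, a hypothetical token pattern "true|false" would be emitted
  # as "true\|false"; only '|' is escaped, per the note above.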
  return [(re.sub(r"\|", r"\\|", r.regex.pattern), r.symbol) for r in regexes]


def _normalize_reserved_word_list(reserved_words):
  """Returns words that would be allowed as names if they were not reserved."""
  interesting_reserved_words = []
  for word in reserved_words:
    tokens, errors = tokenizer.tokenize(word, "")
    assert tokens and not errors, "Failed to tokenize " + word
    if tokens[0].symbol in ["SnakeWord", "CamelWord", "ShoutyWord"]:
      interesting_reserved_words.append(word)
  return sorted(interesting_reserved_words)


def _format_token_rules(token_rules):
  """Formats a list of (pattern, symbol) pairs as a table."""
  pattern_width = max([len(rule[0]) for rule in token_rules])
  pattern_width += 2  # For the surrounding backquote (`) characters.
  result = ["{pat_header:{width}} | Symbol\n"
            "{empty:-<{width}} | {empty:-<30}\n".format(pat_header="Pattern",
                                                        width=pattern_width,
                                                        empty="")]
  for rule in token_rules:
    if rule[1]:
      symbol_name = "`" + rule[1] + "`"
    else:
      symbol_name = "*no symbol emitted*"
    result.append(
        "{pattern:{width}} | {symbol}\n".format(pattern="`" + rule[0] + "`",
                                                width=pattern_width,
                                                symbol=symbol_name))
  return "".join(result)


def _format_keyword_list(reserved_words):
  """Formats a list of reserved words."""
  lines = []
  current_line = ""
  for word in reserved_words:
    if len(current_line) + len(word) + 2 > 80:
      lines.append(current_line)
      current_line = ""
    current_line += "`{}` ".format(word)
  if current_line:
    # Flush the final, partially-filled line; the loop above only emits a
    # line when the next word would overflow it.
    lines.append(current_line)
  return "".join([line[:-1] + "\n" for line in lines])


def generate_grammar_md():
  """Generates up-to-date text for grammar.md."""
  main_productions, boilerplate_productions = _sort_productions(
      module_ir.PRODUCTIONS, module_ir.START_SYMBOL)
  result = [_HEADER, _format_productions(main_productions),
            _BOILERPLATE_PRODUCTION_HEADER,
            _format_productions(boilerplate_productions)]

  main_tokens = _normalize_literal_patterns(tokenizer.LITERAL_TOKEN_PATTERNS)
  main_tokens += _normalize_regex_patterns(tokenizer.REGEX_TOKEN_PATTERNS)
  result.append(_TOKENIZER_RULE_HEADER)
  result.append(_format_token_rules(main_tokens))

  reserved_words = _normalize_reserved_word_list(
      constraints.get_reserved_word_list())
  result.append(_KEYWORDS_HEADER.format(len(reserved_words)))
  result.append(_format_keyword_list(reserved_words))

  return "".join(result)


def main(argv):
  del argv  # Unused.
  print(generate_grammar_md(), end="")
  return 0


if __name__ == "__main__":
  sys.exit(main(sys.argv))