# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15"""Generates a Markdown file documenting the raw Emboss grammar."""
16
17from __future__ import print_function
18
19import re
20import sys
21
22from compiler.front_end import constraints
23from compiler.front_end import module_ir
24from compiler.front_end import tokenizer
25
# Keep the output to at most 80 columns, so that the preformatted sections are
# not cut off.
_MAX_OUTPUT_WIDTH = 80

_HEADER = """
This is the context-free grammar for Emboss.  Terminal symbols are in `"quotes"`
or are named in `CamelCase`; nonterminal symbols are named in `snake_case`.  The
term `<empty>` to the right of the `->` indicates an empty production (a rule
where the left-hand side may be parsed from an empty string).

This listing is auto-generated from the grammar defined in `module_ir.py`.

Note that, unlike in many languages, comments are included in the grammar.  This
is so that comments can be handled more easily by the autoformatter; comments
are ignored by the compiler.  This is distinct from *documentation*, which is
included in the IR for use by documentation generators.

""".lstrip()

_BOILERPLATE_PRODUCTION_HEADER = """
The following productions are automatically generated to handle zero-or-more,
one-or-more, and zero-or-one repeated lists (`foo*`, `foo+`, and `foo?`
nonterminals) in LR(1).  They are included for completeness, but may be ignored
if you just want to understand the grammar.

"""

_TOKENIZER_RULE_HEADER = """
The following regexes are used to tokenize input into the corresponding symbols.
Note that the `Indent`, `Dedent`, and `EndOfLine` symbols are generated using
separate logic.

"""

_KEYWORDS_HEADER = """
The following {} keywords are reserved, but not used, by Emboss.  They may not
be used as field, type, or enum value names.

"""


def _sort_productions(productions, start_symbol):
  """Sorts the given productions in a human-friendly order."""
  productions_by_lhs = {}
  for p in productions:
    if p.lhs not in productions_by_lhs:
      productions_by_lhs[p.lhs] = set()
    productions_by_lhs[p.lhs].add(p)

  queue = [start_symbol]
  previously_queued_symbols = set(queue)
  main_production_list = []
  # This sorts productions depth-first.  I'm not sure if it is better to sort
  # them breadth-first or depth-first, or with some hybrid.
  while queue:
    symbol = queue.pop(-1)
    if symbol not in productions_by_lhs:
      continue
    for production in sorted(productions_by_lhs[symbol]):
      main_production_list.append(production)
      for rhs_symbol in production.rhs:
        # Skip boilerplate productions for now, but include their base
        # symbol's productions.
        if rhs_symbol and rhs_symbol[-1] in "*+?":
          rhs_symbol = rhs_symbol[0:-1]
        if rhs_symbol not in previously_queued_symbols:
          queue.append(rhs_symbol)
          previously_queued_symbols.add(rhs_symbol)

  # It's not particularly important to put boilerplate productions in any
  # particular order.
  boilerplate_production_list = sorted(
      set(productions) - set(main_production_list))
  for production in boilerplate_production_list:
    assert production.lhs[-1] in "*+?", "Found orphaned production {}".format(
        production.lhs)
  assert set(productions) == set(
      main_production_list + boilerplate_production_list)
  assert len(productions) == len(main_production_list) + len(
      boilerplate_production_list)
  return main_production_list, boilerplate_production_list


def _word_wrap_at_column(words, width):
  """Wraps words to the specified width, and returns a list of wrapped lines."""
  result = []
  in_progress = []
  for word in words:
    if len(" ".join(in_progress + [word])) > width:
      result.append(" ".join(in_progress))
      assert len(result[-1]) <= width
      in_progress = []
    in_progress.append(word)
  result.append(" ".join(in_progress))
  assert len(result[-1]) <= width
  return result


def _format_productions(productions):
  """Formats a list of productions for inclusion in a Markdown document."""
  max_lhs_len = max([len(production.lhs) for production in productions])

  # TODO(bolms): This highlighting is close for now, but not actually right.
  result = ["```shell\n"]
  last_lhs = None
  for production in productions:
    if last_lhs == production.lhs:
      lhs = ""
      delimiter = " |"
    else:
      lhs = production.lhs
      delimiter = "->"
    leader = "{lhs:{width}} {delimiter}".format(
        lhs=lhs,
        width=max_lhs_len,
        delimiter=delimiter)
    for rhs_block in _word_wrap_at_column(
        production.rhs or ["<empty>"], _MAX_OUTPUT_WIDTH - len(leader)):
      result.append("{leader} {rhs}\n".format(leader=leader, rhs=rhs_block))
      leader = " " * len(leader)
    last_lhs = production.lhs
  result.append("```\n")
  return "".join(result)


def _normalize_literal_patterns(literals):
  """Normalizes a list of strings to a list of (regex, symbol) pairs."""
  return [(re.sub(r"(\W)", r"\\\1", literal), '"' + literal + '"')
          for literal in literals]


def _normalize_regex_patterns(regexes):
  """Normalizes a list of tokenizer regexes to a list of (regex, symbol)."""
  # g3doc breaks up patterns containing '|' when they are inserted into a table,
  # unless they're preceded by '\'.  Note that other special characters,
  # including '\', should *not* be escaped with '\'.
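  # For example, a hypothetical token pattern "true|false" would be emitted
  # as "true\|false"; only '|' is escaped, per the note above.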
  return [(re.sub(r"\|", r"\\|", r.regex.pattern), r.symbol) for r in regexes]


def _normalize_reserved_word_list(reserved_words):
  """Returns words that would be allowed as names if they were not reserved."""
  interesting_reserved_words = []
  for word in reserved_words:
    tokens, errors = tokenizer.tokenize(word, "")
    assert tokens and not errors, "Failed to tokenize " + word
    if tokens[0].symbol in ["SnakeWord", "CamelWord", "ShoutyWord"]:
      interesting_reserved_words.append(word)
  return sorted(interesting_reserved_words)


def _format_token_rules(token_rules):
  """Formats a list of (pattern, symbol) pairs as a table."""
  pattern_width = max([len(rule[0]) for rule in token_rules])
  pattern_width += 2  # For the surrounding backquote (`) characters.
  result = ["{pat_header:{width}} | Symbol\n"
            "{empty:-<{width}} | {empty:-<30}\n".format(pat_header="Pattern",
                                                        width=pattern_width,
                                                        empty="")]
  for rule in token_rules:
    if rule[1]:
      symbol_name = "`" + rule[1] + "`"
    else:
      symbol_name = "*no symbol emitted*"
    result.append(
        "{pattern:{width}} | {symbol}\n".format(pattern="`" + rule[0] + "`",
                                                width=pattern_width,
                                                symbol=symbol_name))
  return "".join(result)


def _format_keyword_list(reserved_words):
  """Formats a list of reserved words."""
  lines = []
  current_line = ""
  for word in reserved_words:
    if len(current_line) + len(word) + 2 > 80:
      lines.append(current_line)
      current_line = ""
    current_line += "`{}` ".format(word)
  if current_line:
    # Flush the final, partially-filled line; the loop above only emits a
    # line when the next word would overflow it.
    lines.append(current_line)
  return "".join([line[:-1] + "\n" for line in lines])


def generate_grammar_md():
  """Generates up-to-date text for grammar.md."""
  main_productions, boilerplate_productions = _sort_productions(
      module_ir.PRODUCTIONS, module_ir.START_SYMBOL)
  result = [_HEADER, _format_productions(main_productions),
            _BOILERPLATE_PRODUCTION_HEADER,
            _format_productions(boilerplate_productions)]

  main_tokens = _normalize_literal_patterns(tokenizer.LITERAL_TOKEN_PATTERNS)
  main_tokens += _normalize_regex_patterns(tokenizer.REGEX_TOKEN_PATTERNS)
  result.append(_TOKENIZER_RULE_HEADER)
  result.append(_format_token_rules(main_tokens))

  reserved_words = _normalize_reserved_word_list(
      constraints.get_reserved_word_list())
  result.append(_KEYWORDS_HEADER.format(len(reserved_words)))
  result.append(_format_keyword_list(reserved_words))

  return "".join(result)


def main(argv):
  del argv  # Unused.
  print(generate_grammar_md(), end="")
  return 0


if __name__ == "__main__":
  sys.exit(main(sys.argv))