# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization for the Emboss definition language.

This module exports the tokenize function and various errors.

In addition, a couple of lists are exported for the use of
generate_grammar_md.py:

LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against
  input.

REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization.
  REGEX_TOKEN_PATTERNS[n].regex is an re.RegexObject
  (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern),
  and REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to
  tokens which match the pattern.
"""

import collections
import re

from compiler.util import error
from compiler.util import parser_types


def tokenize(text, file_name):
  # TODO(bolms): suppress end-of-line, indent, and dedent tokens between
  # matched delimiters ([], (), and {}).
  """Tokenizes its argument.

  Arguments:
    text: The raw text of a .emb file.
    file_name: The name of the file to use in errors.

  Returns:
    A tuple of:
      a list of parser_types.Tokens or None
      a possibly-empty list of errors.
  """
  tokens = []
  indent_stack = [""]
  line_number = 0
  for line in text.splitlines():
    line_number += 1

    # _tokenize_line splits the actual text into tokens.
    line_tokens, errors = _tokenize_line(line, line_number, file_name)
    if errors:
      return None, errors

    # Lines with only whitespace and comments are not used for Indent/Dedent
    # calculation, and do not produce end-of-line tokens.
    for token in line_tokens:
      if token.symbol != "Comment":
        break
    else:
      tokens.extend(line_tokens)
      tokens.append(parser_types.Token(
          '"\\n"', "\n", parser_types.make_location(
              (line_number, len(line) + 1), (line_number, len(line) + 1))))
      continue

    # Leading whitespace is whatever .lstrip() removes.
    leading_whitespace = line[0:len(line) - len(line.lstrip())]
    if leading_whitespace == indent_stack[-1]:
      # If the current leading whitespace is equal to the last leading
      # whitespace, do not emit an Indent or Dedent token.
      pass
    elif leading_whitespace.startswith(indent_stack[-1]):
      # If the current leading whitespace is longer than the last leading
      # whitespace, emit an Indent token.  For the token text, take the new
      # part of the whitespace.
      tokens.append(
          parser_types.Token(
              "Indent", leading_whitespace[len(indent_stack[-1]):],
              parser_types.make_location(
                  (line_number, len(indent_stack[-1]) + 1),
                  (line_number, len(leading_whitespace) + 1))))
      indent_stack.append(leading_whitespace)
    else:
      # Otherwise, search for the unclosed indentation level that matches
      # the current indentation level.  Emit a Dedent token for each
      # newly-closed indentation level.
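      #
      # For example (an illustrative sketch): if indent_stack is
      # ["", "  ", "    "] and the new line is indented by two spaces, the
      # loop below emits a single Dedent for the four-space level and leaves
      # the stack as ["", "  "]; indentation that matches no open level is
      # reported as "Bad indentation".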
      for i in range(len(indent_stack) - 1, -1, -1):
        if leading_whitespace == indent_stack[i]:
          break
        tokens.append(
            parser_types.Token("Dedent", "", parser_types.make_location(
                (line_number, len(leading_whitespace) + 1),
                (line_number, len(leading_whitespace) + 1))))
        del indent_stack[i]
      else:
        return None, [[error.error(
            file_name, parser_types.make_location(
                (line_number, 1), (line_number, len(leading_whitespace) + 1)),
            "Bad indentation")]]

    tokens.extend(line_tokens)

    # Append an end-of-line token (for non-whitespace lines).
    tokens.append(parser_types.Token(
        '"\\n"', "\n", parser_types.make_location(
            (line_number, len(line) + 1), (line_number, len(line) + 1))))
  for i in range(len(indent_stack) - 1):
    tokens.append(parser_types.Token(
        "Dedent", "", parser_types.make_location(
            (line_number + 1, 1), (line_number + 1, 1))))
  return tokens, []


# Token patterns used by _tokenize_line.
LITERAL_TOKEN_PATTERNS = (
    "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
    "$static_size_in_bits $is_statically_sized "
    "$max $present $upper_bound $lower_bound $next "
    "$size_in_bits $size_in_bytes "
    "$max_size_in_bits $max_size_in_bytes "
    "$min_size_in_bits $min_size_in_bytes "
    "$default struct bits enum external import as if let").split()
_T = collections.namedtuple("T", ["regex", "symbol"])
REGEX_TOKEN_PATTERNS = [
    # Words starting with variations of "emboss reserved" are reserved for
    # internal use by the Emboss compiler.
    _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
    _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
    _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
    _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
    _T(re.compile("[0-9]+"), "Number"),
    _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
    _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
    _T(re.compile("0b[01]+"), "Number"),
    _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
    _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
    _T(re.compile("true|false"), "BooleanConstant"),
    _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
    # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
    # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords.  A
    # ShoutyWord must start with an upper case letter and contain at least one
    # more upper case letter or '_'.
    _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
    # A CamelWord starts with A-Z and contains at least one a-z, and no '_'.
    _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
    _T(re.compile("-- .*"), "Documentation"),
    _T(re.compile("--$"), "Documentation"),
    _T(re.compile("--.*"), "BadDocumentation"),
    _T(re.compile(r"\s+"), None),
    _T(re.compile("#.*"), "Comment"),
    # BadWord and BadNumber are catch-alls for words and numbers, so that
    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
    #
    # This is preferable to returning an error because the BadWord and
    # BadNumber token types can be used in example-based errors.
    _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
    _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
]
del _T


def _tokenize_line(line, line_number, file_name):
  """Tokenizes a single line of input.

  Arguments:
    line: The line of text to tokenize.
    line_number: The line number (used when constructing token objects).
    file_name: The name of a file to use in errors.

  Returns:
    A tuple of:
      A list of token objects or None.
      A possibly-empty list of errors.
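
  For example (an illustrative sketch; the exact token fields come from
  parser_types.Token): the line 'struct Foo:' produces tokens with symbols
  '"struct"', 'CamelWord', and '":"'; whitespace matches a pattern with no
  symbol and so produces no token.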
""" tokens = [] offset = 0 while offset < len(line): best_candidate = "" best_candidate_symbol = None # Find the longest match. Ties go to the first match. This way, keywords # ("struct") are matched as themselves, but words that only happen to start # with keywords ("structure") are matched as words. # # There is never a reason to try to match a literal after a regex that # could also match that literal, so check literals first. for literal in LITERAL_TOKEN_PATTERNS: if line[offset:].startswith(literal) and len(literal) > len( best_candidate): best_candidate = literal # For Emboss, the name of a literal token is just the literal in quotes, # so that the grammar can read a little more naturally, e.g.: # # expression -> expression "+" expression # # instead of # # expression -> expression Plus expression best_candidate_symbol = '"' + literal + '"' for pattern in REGEX_TOKEN_PATTERNS: match_result = pattern.regex.match(line[offset:]) if match_result and len(match_result.group(0)) > len(best_candidate): best_candidate = match_result.group(0) best_candidate_symbol = pattern.symbol if not best_candidate: return None, [[error.error( file_name, parser_types.make_location( (line_number, offset + 1), (line_number, offset + 2)), "Unrecognized token")]] if best_candidate_symbol: tokens.append(parser_types.Token( best_candidate_symbol, best_candidate, parser_types.make_location( (line_number, offset + 1), (line_number, offset + len(best_candidate) + 1)))) offset += len(best_candidate) return tokens, None