From da1910f5b67b56528974aa9996abd46a103a37f2 Mon Sep 17 00:00:00 2001
From: Erez Shinan <erezshin@gmail.com>
Date: Tue, 9 Jan 2018 21:08:40 +0200
Subject: [PATCH] More refactoring towards standalone

---
 lark/common.py              |   9 ++-
 lark/lark.py                |   4 +-
 lark/lexer.py               | 113 ++++++++++++++++++------------------
 lark/load_grammar.py        |  42 +++++++-------
 lark/parse_tree_builder.py  |   6 +-
 lark/parsers/lalr_parser.py |   6 +-
 lark/tree.py                |   4 +-
 7 files changed, 96 insertions(+), 88 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index 800aa4f..ff1897a 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -4,12 +4,18 @@ import sys
 
 Py36 = (sys.version_info[:2] >= (3, 6))
 
+
+###{standalone
+def is_terminal(sym):
+    return sym.isupper()
+
 class GrammarError(Exception):
     pass
 
 class ParseError(Exception):
     pass
+###}
 
 class UnexpectedToken(ParseError):
     def __init__(self, token, expected, seq, index):
@@ -32,9 +38,6 @@ class UnexpectedToken(ParseError):
 
 
 
-def is_terminal(sym):
-    return sym.isupper()
-
 class LexerConf:
     def __init__(self, tokens, ignore=(), postlex=None):

diff --git a/lark/lark.py b/lark/lark.py
index a7af772..58a6ff7 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -166,8 +166,8 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
 
-        self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
-        callback = self.parse_tree_builder.apply(self.options.transformer)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+        callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):
                 if not (f.startswith('__') and f.endswith('__')):
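Note on the new markers: the `###{standalone` ... `###}` pairs added above (and in lexer.py, parse_tree_builder.py, lalr_parser.py and tree.py below) fence off the code that a standalone-parser generator can later copy into a single self-contained module. The patch only plants the markers; the extraction side isn't shown here. A minimal sketch of what a consumer could look like (the function name and the dict-of-sections shape are assumptions, not part of this patch):

    def extract_sections(lines):
        """Collect the bodies of ###{name ... ###} blocks, keyed by section name."""
        sections = {}
        current = None
        for line in lines:
            if line.startswith('###{'):
                current = line[4:].strip()          # e.g. 'standalone'
                sections.setdefault(current, [])
            elif line.startswith('###}'):
                current = None
            elif current is not None:
                sections[current].append(line)
        return {name: ''.join(body) for name, body in sections.items()}

    # Usage: print the standalone-able part of the lexer module.
    with open('lark/lexer.py') as f:
        print(extract_sections(f)['standalone'])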
diff --git a/lark/lexer.py b/lark/lexer.py
index 5ca77de..4f673f6 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -5,6 +5,7 @@ import re
 from .utils import Str, classify
 from .common import is_terminal, PatternStr, PatternRE, TokenDef
 
+###{standalone
 class LexError(Exception):
     pass
 
@@ -48,10 +49,60 @@ class Token(Str):
 
     __hash__ = Str.__hash__
 
-class Regex:
-    def __init__(self, pattern, flags=()):
-        self.pattern = pattern
-        self.flags = flags
+
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+    def __init__(self, lexer):
+        self.lexer = lexer
+
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+
+        while True:
+            lexer = self.lexer
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, line_ctr.char_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        lexer = yield t
+
+                    line_ctr.feed(value, type_ in newline_types)
+                    break
+            else:
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                break
+###}
+
 
 def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
@@ -182,57 +233,3 @@ class ContextualLexer:
 
             l.lexer = self.lexers[self.parser_state]
 
-###{lexer
-
-class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
-        self.char_pos = 0
-        self.line = 1
-        self.column = 0
-        self.line_start_pos = 0
-
-    def feed(self, token, test_newline=True):
-        """Consume a token and calculate the new line & column.
-
-        As an optional optimization, set test_newline=False if token doesn't contain a newline.
-        """
-        if test_newline:
-            newlines = token.count(self.newline_char)
-            if newlines:
-                self.line += newlines
-                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
-
-        self.char_pos += len(token)
-        self.column = self.char_pos - self.line_start_pos
-
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer):
-        self.lexer = lexer
-
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = list(newline_types)
-        ignore_types = list(ignore_types)
-        line_ctr = LineCounter()
-
-        while True:
-            lexer = self.lexer
-            for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, line_ctr.char_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    if type_ not in ignore_types:
-                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                        if t.type in lexer.callback:
-                            t = lexer.callback[t.type](t)
-                        lexer = yield t
-
-                    line_ctr.feed(value, type_ in newline_types)
-                    break
-            else:
-                if line_ctr.char_pos < len(stream):
-                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                break
-###}
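To see the relocated lexer loop in action, here is a small driver. It stubs the only two attributes `_Lex.lex` reads from its lexer: `mres`, a list of (compiled master regex, lastindex-to-token-type dict) pairs, and `callback`. This is a sketch against private names exactly as they appear in this patch, so treat the signatures as assumptions rather than stable API:

    import re
    from types import SimpleNamespace
    from lark.lexer import _Lex

    # One master regex; m.lastindex identifies which named group (token type) matched.
    mre = re.compile(r'(?P<NUM>[0-9]+)|(?P<NL>\n)|(?P<WS>[ \t]+)')
    type_from_index = {i: name for name, i in mre.groupindex.items()}
    stub_lexer = SimpleNamespace(mres=[(mre, type_from_index)], callback={})

    # WS tokens are suppressed; NL tokens are yielded and also advance the LineCounter.
    for tok in _Lex(stub_lexer).lex('12 34\n5', newline_types=['NL'], ignore_types=['WS']):
        print(tok.type, repr(str(tok)), tok.line, tok.column)

Note that the `while True` loop re-reads `self.lexer` on every token; that is the hook ContextualLexer relies on (`l.lexer = self.lexers[self.parser_state]` in the context above) to switch token sets as the parser state changes.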
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index b38a67c..2086591 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -128,7 +128,7 @@ RULES = {
 
 class EBNF_to_BNF(InlineTransformer):
     def __init__(self):
-        self.new_rules = {}
+        self.new_rules = []
         self.rules_by_expr = {}
         self.prefix = 'anon'
         self.i = 0
@@ -141,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer):
         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
-        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
+        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        self.new_rules.append((new_name, tree, self.rule_options))
         self.rules_by_expr[expr] = t
         return t
 
@@ -390,12 +391,6 @@ def _interleave(l, item):
 def _choice_of_rules(rules):
     return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
 
-def dict_update_safe(d1, d2):
-    for k, v in d2.items():
-        assert k not in d1
-        d1[k] = v
-
-
 class Grammar:
     def __init__(self, rule_defs, token_defs, ignore):
         self.token_defs = token_defs
@@ -468,38 +463,41 @@ class Grammar:
         # =================
         #  Compile Rules
         # =================
-        ebnf_to_bnf = EBNF_to_BNF()
-        simplify_rule = SimplifyRule_Visitor()
 
+        # 1. Pre-process terminals
         transformer = PrepareLiterals()
         if not lexer:
             transformer *= SplitLiterals()
         transformer *= ExtractAnonTokens(tokens)   # Adds to tokens
 
-        rules = {}
+        # 2. Convert EBNF to BNF (and apply step 1)
+        ebnf_to_bnf = EBNF_to_BNF()
+        rules = []
         for name, rule_tree, options in rule_defs:
-            assert name not in rules, name
             ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
             tree = transformer.transform(rule_tree)
-            rules[name] = ebnf_to_bnf.transform(tree), options
+            rules.append((name, ebnf_to_bnf.transform(tree), options))
+        rules += ebnf_to_bnf.new_rules
 
-        dict_update_safe(rules, ebnf_to_bnf.new_rules)
+        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
 
+        # 3. Compile tree to Rule objects
         rule_tree_to_text = RuleTreeToText()
-        new_rules = []
-        for origin, (tree, options) in rules.items():
+        simplify_rule = SimplifyRule_Visitor()
+        compiled_rules = []
+        for name, tree, options in rules:
             simplify_rule.visit(tree)
             expansions = rule_tree_to_text.transform(tree)
 
             for expansion, alias in expansions:
-                if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                if alias and name.startswith('_'):
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
 
-                rule = Rule(origin, expansion, alias, options)
-                new_rules.append(rule)
+                rule = Rule(name, expansion, alias, options)
+                compiled_rules.append(rule)
 
-        return tokens, new_rules, self.ignore
+        return tokens, compiled_rules, self.ignore
 
 
@@ -557,7 +555,7 @@ class GrammarLoader:
 
         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
-        callback = ParseTreeBuilder(rules, T).apply()
+        callback = ParseTreeBuilder(rules, T).create_callback()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')
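Two things change shape here: `new_rules` becomes an ordered list of (name, tree, options) tuples instead of a dict, which keeps generated rules in creation order (useful for deterministic code generation, and presumably why the `dict_update_safe` helper could be replaced by a plain uniqueness assert), and the compile pipeline is now annotated as three explicit steps. For intuition, this hand-built value mirrors what the method in the second hunk appends when it expands a repetition such as `expr+` into a left-recursive helper rule; the helper name below is a made-up instance of the `'__%s_%s_%d'` naming template:

    from lark.tree import Tree as T
    from lark.lexer import Token

    # Grammar-level meaning of the generated tree:
    #   __anon_plus_0 : expr                  (one repetition)
    #                 | __anon_plus_0 expr    (one more repetition)
    expr = Token('RULE', 'expr', -1)
    t = Token('RULE', '__anon_plus_0', -1)
    tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
    new_rules = [('__anon_plus_0', tree, None)]   # (name, tree, options), in creation order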
erezshin@gmail.com
 
-from ..common import ParseError, UnexpectedToken
+from ..common import UnexpectedToken
 
 from .lalr_analysis import LALR_Analyzer, Shift
 
@@ -20,6 +20,8 @@ class Parser:
         self.parser = _Parser(analysis.parse_table, callbacks)
         self.parse = self.parser.parse
 
+###{standalone
+
 class _Parser:
     def __init__(self, parse_table, callbacks):
         self.states = parse_table.states
@@ -90,3 +92,5 @@ class _Parser:
             return val
         else:
             reduce(arg)
+
+###}

diff --git a/lark/tree.py b/lark/tree.py
index f832857..1639bb1 100644
--- a/lark/tree.py
+++ b/lark/tree.py
@@ -7,6 +7,7 @@ from copy import deepcopy
 
 from .utils import inline_args
 
+###{standalone
 class Tree(object):
     def __init__(self, data, children):
         self.data = data
@@ -33,6 +34,7 @@ class Tree(object):
 
     def pretty(self, indent_str='  '):
         return ''.join(self._pretty(0, indent_str))
+###}
 
     def expand_kids_by_index(self, *indices):
         for i in sorted(indices, reverse=True):   # reverse so that changing tail won't affect indices
@@ -138,7 +140,7 @@ class TransformerChain(object):
 
     def __mul__(self, other):
         return TransformerChain(*self.transformers + (other,))
 
-
+
 class InlineTransformer(Transformer):
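Finally, a migration note for the rename that threads through lark.py, load_grammar.py and parse_tree_builder.py: `ParseTreeBuilder.apply()` is now `ParseTreeBuilder.create_callback()`, with unchanged arguments, and Lark's `parse_tree_builder` attribute becomes the private `_parse_tree_builder`. Any external code that reached into these (they were arguably internal already) needs a one-line change, roughly:

    from lark.tree import Tree
    from lark.parse_tree_builder import ParseTreeBuilder

    def build_callback(rules, transformer=None):
        builder = ParseTreeBuilder(rules, Tree)
        # Before this patch: builder.apply(transformer)
        return builder.create_callback(transformer)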