From 9e545f88253824f0afbdc41aa816599bf624be71 Mon Sep 17 00:00:00 2001
From: MegaIng1
Date: Sat, 26 Dec 2020 00:14:26 +0100
Subject: [PATCH] Basics for GrammarBuilder

---
 lark/lark.py         |  83 +++++++++--------
 lark/load_grammar.py | 212 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 216 insertions(+), 79 deletions(-)

diff --git a/lark/lark.py b/lark/lark.py
index 3e0a51f..b1b9270 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -7,7 +7,7 @@ import tempfile
 from warnings import warn
 
 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
-from .load_grammar import load_grammar, FromPackageLoader
+from .load_grammar import load_grammar, FromPackageLoader, Grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
 
@@ -234,42 +234,50 @@ class Lark(Serialize):
         else:
             grammar = read()
 
-        assert isinstance(grammar, STRING_TYPE)
-        self.source_grammar = grammar
-        if self.options.use_bytes:
-            if not isascii(grammar):
-                raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
-            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
-                raise ConfigurationError("`use_bytes=True` may have issues on python2."
-                                         "Use `use_bytes='force'` to use it at your own risk.")
-
         cache_fn = None
-        if self.options.cache:
-            if self.options.parser != 'lalr':
-                raise ConfigurationError("cache only works with parser='lalr' for now")
-            if isinstance(self.options.cache, STRING_TYPE):
-                cache_fn = self.options.cache
-            else:
-                if self.options.cache is not True:
-                    raise ConfigurationError("cache argument must be bool or str")
-                unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
-                from . import __version__
-                options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
-                s = grammar + options_str + __version__
-                md5 = hashlib.md5(s.encode()).hexdigest()
-                cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5
-
-            if FS.exists(cache_fn):
-                logger.debug('Loading grammar from cache: %s', cache_fn)
-                # Remove options that aren't relevant for loading from cache
-                for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
-                    del options[name]
-                with FS.open(cache_fn, 'rb') as f:
-                    try:
-                        self._load(f, **options)
-                    except Exception:
-                        raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
-                return
+        if isinstance(grammar, STRING_TYPE):
+            self.source_grammar = grammar
+            if self.options.use_bytes:
+                if not isascii(grammar):
+                    raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
+                if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
+                    raise ConfigurationError("`use_bytes=True` may have issues on python2. "
+                                             "Use `use_bytes='force'` to use it at your own risk.")
+
+            if self.options.cache:
+                if self.options.parser != 'lalr':
+                    raise ConfigurationError("cache only works with parser='lalr' for now")
+                if isinstance(self.options.cache, STRING_TYPE):
+                    cache_fn = self.options.cache
+                else:
+                    if self.options.cache is not True:
+                        raise ConfigurationError("cache argument must be bool or str")
+                    unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
+                    from . import __version__
+                    options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
+                    s = grammar + options_str + __version__
+                    md5 = hashlib.md5(s.encode()).hexdigest()
+                    cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5
+
+                if FS.exists(cache_fn):
+                    logger.debug('Loading grammar from cache: %s', cache_fn)
+                    # Remove options that aren't relevant for loading from cache
+                    for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
+                        del options[name]
+                    with FS.open(cache_fn, 'rb') as f:
+                        try:
+                            self._load(f, **options)
+                        except Exception:
+                            raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
+                    return
+
+            # Parse the grammar file and compose the grammars
+            self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
+        else:
+            assert isinstance(grammar, Grammar)
+            self.grammar = grammar
+
 
         if self.options.lexer == 'auto':
             if self.options.parser == 'lalr':
@@ -301,9 +309,6 @@ class Lark(Serialize):
             if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
                 raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))
 
-        # Parse the grammar file and compose the grammars
-        self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
-
         if self.options.postlex is not None:
             terminals_to_keep = set(self.options.postlex.always_accept)
         else:
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 0915c33..78f8fee 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -302,15 +302,6 @@ class RuleTreeToText(Transformer):
         return expansion, alias.value
 
 
-@inline_args
-class CanonizeTree(Transformer_InPlace):
-    def tokenmods(self, *args):
-        if len(args) == 1:
-            return list(args)
-        tokenmods, value = args
-        return tokenmods + [value]
-
-
 class PrepareAnonTerminals(Transformer_InPlace):
     """Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"""
@@ -871,8 +862,26 @@ def extend_expansions(tree, new):
         tree.children.insert(0, new)
 
 
-class GrammarLoader:
-    ERRORS = [
+
+def _grammar_parser():
+    try:
+        return _grammar_parser.cache
+    except AttributeError:
+        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
+
+        rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
+        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
+                 for r, _p, xs, o in rules for i, x in enumerate(xs)]
+        callback = ParseTreeBuilder(rules, ST).create_callback()
+        import re
+        lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
+        parser_conf = ParserConf(rules, callback, ['start'])
+        lexer_conf.lexer_type = 'standard'
+        parser_conf.parser_type = 'lalr'
+        _grammar_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {})
+        return _grammar_parser.cache
+
+_GRAMMAR_ERRORS = [
         ('Unclosed parenthesis', ['a: (\n']),
         ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
         ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
@@ -886,21 +895,39 @@ class GrammarLoader:
         ('%ignore expects a value', ['%ignore %import\n']),
     ]
 
-    def __init__(self, global_keep_all_tokens):
-        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
+def _parse_grammar(text, name, start='start'):
+    try:
+        return _grammar_parser().parse(text + '\n', start)
+    except UnexpectedCharacters as e:
+        context = e.get_context(text)
+        raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
+                           (e.line, e.column, name, context))
+    except UnexpectedToken as e:
+        context = e.get_context(text)
+        error = e.match_examples(_grammar_parser().parse, _GRAMMAR_ERRORS, use_accepts=True)
+        if error:
+            raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
+        elif 'STRING' in e.expected:
+            raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
+        raise
 
-        rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
-        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
-                 for r, _p, xs, o in rules for i, x in enumerate(xs)]
-        callback = ParseTreeBuilder(rules, ST).create_callback()
-        import re
-        lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
-        parser_conf = ParserConf(rules, callback, ['start'])
-        lexer_conf.lexer_type = 'standard'
-        parser_conf.parser_type = 'lalr'
-        self.parser = ParsingFrontend(lexer_conf, parser_conf, {})
 
-        self.canonize_tree = CanonizeTree()
+class GrammarLoader:
+    ERRORS = _GRAMMAR_ERRORS
+
+    def __init__(self, global_keep_all_tokens=False):
         self.global_keep_all_tokens = global_keep_all_tokens
 
     def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
@@ -931,21 +958,7 @@ class GrammarLoader:
 
     def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
         """Parse grammar_text, verify, and create Grammar object. Display nice messages on error."""
-        try:
-            tree = self.canonize_tree.transform(self.parser.parse(grammar_text+'\n'))
-        except UnexpectedCharacters as e:
-            context = e.get_context(grammar_text)
-            raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
-                               (e.line, e.column, grammar_name, context))
-        except UnexpectedToken as e:
-            context = e.get_context(grammar_text)
-            error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True)
-            if error:
-                raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
-            elif 'STRING' in e.expected:
-                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
-            raise
-
+        tree = _parse_grammar(grammar_text, grammar_name)
         tree = PrepareGrammar().transform(tree)
 
         # Extract grammar items
@@ -1061,7 +1074,7 @@ class GrammarLoader:
                     raise GrammarError("Cannot override a nonexisting terminal: %s" % name)
                 term_defs.append(t)
 
-        # Extend the definition of rules
+        # Extend the definition of rules by adding new entries to the `expansions` node
         for r in extend_rules:
             name = r[0]
 
@@ -1162,5 +1175,124 @@ class GrammarLoader:
 
         return Grammar(rule_defs, term_defs, ignore_names)
 
 
+class GrammarBuilder:
+    def __init__(self, global_keep_all_tokens=False, import_paths=None):
+        self.global_keep_all_tokens = global_keep_all_tokens
+        self.import_paths = import_paths or []
+
+        self._term_defs = {}
+        self._rule_defs = {}
+        self._ignore_names = []
+
+    def define_term(self, name, exp, priority=1, override=False):
+        if (name in self._term_defs) ^ override:
+            if override:
+                raise GrammarError("Cannot override a nonexisting terminal: %s" % name)
+            else:
+                raise GrammarError("Terminal '%s' defined more than once" % name)
+        if name.startswith('__'):
+            raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+        self._term_defs[name] = (exp, priority)
+
+    def define_rule(self, name, params, exp, options, override=False):
+        if (name in self._rule_defs) ^ override:
+            if override:
+                raise GrammarError("Cannot override a nonexisting rule: %s" % name)
+            else:
+                raise GrammarError("Rule '%s' defined more than once" % name)
+        if name.startswith('__'):
+            raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+        self._rule_defs[name] = (params, exp, options)
+
+    def extend_term(self, name, exp, priority=1):
+        if name not in self._term_defs:
+            raise GrammarError("Can't extend terminal %s as it wasn't defined before" % name)
+        old_expansions = self._term_defs[name][0]
+        extend_expansions(old_expansions, exp)
+
+    def extend_rule(self, name, params, exp, options):
+        if name not in self._rule_defs:
+            raise GrammarError("Can't extend rule %s as it wasn't defined before" % name)
+        if params != self._rule_defs[name][0]:
+            raise GrammarError("Cannot extend templates with different parameters: %s" % name)
+        # TODO: think about what to do with RuleOptions
+        old_expansions = self._rule_defs[name][1]
+        extend_expansions(old_expansions, exp)
+
+    def ignore(self, exp_or_name):
+        if isinstance(exp_or_name, str):
+            self._ignore_names.append(exp_or_name)
+        else:
+            assert isinstance(exp_or_name, Tree)
+            t = exp_or_name
+            if t.data=='expansions' and len(t.children) == 1:
+                t2 ,= t.children
+                if t2.data=='expansion' and len(t2.children) == 1:
+                    item ,= t2.children
+                    if item.data == 'value':
+                        item ,= item.children
+                        if isinstance(item, Token) and item.type == 'TERMINAL':
+                            self._ignore_names.append(item.value)
+                            return
+
+            name = '__IGNORE_%d' % len(self._ignore_names)
+            self._ignore_names.append(name)
+            self._term_defs[name] = (t, 1)
+
+    def declare(self, *names):
+        for name in names:
+            self.define_term(name, None, None)
+
+    def _unpack_term_def(self, tree):
+        name = tree.children[0].value
+        exp = tree.children[-1]
+        p = int(tree.children[1]) if len(tree.children) == 3 else 1
+        return name, exp, p
+
+    def _unpack_rule_def(self, tree):
+        # FIXME: A little pointless at the moment, but I want to rework this (e.g. move the work from `options_from_rule` to here)
+        r = options_from_rule(*tree.children)
+        return r
+
+    def load_grammar(self, grammar_text, grammar_source="<?>"):
+        tree = PrepareGrammar().transform(_parse_grammar(grammar_text, grammar_source))
+        for stmt in tree.children:
+            if stmt.data == 'term':
+                self.define_term(*self._unpack_term_def(stmt))
+                continue
+            elif stmt.data == 'rule':
+                self.define_rule(*self._unpack_rule_def(stmt))
+                continue
+            assert stmt.data == 'statement', stmt.data
+            stmt ,= stmt.children
+            if stmt.data == 'ignore':
+                self.ignore(*stmt.children)
+            elif stmt.data == 'declare':
+                self.declare(*(t.value for t in stmt.children))
+            elif stmt.data == 'override':
+                r ,= stmt.children
+                if r.data == 'rule':
+                    self.define_rule(*self._unpack_rule_def(r), override=True)
+                else:
+                    assert r.data == 'term'
+                    self.define_term(*self._unpack_term_def(r), override=True)
+            elif stmt.data == 'extend':
+                r ,= stmt.children
+                if r.data == 'rule':
+                    self.extend_rule(*self._unpack_rule_def(r))
+                else:
+                    assert r.data == 'term'
+                    self.extend_term(*self._unpack_term_def(r))
+            else:
+                assert False, stmt
+
+    def check(self):
+        pass
+
+    def build(self) -> Grammar:
+        return Grammar([(n, *r) for n, r in self._rule_defs.items()],
+                       [(n, t) for n, t in self._term_defs.items()],
+                       self._ignore_names)
+
+
 def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
     return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
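
A minimal usage sketch of the API this patch introduces (illustrative only, not
part of the commit; the grammar is invented). It assumes the patch as applied
above: GrammarBuilder lives in lark.load_grammar, and the Lark constructor now
accepts a Grammar instance in place of grammar text.

    from lark import Lark
    from lark.load_grammar import GrammarBuilder

    builder = GrammarBuilder()
    builder.load_grammar(r'''
    start: greeting NAME
    greeting: "hello" | "goodbye"
    NAME: /\w+/
    %ignore " "
    ''')

    # build() returns the same kind of Grammar object that load_grammar()
    # produces, so Lark's new isinstance(grammar, Grammar) branch accepts it.
    parser = Lark(builder.build(), parser='lalr')
    print(parser.parse("hello world").pretty())

Note that the cache machinery in lark.py only runs in the string branch, so a
programmatically built Grammar object skips it entirely.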
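A second sketch, also hypothetical: because definitions are held in plain dicts
until build() is called, several fragments can be composed before any parser
construction happens. A later load_grammar() call may use %extend, which the
statement dispatch above routes to extend_rule(), splicing the new alternative
into the expansions stored by an earlier call.

    from lark import Lark
    from lark.load_grammar import GrammarBuilder

    base = r'''
    start: expr
    expr: NUMBER "+" NUMBER
    NUMBER: /[0-9]+/
    %ignore " "
    '''

    # A separate fragment: '%extend expr' requires that `expr` was already
    # defined by a previous fragment, mirroring GrammarLoader's %extend check.
    extra = r'''
    %extend expr: NUMBER "-" NUMBER
    '''

    builder = GrammarBuilder()
    builder.load_grammar(base)
    builder.load_grammar(extra)

    parser = Lark(builder.build(), parser='lalr')
    print(parser.parse("1 - 2"))

extend_rule() also refuses to extend a template whose parameters differ, so
template fragments must agree on their parameter lists.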