diff --git a/lark/exceptions.py b/lark/exceptions.py index 9fee01e..73b206a 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -75,7 +75,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.column = getattr(token, 'column', '?') self.considered_rules = considered_rules self.state = state - self.pos_in_stream = token.pos_in_stream + self.pos_in_stream = getattr(token, 'pos_in_stream', None) message = ("Unexpected token %r at line %s, column %s.\n" "Expected: %s\n" diff --git a/lark/lark.py b/lark/lark.py index 1841326..7bfe63a 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -157,9 +157,9 @@ class Lark: self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF - tokens, self.rules, self.ignore_tokens = self.grammar.compile() + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() - self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) + self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) if self.options.parser: self.parser = self._build_parser() diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 01739a8..6da2df0 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -448,8 +448,10 @@ class Grammar: self.ignore = ignore def compile(self): - token_defs = list(self.token_defs) - rule_defs = self.rule_defs + # We change the trees in-place (to support huge grammars) + # So deepcopy allows calling compile more than once. + token_defs = deepcopy(list(self.token_defs)) + rule_defs = deepcopy(self.rule_defs) # ================= # Compile Tokens diff --git a/lark/reconstruct.py b/lark/reconstruct.py index aafc5b3..3253a0f 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -67,38 +67,42 @@ class MakeMatchTree: class Reconstructor: def __init__(self, parser): - # Recreate the rules to assume a standard lexer - _tokens, rules, _grammar_extra = parser.grammar.compile() + # XXX TODO calling compile twice returns different results! + tokens, rules, _grammar_extra = parser.grammar.compile() - expand1s = {r.origin for r in parser.rules if r.options and r.options.expand1} + self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}) + self.rules = list(self._build_recons_rules(rules)) - d = defaultdict(list) + def _build_recons_rules(self, rules): + expand1s = {r.origin for r in rules if r.options and r.options.expand1} + + aliases = defaultdict(list) for r in rules: - # Rules can match their alias if r.alias: - alias = NonTerminal(r.alias) - d[alias].append(r.expansion) - d[r.origin].append([alias]) - else: - d[r.origin].append(r.expansion) + aliases[r.origin].append( r.alias ) - # Expanded rules can match their own terminal - for sym in r.expansion: - if sym in expand1s: - d[sym].append([Terminal(sym.name)]) + rule_names = {r.origin for r in rules} + nonterminals = {sym for sym in rule_names + if sym.name.startswith('_') or sym in expand1s or sym in aliases } + + for r in rules: + recons_exp = [sym if sym in nonterminals else Terminal(sym.name) + for sym in r.expansion if not is_discarded_terminal(sym)] - reduced_rules = defaultdict(list) - for name, expansions in d.items(): - for expansion in expansions: - reduced = [sym if sym.name.startswith('_') or sym in expand1s else Terminal(sym.name) - for sym in expansion if not is_discarded_terminal(sym)] + # Skip self-recursive constructs + if recons_exp == [r.origin]: + continue - reduced_rules[name, tuple(reduced)].append(expansion) + sym = NonTerminal(r.alias) if r.alias else r.origin - self.rules = [Rule(name, list(reduced), MakeMatchTree(name.name, expansions[0]), None) - for (name, reduced), expansions in reduced_rules.items()] + yield Rule(sym, recons_exp, MakeMatchTree(sym.name, r.expansion)) - self.write_tokens = WriteTokensTransformer({t.name:t for t in _tokens}) + for origin, rule_aliases in aliases.items(): + for alias in rule_aliases: + yield Rule(origin, [Terminal(alias)], MakeMatchTree(origin.name, [NonTerminal(alias)])) + + yield Rule(origin, [Terminal(origin.name)], MakeMatchTree(origin.name, [origin])) + def _match(self, term, token): diff --git a/tests/__main__.py b/tests/__main__.py index 5a30a4e..051ead6 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -5,6 +5,7 @@ import logging from .test_trees import TestTrees from .test_tools import TestStandalone +from .test_reconstructor import TestReconstructor try: from .test_nearley.test_nearley import TestNearley diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py new file mode 100644 index 0000000..ec264a0 --- /dev/null +++ b/tests/test_reconstructor.py @@ -0,0 +1,116 @@ +import json +import unittest +from unittest import TestCase +from lark import Lark +from lark.reconstruct import Reconstructor + + +common = """ +%import common (WS_INLINE, NUMBER, WORD) +%ignore WS_INLINE +""" + +def _remove_ws(s): + return s.replace(' ', '').replace('\n','') + +class TestReconstructor(TestCase): + + def assert_reconstruct(self, grammar, code): + parser = Lark(grammar, parser='lalr') + tree = parser.parse(code) + new = Reconstructor(parser).reconstruct(tree) + self.assertEqual(_remove_ws(code), _remove_ws(new)) + + def test_starred_rule(self): + + g = """ + start: item* + item: NL + | rule + rule: WORD ":" NUMBER + NL: /(\\r?\\n)+\s*/ + """ + common + + code = """ + Elephants: 12 + """ + + self.assert_reconstruct(g, code) + + def test_starred_group(self): + + g = """ + start: (rule | _NL)* + rule: WORD ":" NUMBER + _NL: /(\\r?\\n)+\s*/ + """ + common + + code = """ + Elephants: 12 + """ + + self.assert_reconstruct(g, code) + + def test_alias(self): + + g = """ + start: line* + line: NL + | rule + | "hello" -> hi + rule: WORD ":" NUMBER + NL: /(\\r?\\n)+\s*/ + """ + common + + code = """ + Elephants: 12 + hello + """ + + self.assert_reconstruct(g, code) + + def test_json_example(self): + test_json = ''' + { + "empty_object" : {}, + "empty_array" : [], + "booleans" : { "YES" : true, "NO" : false }, + "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ], + "strings" : [ "This", [ "And" , "That", "And a \\"b" ] ], + "nothing" : null + } + ''' + + json_grammar = r""" + ?start: value + + ?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + + array : "[" [value ("," value)*] "]" + object : "{" [pair ("," pair)*] "}" + pair : string ":" value + + string : ESCAPED_STRING + + %import common.ESCAPED_STRING + %import common.SIGNED_NUMBER + %import common.WS + + %ignore WS + """ + + json_parser = Lark(json_grammar, parser='lalr') + tree = json_parser.parse(test_json) + + new_json = Reconstructor(json_parser).reconstruct(tree) + self.assertEqual(json.loads(new_json), json.loads(test_json)) + + +if __name__ == '__main__': + unittest.main()