From ff588714c1184fb24d7a4325006240bb7e89b26d Mon Sep 17 00:00:00 2001 From: Rob Rose Date: Tue, 27 Mar 2018 15:56:39 -0400 Subject: [PATCH 01/34] Changed Lark standalone file extension - Reasoning addressed in Issue #116 - Renamed example and grammar files to use the new extension. - Changed `.g` to the new `.lrk` extension in places where it is referenced. --- MANIFEST.in | 2 +- examples/{python2.g => python2.lrk} | 0 examples/{python3.g => python3.lrk} | 0 examples/python_parser.py | 4 ++-- examples/standalone/create_standalone.sh | 2 +- examples/standalone/{json.g => json.lrk} | 0 lark/grammars/{common.g => common.lrk} | 0 lark/load_grammar.py | 2 +- setup.py | 2 +- 9 files changed, 6 insertions(+), 6 deletions(-) rename examples/{python2.g => python2.lrk} (100%) rename examples/{python3.g => python3.lrk} (100%) rename examples/standalone/{json.g => json.lrk} (100%) rename lark/grammars/{common.g => common.lrk} (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 5ee4903..8288fd6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include README.md LICENSE docs/* examples/*.py examples/*.png examples/*.g tests/*.py tests/test_nearley/*.py tests/test_nearley/grammars/* +include README.md LICENSE docs/* examples/*.py examples/*.png examples/*.lrk tests/*.py tests/test_nearley/*.py tests/test_nearley/grammars/* diff --git a/examples/python2.g b/examples/python2.lrk similarity index 100% rename from examples/python2.g rename to examples/python2.lrk diff --git a/examples/python3.g b/examples/python3.lrk similarity index 100% rename from examples/python3.g rename to examples/python3.lrk diff --git a/examples/python_parser.py b/examples/python_parser.py index d953a79..d14dacc 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -21,8 +21,8 @@ class PythonIndenter(Indenter): tab_len = 8 -grammar2_filename = os.path.join(__path__, 'python2.g') -grammar3_filename = os.path.join(__path__, 'python3.g') +grammar2_filename = os.path.join(__path__, 'python2.lrk') +grammar3_filename = os.path.join(__path__, 'python3.lrk') with open(grammar2_filename) as f: python_parser2 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input') with open(grammar3_filename) as f: diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh index 1eba3a4..f5001fe 100755 --- a/examples/standalone/create_standalone.sh +++ b/examples/standalone/create_standalone.sh @@ -1 +1 @@ -python -m lark.tools.standalone json.g > json_parser.py +python -m lark.tools.standalone json.lrk > json_parser.py diff --git a/examples/standalone/json.g b/examples/standalone/json.lrk similarity index 100% rename from examples/standalone/json.g rename to examples/standalone/json.lrk diff --git a/lark/grammars/common.g b/lark/grammars/common.lrk similarity index 100% rename from lark/grammars/common.g rename to lark/grammars/common.lrk diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 1637514..cf74199 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -625,7 +625,7 @@ class GrammarLoader: elif stmt.data == 'import': dotted_path = stmt.children[0].children name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1] - grammar_path = os.path.join(*dotted_path[:-1]) + '.g' + grammar_path = os.path.join(*dotted_path[:-1]) + '.lrk' g = import_grammar(grammar_path) token_options = dict(g.token_defs)[dotted_path[-1]] assert isinstance(token_options, tuple) and len(token_options)==2 diff --git a/setup.py b/setup.py index 430ae5c..978b370 100644 ---
a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( requires = [], install_requires = [], - package_data = { '': ['*.md', '*.g'] }, + package_data = { '': ['*.md', '*.lrk'] }, test_suite = 'tests.__main__', From f960c1b8ac0fb77d821ac2c1462910992fdc74e4 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 13 Apr 2018 00:40:28 +0300 Subject: [PATCH 02/34] Initial: Added transformers.py, and Meta to tree --- lark/lexer.py | 3 ++ lark/load_grammar.py | 17 +++---- lark/parse_tree_builder.py | 21 ++++++--- lark/transformers.py | 93 ++++++++++++++++++++++++++++++++++++++ lark/tree.py | 12 +++++ tests/test_parser.py | 2 +- 6 files changed, 133 insertions(+), 15 deletions(-) create mode 100644 lark/transformers.py diff --git a/lark/lexer.py b/lark/lexer.py index 0a46ee1..1c1b70a 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -34,6 +34,8 @@ class Token(Str): self.value = value self.line = line self.column = column + self.end_line = None + self.end_column = None return self @classmethod @@ -112,6 +114,7 @@ class _Lex: if t: t.end_line = line_ctr.line t.end_column = line_ctr.column + break else: if line_ctr.char_pos < len(stream): diff --git a/lark/load_grammar.py b/lark/load_grammar.py index a6b2d82..5813708 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -14,7 +14,8 @@ from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef from .grammar import RuleOptions, Rule -from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST +from .tree import Tree, Visitor, SlottedTree as ST +from .transformers import Transformer_Children, Transformer_ChildrenInline __path__ = os.path.dirname(__file__) IMPORT_PATHS = [os.path.join(__path__, 'grammars')] @@ -130,7 +131,7 @@ RULES = { } -class EBNF_to_BNF(InlineTransformer): +class EBNF_to_BNF(Transformer_ChildrenInline): def __init__(self): self.new_rules = [] self.rules_by_expr = {} @@ -226,7 +227,7 @@ class SimplifyRule_Visitor(Visitor): tree.children = list(set(tree.children)) -class RuleTreeToText(Transformer): +class RuleTreeToText(Transformer_Children): def expansions(self, x): return x def expansion(self, symbols): @@ -237,7 +238,7 @@ class RuleTreeToText(Transformer): return expansion, alias.value -class CanonizeTree(InlineTransformer): +class CanonizeTree(Transformer_ChildrenInline): def maybe(self, expr): return ST('expr', [expr, Token('OP', '?', -1)]) @@ -247,7 +248,7 @@ class CanonizeTree(InlineTransformer): tokenmods, value = args return tokenmods + [value] -class ExtractAnonTokens(InlineTransformer): +class ExtractAnonTokens(Transformer_ChildrenInline): "Create a unique list of anonymous tokens. 
Attempt to give meaningful names to them when we add them" def __init__(self, tokens): @@ -351,7 +352,7 @@ def _literal_to_pattern(literal): 'REGEXP': PatternRE }[literal.type](s, flags) -class PrepareLiterals(InlineTransformer): +class PrepareLiterals(Transformer_ChildrenInline): def literal(self, literal): return ST('pattern', [_literal_to_pattern(literal)]) @@ -363,13 +364,13 @@ class PrepareLiterals(InlineTransformer): regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) -class SplitLiterals(InlineTransformer): +class SplitLiterals(Transformer_ChildrenInline): def pattern(self, p): if isinstance(p, PatternStr) and len(p.value)>1: return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value]) return ST('pattern', [p]) -class TokenTreeToPattern(Transformer): +class TokenTreeToPattern(Transformer_Children): def pattern(self, ps): p ,= ps return p diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7c74178..1acfe2f 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -2,6 +2,7 @@ from .common import is_terminal, GrammarError from .utils import suppress from .lexer import Token from .grammar import Rule +from .tree import Tree ###{standalone from functools import partial @@ -38,15 +39,23 @@ class PropagatePositions: if children: for a in children: - with suppress(AttributeError): - res.line = a.line - res.column = a.column + if isinstance(a, Tree): + res.meta.line = a.meta.line + res.meta.column = a.meta.column + elif isinstance(a, Token): + res.meta.line = a.line + res.meta.column = a.column break for a in reversed(children): - with suppress(AttributeError): - res.end_line = a.end_line - res.end_column = a.end_column + # with suppress(AttributeError): + if isinstance(a, Tree): + res.meta.end_line = a.meta.end_line + res.meta.end_column = a.meta.end_column + elif isinstance(a, Token): + res.meta.end_line = a.end_line + res.meta.end_column = a.end_column + break return res diff --git a/lark/transformers.py b/lark/transformers.py new file mode 100644 index 0000000..b1938f0 --- /dev/null +++ b/lark/transformers.py @@ -0,0 +1,93 @@ +from functools import wraps + +from .tree import Tree + +class Discard(Exception): + pass + + +class Transformer: + def _get_userfunc(self, name): + return getattr(self, name) + + def _call_userfunc(self, tree): + # Assumes tree is already transformed + try: + f = self._get_userfunc(tree.data) + except AttributeError: + return self.__default__(tree) + else: + return f(tree) + + def _transform(self, tree): + children = [] + for c in tree.children: + try: + children.append(self._transform(c) if isinstance(c, Tree) else c) + except Discard: + pass + + tree = Tree(tree.data, children) + return self._call_userfunc(tree) + + def __default__(self, tree): + return tree + + def transform(self, tree): + return self._transform(tree) + + def __mul__(self, other): + return TransformerChain(self, other) + +class Transformer_Children(Transformer): + def _call_userfunc(self, tree): + # Assumes tree is already transformed + try: + f = self._get_userfunc(tree.data) + except AttributeError: + return self.__default__(tree) + else: + return f(tree.children) + +class Transformer_ChildrenInline(Transformer): + def _call_userfunc(self, tree): + # Assumes tree is already transformed + try: + f = self._get_userfunc(tree.data) + except AttributeError: + return self.__default__(tree) + else: + return f(*tree.children) + + +class TransformerChain(object): + def __init__(self, *transformers): + 
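The three dispatch styles introduced above differ only in the shape of the user callback: `Transformer` receives the whole node, `Transformer_Children` receives the list of children, and `Transformer_ChildrenInline` receives the children unpacked as positional arguments. A minimal usage sketch, assuming this patch is applied (the `add` rule and the hand-built tree are hypothetical):

    from lark.tree import Tree
    from lark.transformers import Transformer_Children, Transformer_ChildrenInline

    class SumList(Transformer_Children):
        def add(self, children):        # callback gets the list of children
            return sum(children)

    class SumInline(Transformer_ChildrenInline):
        def add(self, left, right):     # children are unpacked as arguments
            return left + right

    tree = Tree('add', [1, 2])          # non-Tree children pass through unchanged
    assert SumList().transform(tree) == 3
    assert SumInline().transform(tree) == 3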
self.transformers = transformers + + def transform(self, tree): + for t in self.transformers: + tree = t.transform(tree) + return tree + + def __mul__(self, other): + return TransformerChain(*self.transformers + (other,)) + + + +#### XXX PSEUDOCODE TODO +# def items(obj): +# if isinstance(obj, Transformer): +# def new_get_userfunc(self, name): +# uf = self._get_userfunc(name) +# def _f(tree): +# return uf(tree.children) +# return _f +# obj._get_userfunc = new_get_userfunc +# else: +# assert callable(obj) +# # apply decorator +# def _f(tree): +# return obj(tree.children) +# return _f + + diff --git a/lark/tree.py b/lark/tree.py index d496d75..04cfc4e 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -7,11 +7,23 @@ from copy import deepcopy from .utils import inline_args +class Meta: + pass + ###{standalone class Tree(object): + __slots__ = ('data', 'children', '_meta', 'rule') + def __init__(self, data, children): self.data = data self.children = children + self._meta = None + + @property + def meta(self): + if self._meta is None: + self._meta = Meta() + return self._meta def __repr__(self): return 'Tree(%s, %s)' % (self.data, self.children) diff --git a/tests/test_parser.py b/tests/test_parser.py index d4d63ca..4aaea93 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -60,7 +60,7 @@ class TestParsers(unittest.TestCase): """, propagate_positions=True) r = g.parse('a') - self.assertEqual( r.children[0].line, 1 ) + self.assertEqual( r.children[0].meta.line, 1 ) def test_expand1(self): From 349a607ae33de8320e54f8541ccd4255b3cb14b7 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 13 Apr 2018 11:48:41 +0300 Subject: [PATCH 03/34] Some more normalizing --- lark/load_grammar.py | 25 ++++++------ lark/parsers/earley.py | 5 ++- lark/parsers/resolve_ambig.py | 2 +- lark/transformers.py | 73 +++++++++++++++++++++++++---------- lark/tree.py | 67 -------------------------------- 5 files changed, 67 insertions(+), 105 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 5813708..190bda6 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -14,8 +14,8 @@ from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef from .grammar import RuleOptions, Rule -from .tree import Tree, Visitor, SlottedTree as ST -from .transformers import Transformer_Children, Transformer_ChildrenInline +from .tree import Tree, SlottedTree as ST +from .transformers import Transformer, Transformer_Children, Transformer_ChildrenInline, Visitor __path__ = os.path.dirname(__file__) IMPORT_PATHS = [os.path.join(__path__, 'grammars')] @@ -200,17 +200,14 @@ class SimplifyRule_Visitor(Visitor): # --> # expansions( expansion(b, c, e), expansion(b, d, e) ) - while True: - self._flatten(tree) - - for i, child in enumerate(tree.children): - if isinstance(child, Tree) and child.data == 'expansions': - tree.data = 'expansions' - tree.children = [self.visit(ST('expansion', [option if i==j else other - for j, other in enumerate(tree.children)])) - for option in set(child.children)] - break - else: + self._flatten(tree) + + for i, child in enumerate(tree.children): + if isinstance(child, Tree) and child.data == 'expansions': + tree.data = 'expansions' + tree.children = [self.visit(ST('expansion', [option if i==j else other + for j, other in enumerate(tree.children)])) + for option in set(child.children)] break def alias(self, tree): @@ -234,7 +231,7 @@ class RuleTreeToText(Transformer_Children): return 
[sym.value for sym in symbols], None def alias(self, x): (expansion, _alias), alias = x - assert _alias is None, (alias, expansion, '-', _alias) + assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed return expansion, alias.value diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index d119e41..ee9f871 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,8 @@ # Email : erezshin@gmail.com from ..common import ParseError, UnexpectedToken, is_terminal -from ..tree import Tree, Transformer_NoRecurse +from ..tree import Tree +from ..transformers import InPlaceTransformer from .grammar_analysis import GrammarAnalyzer @@ -229,7 +230,7 @@ class Parser: return ApplyCallbacks(self.postprocess).transform(tree) -class ApplyCallbacks(Transformer_NoRecurse): +class ApplyCallbacks(InPlaceTransformer): def __init__(self, postprocess): self.postprocess = postprocess diff --git a/lark/parsers/resolve_ambig.py b/lark/parsers/resolve_ambig.py index 456c6a9..7c482ae 100644 --- a/lark/parsers/resolve_ambig.py +++ b/lark/parsers/resolve_ambig.py @@ -1,7 +1,7 @@ from ..utils import compare from functools import cmp_to_key -from ..tree import Tree, Visitor_NoRecurse +from ..tree import Tree # Standard ambiguity resolver (uses comparison) diff --git a/lark/transformers.py b/lark/transformers.py index b1938f0..033d2f4 100644 --- a/lark/transformers.py +++ b/lark/transformers.py @@ -6,33 +6,25 @@ class Discard(Exception): pass -class Transformer: - def _get_userfunc(self, name): - return getattr(self, name) - +class Base: def _call_userfunc(self, tree): - # Assumes tree is already transformed - try: - f = self._get_userfunc(tree.data) - except AttributeError: - return self.__default__(tree) - else: - return f(tree) + return getattr(self, tree.data, self.__default__)(tree) - def _transform(self, tree): - children = [] - for c in tree.children: + def __default__(self, tree): + return tree + +class Transformer(Base): + def _transform_children(self, children): + for c in children: try: - children.append(self._transform(c) if isinstance(c, Tree) else c) + yield self._transform(c) if isinstance(c, Tree) else c except Discard: pass - tree = Tree(tree.data, children) + def _transform(self, tree): + tree = Tree(tree.data, list(self._transform_children(tree.children))) return self._call_userfunc(tree) - def __default__(self, tree): - return tree - def transform(self, tree): return self._transform(tree) @@ -43,7 +35,7 @@ class Transformer_Children(Transformer): def _call_userfunc(self, tree): # Assumes tree is already transformed try: - f = self._get_userfunc(tree.data) + f = getattr(self, tree.data) except AttributeError: return self.__default__(tree) else: @@ -53,7 +45,7 @@ class Transformer_ChildrenInline(Transformer): def _call_userfunc(self, tree): # Assumes tree is already transformed try: - f = self._get_userfunc(tree.data) + f = getattr(self, tree.data) except AttributeError: return self.__default__(tree) else: @@ -72,6 +64,45 @@ class TransformerChain(object): def __mul__(self, other): return TransformerChain(*self.transformers + (other,)) +class Visitor(Base): + # def visit(self, tree): + # for child in tree.children: + # if isinstance(child, Tree): + # self.visit(child) + + # f = getattr(self, tree.data, self.__default__) + # f(tree) + # return tree + + def visit(self, tree): + for subtree in tree.iter_subtrees(): + self._call_userfunc(subtree) + return tree + + def __default__(self, tree): + pass + + +class InPlaceTransformer(Transformer): + # def 
_transform(self, tree): + # children = [] + # for c in tree.children: + # try: + # children.append(self._transform(c) if isinstance(c, Tree) else c) + # except Discard: + # pass + + # tree.children = children + # return self._call_userfunc(tree) + + def _transform(self, tree): + return self._call_userfunc(tree) + + def transform(self, tree): + for subtree in tree.iter_subtrees(): + subtree.children = list(self._transform_children(subtree.children)) + + return self._transform(tree) #### XXX PSEUDOCODE TODO diff --git a/lark/tree.py b/lark/tree.py index 04cfc4e..cf88293 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -12,8 +12,6 @@ class Meta: ###{standalone class Tree(object): - __slots__ = ('data', 'children', '_meta', 'rule') - def __init__(self, data, children): self.data = data self.children = children @@ -141,77 +139,12 @@ class Transformer(object): return TransformerChain(self, other) -class Discard(Exception): - pass - -class TransformerChain(object): - def __init__(self, *transformers): - self.transformers = transformers - - def transform(self, tree): - for t in self.transformers: - tree = t.transform(tree) - return tree - - def __mul__(self, other): - return TransformerChain(*self.transformers + (other,)) - - - class InlineTransformer(Transformer): def _get_func(self, name): # use super()._get_func return inline_args(getattr(self, name)).__get__(self) -class Visitor(object): - def visit(self, tree): - for child in tree.children: - if isinstance(child, Tree): - self.visit(child) - - f = getattr(self, tree.data, self.__default__) - f(tree) - return tree - - def __default__(self, tree): - pass - - -class Visitor_NoRecurse(Visitor): - def visit(self, tree): - subtrees = list(tree.iter_subtrees()) - - for subtree in (subtrees): - getattr(self, subtree.data, self.__default__)(subtree) - return tree - - -class Transformer_NoRecurse(Transformer): - def transform(self, tree): - subtrees = list(tree.iter_subtrees()) - - def _t(t): - # Assumes t is already transformed - try: - f = self._get_func(t.data) - except AttributeError: - return self.__default__(t) - else: - return f(t) - - for subtree in subtrees: - children = [] - for c in subtree.children: - try: - children.append(_t(c) if isinstance(c, Tree) else c) - except Discard: - pass - subtree.children = children - - return _t(tree) - def __default__(self, t): - return t ###} From f69bceb3354c01f427535e94af15de62247c7436 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 13 Apr 2018 12:02:01 +0300 Subject: [PATCH 04/34] Snap more things into place --- lark/__init__.py | 3 ++- lark/parse_tree_builder.py | 2 +- lark/transformers.py | 3 --- lark/tree.py | 34 ---------------------------------- tests/test_parser.py | 4 +++- 5 files changed, 6 insertions(+), 40 deletions(-) diff --git a/lark/__init__.py b/lark/__init__.py index af6f7b5..a2a67b9 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,4 +1,5 @@ -from .tree import Tree, Transformer, InlineTransformer +from .tree import Tree +from .transformers import Transformer from .common import ParseError, GrammarError, UnexpectedToken from .lexer import UnexpectedInput, LexError from .lark import Lark diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 1acfe2f..ea26347 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -145,7 +145,7 @@ class ParseTreeBuilder: user_callback_name = rule.alias or rule.origin try: - f = transformer._get_func(user_callback_name) + f = getattr(transformer, user_callback_name) except AttributeError: f = 
partial(self.tree_class, user_callback_name) diff --git a/lark/transformers.py b/lark/transformers.py index 033d2f4..245b733 100644 --- a/lark/transformers.py +++ b/lark/transformers.py @@ -79,9 +79,6 @@ class Visitor(Base): self._call_userfunc(subtree) return tree - def __default__(self, tree): - pass - class InPlaceTransformer(Transformer): # def _transform(self, tree): diff --git a/lark/tree.py b/lark/tree.py index cf88293..c478ae6 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -113,40 +113,6 @@ class SlottedTree(Tree): __slots__ = 'data', 'children', 'rule' -###{standalone -class Transformer(object): - def _get_func(self, name): - return getattr(self, name) - - def transform(self, tree): - items = [] - for c in tree.children: - try: - items.append(self.transform(c) if isinstance(c, Tree) else c) - except Discard: - pass - try: - f = self._get_func(tree.data) - except AttributeError: - return self.__default__(tree.data, items) - else: - return f(items) - - def __default__(self, data, children): - return Tree(data, children) - - def __mul__(self, other): - return TransformerChain(self, other) - - -class InlineTransformer(Transformer): - def _get_func(self, name): # use super()._get_func - return inline_args(getattr(self, name)).__get__(self) - - - -###} - def pydot__tree_to_png(tree, filename): import pydot diff --git a/tests/test_parser.py b/tests/test_parser.py index 4aaea93..6ffe1ad 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -20,7 +20,9 @@ logging.basicConfig(level=logging.INFO) from lark.lark import Lark from lark.common import GrammarError, ParseError, UnexpectedToken from lark.lexer import LexError, UnexpectedInput -from lark.tree import Tree, Transformer +from lark.tree import Tree +from lark.transformers import Transformer_Children as Transformer +# from lark.tree import Transformer __path__ = os.path.dirname(__file__) def _read(n, *args): From c3bce19dc274b2031c0ee88af6b6bbd065c707da Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 13 Apr 2018 13:41:30 +0300 Subject: [PATCH 05/34] More steps towards a good solution --- lark/load_grammar.py | 27 ++++++++------- lark/parsers/earley.py | 4 +-- lark/transformers.py | 74 ++++++++++++++++++------------------------ tests/test_parser.py | 2 +- 4 files changed, 47 insertions(+), 60 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 190bda6..e29c47e 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -15,7 +15,7 @@ from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr from .grammar import RuleOptions, Rule from .tree import Tree, SlottedTree as ST -from .transformers import Transformer, Transformer_Children, Transformer_ChildrenInline, Visitor +from .transformers import Transformer, ChildrenTransformer, inline_args, Visitor __path__ = os.path.dirname(__file__) IMPORT_PATHS = [os.path.join(__path__, 'grammars')] @@ -131,7 +131,8 @@ RULES = { } -class EBNF_to_BNF(Transformer_ChildrenInline): +@inline_args +class EBNF_to_BNF(ChildrenTransformer): def __init__(self): self.new_rules = [] self.rules_by_expr = {} @@ -224,7 +225,7 @@ class SimplifyRule_Visitor(Visitor): tree.children = list(set(tree.children)) -class RuleTreeToText(Transformer_Children): +class RuleTreeToText(ChildrenTransformer): def expansions(self, x): return x def expansion(self, symbols): @@ -235,17 +236,13 @@ class RuleTreeToText(Transformer_Children): return expansion, alias.value -class CanonizeTree(Transformer_ChildrenInline): +@inline_args +class CanonizeTree(ChildrenTransformer): 
def maybe(self, expr): return ST('expr', [expr, Token('OP', '?', -1)]) - def tokenmods(self, *args): - if len(args) == 1: - return list(args) - tokenmods, value = args - return tokenmods + [value] - -class ExtractAnonTokens(Transformer_ChildrenInline): +@inline_args +class ExtractAnonTokens(ChildrenTransformer): "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them" def __init__(self, tokens): @@ -349,7 +346,8 @@ def _literal_to_pattern(literal): 'REGEXP': PatternRE }[literal.type](s, flags) -class PrepareLiterals(Transformer_ChildrenInline): +@inline_args +class PrepareLiterals(ChildrenTransformer): def literal(self, literal): return ST('pattern', [_literal_to_pattern(literal)]) @@ -361,13 +359,14 @@ class PrepareLiterals(Transformer_ChildrenInline): regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) -class SplitLiterals(Transformer_ChildrenInline): +@inline_args +class SplitLiterals(ChildrenTransformer): def pattern(self, p): if isinstance(p, PatternStr) and len(p.value)>1: return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value]) return ST('pattern', [p]) -class TokenTreeToPattern(Transformer_Children): +class TokenTreeToPattern(ChildrenTransformer): def pattern(self, ps): p ,= ps return p diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index ee9f871..15d42a8 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -15,7 +15,7 @@ from ..common import ParseError, UnexpectedToken, is_terminal from ..tree import Tree -from ..transformers import InPlaceTransformer +from ..transformers import Transformer_InPlace from .grammar_analysis import GrammarAnalyzer @@ -230,7 +230,7 @@ class Parser: return ApplyCallbacks(self.postprocess).transform(tree) -class ApplyCallbacks(InPlaceTransformer): +class ApplyCallbacks(Transformer_InPlace): def __init__(self, postprocess): self.postprocess = postprocess diff --git a/lark/transformers.py b/lark/transformers.py index 245b733..b9f63ce 100644 --- a/lark/transformers.py +++ b/lark/transformers.py @@ -1,5 +1,7 @@ +import inspect from functools import wraps +from . 
import utils from .tree import Tree class Discard(Exception): @@ -31,7 +33,7 @@ class Transformer(Base): def __mul__(self, other): return TransformerChain(self, other) -class Transformer_Children(Transformer): +class ChildrenTransformer(Transformer): def _call_userfunc(self, tree): # Assumes tree is already transformed try: @@ -41,7 +43,7 @@ class Transformer_Children(Transformer): else: return f(tree.children) -class Transformer_ChildrenInline(Transformer): +class ChildrenInlineTransformer(Transformer): def _call_userfunc(self, tree): # Assumes tree is already transformed try: @@ -64,58 +66,44 @@ class TransformerChain(object): def __mul__(self, other): return TransformerChain(*self.transformers + (other,)) -class Visitor(Base): - # def visit(self, tree): - # for child in tree.children: - # if isinstance(child, Tree): - # self.visit(child) - # f = getattr(self, tree.data, self.__default__) - # f(tree) - # return tree +class Transformer_InPlace(Transformer): + def _transform(self, tree): + return self._call_userfunc(tree) + + def transform(self, tree): + for subtree in tree.iter_subtrees(): + subtree.children = list(self._transform_children(subtree.children)) + + return self._transform(tree) +class Visitor(Base): def visit(self, tree): for subtree in tree.iter_subtrees(): self._call_userfunc(subtree) return tree - -class InPlaceTransformer(Transformer): - # def _transform(self, tree): - # children = [] - # for c in tree.children: - # try: - # children.append(self._transform(c) if isinstance(c, Tree) else c) - # except Discard: - # pass - - # tree.children = children - # return self._call_userfunc(tree) - +class Transformer_InPlaceRecursive(Transformer): def _transform(self, tree): + tree.children = list(self._transform_children(tree.children)) return self._call_userfunc(tree) - def transform(self, tree): - for subtree in tree.iter_subtrees(): - subtree.children = list(self._transform_children(subtree.children)) - - return self._transform(tree) +class Visitor_Recursive(Base): + def visit(self, tree): + for child in tree.children: + if isinstance(child, Tree): + self.visit(child) + f = getattr(self, tree.data, self.__default__) + f(tree) + return tree -#### XXX PSEUDOCODE TODO -# def items(obj): -# if isinstance(obj, Transformer): -# def new_get_userfunc(self, name): -# uf = self._get_userfunc(name) -# def _f(tree): -# return uf(tree.children) -# return _f -# obj._get_userfunc = new_get_userfunc -# else: -# assert callable(obj) -# # apply decorator -# def _f(tree): -# return obj(tree.children) -# return _f +def inline_args(obj): + if inspect.isclass(obj) and issubclass(obj, ChildrenTransformer): + class _NewTransformer(ChildrenInlineTransformer, obj): + pass + return _NewTransformer + else: + return utils.inline_args(obj) diff --git a/tests/test_parser.py b/tests/test_parser.py index 6ffe1ad..7620241 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -21,7 +21,7 @@ from lark.lark import Lark from lark.common import GrammarError, ParseError, UnexpectedToken from lark.lexer import LexError, UnexpectedInput from lark.tree import Tree -from lark.transformers import Transformer_Children as Transformer +from lark.transformers import ChildrenTransformer as Transformer # from lark.tree import Transformer __path__ = os.path.dirname(__file__) From ca3d4ca6f4d3da667df1f7f216414469cac66af1 Mon Sep 17 00:00:00 2001 From: Rob Rose Date: Sat, 14 Apr 2018 20:27:39 -0400 Subject: [PATCH 06/34] Changing changes from .lrk to .lark --- MANIFEST.in | 2 +- examples/{python2.lrk => python2.lark} | 0 
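The `inline_args` helper defined in this patch turns a `ChildrenTransformer` subclass into one whose callbacks receive the children as positional arguments, by mixing `ChildrenInlineTransformer` in ahead of it in the MRO. A hedged sketch of the intended use, assuming this patch is applied (the `Calc` class and its `add` rule are made up):

    from lark.tree import Tree
    from lark.transformers import ChildrenTransformer, inline_args

    @inline_args
    class Calc(ChildrenTransformer):
        def add(self, left, right):     # inlined by the mixed-in class
            return left + right

    assert Calc().transform(Tree('add', [1, 2])) == 3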
examples/{python3.lrk => python3.lark} | 0 examples/python_parser.py | 4 ++-- examples/standalone/create_standalone.sh | 2 +- examples/standalone/{json.lrk => json.lark} | 0 lark/grammars/{common.lrk => common.lark} | 0 lark/load_grammar.py | 2 +- setup.py | 2 +- 9 files changed, 6 insertions(+), 6 deletions(-) rename examples/{python2.lrk => python2.lark} (100%) rename examples/{python3.lrk => python3.lark} (100%) rename examples/standalone/{json.lrk => json.lark} (100%) rename lark/grammars/{common.lrk => common.lark} (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 8288fd6..019e37a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include README.md LICENSE docs/* examples/*.py examples/*.png examples/*.lrk tests/*.py tests/test_nearley/*.py tests/test_nearley/grammars/* +include README.md LICENSE docs/* examples/*.py examples/*.png examples/*.lark tests/*.py tests/test_nearley/*.py tests/test_nearley/grammars/* diff --git a/examples/python2.lrk b/examples/python2.lark similarity index 100% rename from examples/python2.lrk rename to examples/python2.lark diff --git a/examples/python3.lrk b/examples/python3.lark similarity index 100% rename from examples/python3.lrk rename to examples/python3.lark diff --git a/examples/python_parser.py b/examples/python_parser.py index d14dacc..9b3a978 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -21,8 +21,8 @@ class PythonIndenter(Indenter): tab_len = 8 -grammar2_filename = os.path.join(__path__, 'python2.lrk') -grammar3_filename = os.path.join(__path__, 'python3.lrk') +grammar2_filename = os.path.join(__path__, 'python2.lark') +grammar3_filename = os.path.join(__path__, 'python3.lark') with open(grammar2_filename) as f: python_parser2 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input') with open(grammar3_filename) as f: diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh index f5001fe..a4fa879 100755 --- a/examples/standalone/create_standalone.sh +++ b/examples/standalone/create_standalone.sh @@ -1 +1 @@ -python -m lark.tools.standalone json.lrk > json_parser.py +python -m lark.tools.standalone json.lark > json_parser.py diff --git a/examples/standalone/json.lrk b/examples/standalone/json.lark similarity index 100% rename from examples/standalone/json.lrk rename to examples/standalone/json.lark diff --git a/lark/grammars/common.lrk b/lark/grammars/common.lark similarity index 100% rename from lark/grammars/common.lrk rename to lark/grammars/common.lark diff --git a/lark/load_grammar.py b/lark/load_grammar.py index cf74199..ebecc46 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -625,7 +625,7 @@ class GrammarLoader: elif stmt.data == 'import': dotted_path = stmt.children[0].children name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1] - grammar_path = os.path.join(*dotted_path[:-1]) + '.lrk' + grammar_path = os.path.join(*dotted_path[:-1]) + '.lark' g = import_grammar(grammar_path) token_options = dict(g.token_defs)[dotted_path[-1]] assert isinstance(token_options, tuple) and len(token_options)==2 diff --git a/setup.py b/setup.py index 978b370..8543fd4 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( requires = [], install_requires = [], - package_data = { '': ['*.md', '*.lrk'] }, + package_data = { '': ['*.md', '*.lark'] }, test_suite = 'tests.__main__', From 209a3fe8fd1b1f6cd9267c84178cfbfb496065a3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 25 Apr 2018 01:54:16 +0300 Subject: [PATCH 07/34] 
Interface improvements for the Lark instance --- examples/python_parser.py | 18 +++++++----------- lark/lark.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/examples/python_parser.py b/examples/python_parser.py index d953a79..ddbd5c4 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -10,7 +10,7 @@ import glob, time from lark import Lark from lark.indenter import Indenter -__path__ = os.path.dirname(__file__) +# __path__ = os.path.dirname(__file__) class PythonIndenter(Indenter): NL_type = '_NEWLINE' @@ -20,18 +20,14 @@ class PythonIndenter(Indenter): DEDENT_type = '_DEDENT' tab_len = 8 +kwargs = dict(rel_to=__file__, postlex=PythonIndenter(), start='file_input') -grammar2_filename = os.path.join(__path__, 'python2.g') -grammar3_filename = os.path.join(__path__, 'python3.g') -with open(grammar2_filename) as f: - python_parser2 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input') -with open(grammar3_filename) as f: - python_parser3 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input') +python_parser2 = Lark.open('python2.g', parser='lalr', **kwargs) +python_parser3 = Lark.open('python3.g',parser='lalr', **kwargs) +python_parser2_earley = Lark.open('python2.g', parser='earley', lexer='standard', **kwargs) +print(python_parser3) -with open(grammar2_filename) as f: - python_parser2_earley = Lark(f, parser='lalr', lexer='standard', postlex=PythonIndenter(), start='file_input') - def _read(fn, *args): kwargs = {'encoding': 'iso-8859-1'} with open(fn, *args, **kwargs) as f: @@ -82,6 +78,6 @@ def test_earley_equals_lalr(): if __name__ == '__main__': test_python_lib() - # test_earley_equals_lalr() + test_earley_equals_lalr() # python_parser3.parse(_read(sys.argv[1]) + '\n') diff --git a/lark/lark.py b/lark/lark.py index 2660bd7..3641a40 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -3,6 +3,7 @@ from __future__ import absolute_import import os import time from collections import defaultdict +from io import open from .utils import STRING_TYPE from .load_grammar import load_grammar @@ -105,12 +106,12 @@ class Lark: # Some, but not all file-like objects have a 'name' attribute try: - source = grammar.name + self.source = grammar.name except AttributeError: - source = '' + self.source = '' cache_file = "larkcache_%s" % str(hash(grammar)%(2**32)) else: - cache_file = "larkcache_%s" % os.path.basename(source) + cache_file = "larkcache_%s" % os.path.basename(self.source) # Drain file-like objects to get their contents try: @@ -150,7 +151,7 @@ class Lark: assert self.options.ambiguity in ('resolve', 'explicit', 'auto', 'resolve__antiscore_sum') # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, source) + self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start) @@ -183,6 +184,27 @@ class Lark: return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + @classmethod + def open(cls, grammar_filename, rel_to=None, **options): + """Create an instance of Lark with the grammar given by its filename + + If rel_to is provided, the function will find the grammar filename in relation to it. + + Example: + + >>> Lark.open("grammar_file.g", rel_to=__file__, parser="lalr") + Lark(...) 
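Spelling the docstring example out as a hedged, self-contained sketch (it assumes a grammar file named `grammar_file.g` sits next to the calling script):

    from lark import Lark

    parser = Lark.open('grammar_file.g', rel_to=__file__, parser='lalr')
    print(parser)   # the new __repr__ reports the source path and parser options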
+ + """ + if rel_to: + basepath = os.path.dirname(rel_to) + grammar_filename = os.path.join(basepath, grammar_filename) + with open(grammar_filename) as f: + return cls(f, **options) + + def __repr__(self): + return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer) + def lex(self, text): if not hasattr(self, 'lexer'): From 4a7a66d77359954e86753e9467544ccb22af951e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 25 Apr 2018 01:55:10 +0300 Subject: [PATCH 08/34] .lark (preparing) --- lark/lark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index 3641a40..8ab2227 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -192,7 +192,7 @@ class Lark: Example: - >>> Lark.open("grammar_file.g", rel_to=__file__, parser="lalr") + >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") Lark(...) From 836735211fffa7e45015781aad2349f81dfe622c Mon Sep 17 00:00:00 2001 From: Rob Rose Date: Wed, 25 Apr 2018 14:28:12 -0400 Subject: [PATCH 09/34] Resolved upstream changes to use the new file extension - @erezsh added the new Lark reference and some other things using the current .g extension, so I wanted to resolve them for PR #117 - Renamed lark.g to lark.lark. - Changed lark_grammar.py to use the .lark file extension. - Changed URLs that used .g to use .lark. --- docs/json_tutorial.md | 2 +- examples/README.md | 2 +- examples/{lark.g => lark.lark} | 0 examples/lark_grammar.py | 10 +++++----- 4 files changed, 7 insertions(+), 7 deletions(-) rename examples/{lark.g => lark.lark} (100%) diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index 9f3fbf1..96e76fb 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -79,7 +79,7 @@ By the way, if you're curious what these terminals signify, they are roughly equ Lark will accept this, if you really want to complicate your life :) -(You can find the original definitions in [common.g](/lark/grammars/common.g).) +(You can find the original definitions in [common.lark](/lark/grammars/common.lark).) Notice that terminals are written in UPPER-CASE, while rules are written in lower-case. I'll touch more on the differences between rules and terminals later. diff --git a/examples/README.md b/examples/README.md index 0951c86..ef150ad 100644 --- a/examples/README.md +++ b/examples/README.md @@ -7,7 +7,7 @@ - [indented\_tree.py](indented\_tree.py) - A demonstration of parsing indentation ("whitespace significant" language) - [fruitflies.py](fruitflies.py) - A demonstration of ambiguity - [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
-- [lark\_grammar.py](lark_grammar.py) + [lark.g](lark.g) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer) +- [lark\_grammar.py](lark_grammar.py) + [lark.lark](lark.lark) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer) ### Advanced diff --git a/examples/lark.g b/examples/lark.lark similarity index 100% rename from examples/lark.g rename to examples/lark.lark diff --git a/examples/lark_grammar.py b/examples/lark_grammar.py index 88fc4cf..30ccc8b 100644 --- a/examples/lark_grammar.py +++ b/examples/lark_grammar.py @@ -1,12 +1,12 @@ from lark import Lark -parser = Lark(open('examples/lark.g'), parser="lalr") +parser = Lark(open('examples/lark.lark'), parser="lalr") grammar_files = [ - 'examples/python2.g', - 'examples/python3.g', - 'examples/lark.g', - 'lark/grammars/common.g', + 'examples/python2.lark', + 'examples/python3.lark', + 'examples/lark.lark', + 'lark/grammars/common.lark', ] def test(): From 67f372c994599c19978d0e0ad36b80dc17983b27 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 13:16:15 +0300 Subject: [PATCH 10/34] Symbols instead of strings - initial --- examples/python_parser.py | 2 +- lark/grammar.py | 22 ++++++++++++++++++++++ lark/lexer.py | 4 ++-- lark/load_grammar.py | 14 ++++++++++---- lark/parse_tree_builder.py | 10 +++++----- lark/parsers/grammar_analysis.py | 20 ++++++++++---------- lark/parsers/lalr_analysis.py | 10 +++++----- lark/parsers/lalr_parser.py | 2 +- 8 files changed, 56 insertions(+), 28 deletions(-) diff --git a/examples/python_parser.py b/examples/python_parser.py index ddbd5c4..f738a35 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -78,6 +78,6 @@ def test_earley_equals_lalr(): if __name__ == '__main__': test_python_lib() - test_earley_equals_lalr() + # test_earley_equals_lalr() # python_parser3.parse(_read(sys.argv[1]) + '\n') diff --git a/lark/grammar.py b/lark/grammar.py index d257bc4..2689389 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,3 +1,25 @@ +class Symbol(object): + is_term = NotImplemented + + def __init__(self, name): + self.name = name + + def __eq__(self, other): + assert isinstance(other, Symbol), other + return self.is_term == other.is_term and self.name == other.name + + def __hash__(self): + return hash(self.name) + +class Terminal(Symbol): + is_term = True + + @property + def filter_out(self): + return self.name.startswith('_') + +class NonTerminal(Symbol): + is_term = False class Rule(object): """ diff --git a/lark/lexer.py b/lark/lexer.py index 19e1be4..e7af2a2 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,7 +3,7 @@ import re from .utils import Str, classify -from .common import is_terminal, PatternStr, PatternRE, TokenDef +from .common import PatternStr, PatternRE, TokenDef ###{standalone class LexError(Exception): @@ -234,7 +234,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] + state_tokens = [tokens_by_name[n] for n in accepts if n.is_term and n.name!='$END'] lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) lexer_by_tokens[key] = lexer diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 43d1bf5..6800801 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import 
UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef -from .grammar import RuleOptions, Rule +from .grammar import RuleOptions, Rule, Terminal, NonTerminal from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -523,7 +523,9 @@ class Grammar: if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - rule = Rule(name, expansion, alias, options) + expansion = [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + + rule = Rule(NonTerminal(name), expansion, alias, options) compiled_rules.append(rule) return tokens, compiled_rules, self.ignore @@ -578,12 +580,16 @@ def options_from_rule(name, *x): return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) + +def symbols_from_strcase(expansion): + return [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] - rules = [options_from_rule(name, x) for name, x in RULES.items()] - rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] + rules = [options_from_rule(name, x) for name, x in RULES.items()] + rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs] callback = ParseTreeBuilder(rules, ST).create_callback() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7c74178..54a1bac 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -84,7 +84,7 @@ class ChildFilterLALR(ChildFilter): return self.node_builder(filtered) def _should_expand(sym): - return not is_terminal(sym) and sym.startswith('_') + return not sym.is_term and sym.name.startswith('_') def maybe_create_child_filter(expansion, filter_out, ambiguous): to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] @@ -109,8 +109,8 @@ class ParseTreeBuilder: def _init_builders(self, rules): filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} - filter_out |= {sym for rule in rules for sym in rule.expansion if is_terminal(sym) and sym.startswith('_')} - assert all(x.startswith('_') for x in filter_out) + filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out} + assert all(t.filter_out for t in filter_out) for rule in rules: options = rule.options @@ -132,9 +132,9 @@ class ParseTreeBuilder: callback = Callback() for rule, wrapper_chain in self.rule_builders: - internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(x.name for x in rule.expansion)) - user_callback_name = rule.alias or rule.origin + user_callback_name = rule.alias or rule.origin.name try: f = transformer._get_func(user_callback_name) except AttributeError: diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index f34d5c1..f49e4bc 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -1,7 +1,7 @@ from ..utils import bfs, fzset, classify -from ..common import GrammarError, is_terminal -from ..grammar import Rule +from ..common import GrammarError +from ..grammar import Rule, Terminal, NonTerminal class 
RulePtr(object): @@ -67,7 +67,7 @@ def calculate_sets(rules): FIRST = {} FOLLOW = {} for sym in symbols: - FIRST[sym]={sym} if is_terminal(sym) else set() + FIRST[sym]={sym} if sym.is_term else set() FOLLOW[sym]=set() # Calculate NULLABLE and FIRST @@ -108,16 +108,16 @@ class GrammarAnalyzer(object): def __init__(self, parser_conf, debug=False): self.debug = debug - rules = parser_conf.rules + [Rule('$root', [parser_conf.start, '$END'])] + rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])] self.rules_by_origin = classify(rules, lambda r: r.origin) assert len(rules) == len(set(rules)) for r in rules: for sym in r.expansion: - if not (is_terminal(sym) or sym in self.rules_by_origin): + if not (sym.is_term or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation - self.start_state = self.expand_rule('$root') + self.start_state = self.expand_rule(NonTerminal('$root')) self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) @@ -125,7 +125,7 @@ class GrammarAnalyzer(object): "Returns all init_ptrs accessible by rule (recursive)" init_ptrs = set() def _expand_rule(rule): - assert not is_terminal(rule), rule + assert not rule.is_term, rule for r in self.rules_by_origin[rule]: init_ptr = RulePtr(r, 0) @@ -133,7 +133,7 @@ class GrammarAnalyzer(object): if r.expansion: # if not empty rule new_r = init_ptr.next - if not is_terminal(new_r): + if not new_r.is_term: yield new_r for _ in bfs([rule], _expand_rule): @@ -142,8 +142,8 @@ class GrammarAnalyzer(object): return fzset(init_ptrs) def _first(self, r): - if is_terminal(r): + if r.is_term: return {r} else: - return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)} + return {rp.next for rp in self.expand_rule(r) if rp.next.is_term} diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 4af28f9..6903be9 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -10,9 +10,9 @@ import logging from collections import defaultdict from ..utils import classify, classify_bool, bfs, fzset -from ..common import GrammarError, is_terminal +from ..common import GrammarError -from .grammar_analysis import GrammarAnalyzer +from .grammar_analysis import GrammarAnalyzer, Terminal class Action: def __init__(self, name): @@ -70,12 +70,12 @@ class LALR_Analyzer(GrammarAnalyzer): rps = {rp.advance(sym) for rp in rps} for rp in set(rps): - if not rp.is_satisfied and not is_terminal(rp.next): + if not rp.is_satisfied and not rp.next.is_term: rps |= self.expand_rule(rp.next) new_state = fzset(rps) lookahead[sym].append((Shift, new_state)) - if sym == '$END': + if sym == Terminal('$END'): self.end_states.append( new_state ) yield new_state @@ -93,7 +93,7 @@ class LALR_Analyzer(GrammarAnalyzer): if not len(v) == 1: raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v]))) - self.states[state] = {k:v[0] for k, v in lookahead.items()} + self.states[state] = {k.name:v[0] for k, v in lookahead.items()} for _ in bfs([self.start_state], step): pass diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index baea614..164a227 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -59,7 +59,7 @@ class _Parser: value = self.callbacks[rule](s) - _action, new_state = get_action(rule.origin) + _action, new_state = get_action(rule.origin.name) assert _action is Shift state_stack.append(new_state) value_stack.append(value) From 
cf7ddeee8863096309b69fda63e5ba04610e7286 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 13:26:28 +0300 Subject: [PATCH 11/34] Earley working too --- examples/python_parser.py | 2 +- lark/parse_tree_builder.py | 2 +- lark/parser_frontends.py | 7 ++++--- lark/parsers/earley.py | 9 +++++---- lark/parsers/xearley.py | 9 +++++---- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/examples/python_parser.py b/examples/python_parser.py index f738a35..ddbd5c4 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -78,6 +78,6 @@ def test_earley_equals_lalr(): if __name__ == '__main__': test_python_lib() - # test_earley_equals_lalr() + test_earley_equals_lalr() # python_parser3.parse(_read(sys.argv[1]) + '\n') diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 54a1bac..e81569f 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,4 +1,4 @@ -from .common import is_terminal, GrammarError +from .common import GrammarError from .utils import suppress from .lexer import Token from .grammar import Rule diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index a36252c..24c3622 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -4,9 +4,10 @@ from .utils import get_regexp_width from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import Lexer, ContextualLexer, Token -from .common import is_terminal, GrammarError +from .common import GrammarError from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree +from .grammar import Terminal class WithLexer: def init_traditional_lexer(self, lexer_conf): @@ -96,7 +97,7 @@ class Earley(WithLexer): resolve_ambiguity=get_ambiguity_resolver(options)) def match(self, term, token): - return term == token.type + return term.name == token.type def parse(self, text): tokens = self.lex(text) @@ -117,7 +118,7 @@ class XEarley: ) def match(self, term, text, index=0): - return self.regexps[term].match(text, index) + return self.regexps[term.name].match(text, index) def _prepare_match(self, lexer_conf): self.regexps = {} diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index d119e41..f6397dd 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -13,9 +13,10 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken, is_terminal +from ..common import ParseError, UnexpectedToken from ..tree import Tree, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer +from ..grammar import NonTerminal class Derivation(Tree): @@ -127,7 +128,7 @@ class Column: self.completed[item_key] = item self.to_reduce.append(item) else: - if is_terminal(item.expect): + if item.expect.is_term: self.to_scan.append(item) else: k = item_key if self.predict_all else item @@ -161,13 +162,13 @@ class Parser: def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.parser_conf.start + start_symbol = NonTerminal(start_symbol or self.parser_conf.start) _Item = Item match = self.term_matcher def predict(nonterm, column): - assert not is_terminal(nonterm), nonterm + assert not nonterm.is_term, nonterm return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index d710f34..321b829 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -20,10 +20,11 @@ from collections import defaultdict 
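With grammar symbols now objects rather than plain strings, each term matcher compares a terminal's `.name` against a token's `.type`. A stand-alone sketch of that contract; the `Terminal` and `Token` stubs below only mirror the real classes in lark/grammar.py and lark/lexer.py:

    class Terminal(object):               # stub mirroring lark/grammar.py
        def __init__(self, name):
            self.name = name

    class Token(str):                     # stub mirroring lark/lexer.py
        def __new__(cls, type_, value):
            inst = super(Token, cls).__new__(cls, value)
            inst.type = type_
            return inst

    def match(term, token):
        return term.name == token.type    # match by name, not by string identity

    assert match(Terminal('NUMBER'), Token('NUMBER', '42'))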
-from ..common import ParseError, is_terminal +from ..common import ParseError from ..lexer import Token, UnexpectedInput from ..tree import Tree from .grammar_analysis import GrammarAnalyzer +from ..grammar import NonTerminal, Terminal from .earley import ApplyCallbacks, Item, Column @@ -32,7 +33,7 @@ class Parser: self.analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity - self.ignore = list(ignore) + self.ignore = [Terminal(t) for t in ignore] self.predict_all = predict_all self.FIRST = self.analysis.FIRST @@ -47,7 +48,7 @@ class Parser: def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.parser_conf.start + start_symbol = NonTerminal(start_symbol or self.parser_conf.start) delayed_matches = defaultdict(list) match = self.term_matcher @@ -55,7 +56,7 @@ class Parser: text_column = 0 def predict(nonterm, column): - assert not is_terminal(nonterm), nonterm + assert not nonterm.is_term, nonterm return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): From 4a5aa745ea99af62db563cd9170d2a432eb061f2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 14:53:05 +0300 Subject: [PATCH 12/34] All tests passing --- lark/grammar.py | 6 ++++ lark/lexer.py | 4 +-- lark/parse_tree_builder.py | 2 +- lark/parser_frontends.py | 13 +++++++-- lark/parsers/cyk.py | 58 +++++++++++--------------------------- lark/parsers/xearley.py | 4 +-- 6 files changed, 37 insertions(+), 50 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 2689389..b555c34 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -8,9 +8,15 @@ class Symbol(object): assert isinstance(other, Symbol), other return self.is_term == other.is_term and self.name == other.name + def __ne__(self, other): + return not (self == other) + def __hash__(self): return hash(self.name) + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.name) + class Terminal(Symbol): is_term = True diff --git a/lark/lexer.py b/lark/lexer.py index e7af2a2..19e1be4 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,7 +3,7 @@ import re from .utils import Str, classify -from .common import PatternStr, PatternRE, TokenDef +from .common import is_terminal, PatternStr, PatternRE, TokenDef ###{standalone class LexError(Exception): @@ -234,7 +234,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if n.is_term and n.name!='$END'] + state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) lexer_by_tokens[key] = lexer diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index e81569f..1d4e2b8 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -110,7 +110,7 @@ class ParseTreeBuilder: def _init_builders(self, rules): filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out} - assert all(t.filter_out for t in filter_out) + assert all(t.name.startswith('_') for t in filter_out) for rule in rules: options = rule.options diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 24c3622..e4401c1 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,7 +7,11 @@ from 
.lexer import Lexer, ContextualLexer, Token from .common import GrammarError from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree -from .grammar import Terminal +from .grammar import Terminal, NonTerminal + +def terminals(seq): + # return [Terminal(t) for t in seq] + return seq class WithLexer: def init_traditional_lexer(self, lexer_conf): @@ -18,7 +22,10 @@ class WithLexer: self.lexer_conf = lexer_conf states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () - self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks) + self.lexer = ContextualLexer(lexer_conf.tokens, states, + ignore=terminals(lexer_conf.ignore), + always_accept=terminals(always_accept), + user_callbacks=lexer_conf.callbacks) def lex(self, text): stream = self.lexer.lex(text) @@ -74,7 +81,7 @@ class Earley_NoLex: def match(self, term, text, index=0): - return self.regexps[term].match(text, index) + return self.regexps[term.name].match(text, index) def _prepare_match(self, lexer_conf): self.regexps = {} diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py index 9d643aa..e2bcd83 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -8,47 +8,19 @@ from collections import defaultdict import itertools -from ..common import ParseError, is_terminal +from ..common import ParseError from ..lexer import Token from ..tree import Tree +from ..grammar import Terminal as T, NonTerminal as NT, Symbol try: xrange except NameError: xrange = range -class Symbol(object): - """Any grammar symbol.""" - - def __init__(self, s): - self.s = s - - def __repr__(self): - return '%s(%s)' % (type(self).__name__, str(self)) - - def __str__(self): - return str(self.s) - - def __eq__(self, other): - return self.s == str(other) - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - return hash((type(self), str(self.s))) - - -class T(Symbol): - """Terminal.""" - - def match(self, s): - return self.s == s.type - - -class NT(Symbol): - """Non-terminal.""" - pass +def match(t, s): + assert isinstance(t, T) + return t.name == s.type class Rule(object): @@ -121,10 +93,12 @@ class Parser(object): def _to_rule(self, lark_rule): """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" + assert isinstance(lark_rule.origin, NT) + assert all(isinstance(x, Symbol) for x in lark_rule.expansion) return Rule( - NT(lark_rule.origin), [ - T(x) if is_terminal(x) else NT(x) for x in lark_rule.expansion - ], weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule.alias) + lark_rule.origin, lark_rule.expansion, + weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, + alias=lark_rule.alias) def parse(self, tokenized): # pylint: disable=invalid-name """Parses input, which is a list of tokens.""" @@ -132,7 +106,7 @@ class Parser(object): # Check if the parse succeeded. 
if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]): raise ParseError('Parsing failed.') - parse = trees[(0, len(tokenized) - 1)][NT(self.start)] + parse = trees[(0, len(tokenized) - 1)][self.start] return self._to_tree(revert_cnf(parse)) def _to_tree(self, rule_node): @@ -143,8 +117,8 @@ class Parser(object): if isinstance(child, RuleNode): children.append(self._to_tree(child)) else: - assert isinstance(child.s, Token) - children.append(child.s) + assert isinstance(child.name, Token) + children.append(child.name) t = Tree(orig_rule.origin, children) t.rule=orig_rule return t @@ -169,7 +143,7 @@ def _parse(s, g): # Populate base case with existing terminal production rules for i, w in enumerate(s): for terminal, rules in g.terminal_rules.items(): - if terminal.match(w): + if match(terminal, w): for rule in rules: table[(i, i)].add(rule) if (rule.lhs not in trees[(i, i)] or @@ -349,13 +323,13 @@ def revert_cnf(node): if isinstance(node, T): return node # Reverts TERM rule. - if node.rule.lhs.s.startswith('__T_'): + if node.rule.lhs.name.startswith('__T_'): return node.children[0] else: children = [] for child in map(revert_cnf, node.children): # Reverts BIN rule. - if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'): + if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'): children += child.children else: children.append(child) diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 321b829..c64bfee 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -98,14 +98,14 @@ class Parser: for item in to_scan: m = match(item.expect, stream, i) if m: - t = Token(item.expect, m.group(0), i, text_line, text_column) + t = Token(item.expect.name, m.group(0), i, text_line, text_column) delayed_matches[m.end()].append(item.advance(t)) s = m.group(0) for j in range(1, len(s)): m = match(item.expect, s[:-j]) if m: - t = Token(item.expect, m.group(0), i, text_line, text_column) + t = Token(item.expect.name, m.group(0), i, text_line, text_column) delayed_matches[i+m.end()].append(item.advance(t)) next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) From 33caa391d544bb079902b6bf735d295e4ac13a4a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 16:52:39 +0300 Subject: [PATCH 13/34] Breaking backwards compatibility: * Removed the scanless parsing feature (dynamic lexing is king) * Default LALR lexer is now contextual --- docs/reference.md | 176 --------------------- examples/README.md | 3 +- examples/{conf_nolex.py => conf_earley.py} | 13 +- examples/{conf.py => conf_lalr.py} | 9 +- lark/grammar.py | 7 +- lark/lark.py | 8 +- lark/load_grammar.py | 76 +-------- lark/parse_tree_builder.py | 13 -- lark/parser_frontends.py | 28 +--- lark/tools/standalone.py | 2 +- tests/__main__.py | 4 +- tests/test_parser.py | 53 ++----- 12 files changed, 43 insertions(+), 349 deletions(-) delete mode 100644 docs/reference.md rename examples/{conf_nolex.py => conf_earley.py} (73%) rename examples/{conf.py => conf_lalr.py} (77%) diff --git a/docs/reference.md b/docs/reference.md deleted file mode 100644 index 90553f5..0000000 --- a/docs/reference.md +++ /dev/null @@ -1,176 +0,0 @@ -# Lark Reference - -## What is Lark? - -Lark is a general-purpose parsing library. It's written in Python, and supports two parsing algorithms: Earley (default) and LALR(1). - -Lark also supports scanless parsing (with Earley), contextual lexing (with LALR), and regular lexing for both parsers. 
- -Lark is a re-write of my previous parsing library, [PlyPlus](https://github.com/erezsh/plyplus). - -## Grammar - -Lark accepts its grammars in [EBNF](https://www.wikiwand.com/en/Extended_Backus%E2%80%93Naur_form) form. - -The grammar is a list of rules and terminals, each in their own line. - -Rules and terminals can be defined on multiple lines when using the *OR* operator ( | ). - -Comments start with // and last to the end of the line (C++ style) - -Lark begins the parse with the rule 'start', unless specified otherwise in the options. - -It might help to think of Rules and Terminals as existing in two separate layers, so that all the terminals are recognized first, and all the rules are recognized afterwards. This is not always how things happen (depending on your choice of parser & lexer), but the concept is relevant in all cases. - -### Rules - -Each rule is defined in terms of: - - name : list of items to match - | another list of items -> optional_alias - | etc. - -An alias is a name for the specific rule alternative. It affects tree construction. - -An item is a: - - - rule - - terminal - - (item item ..) - Group items - - [item item ..] - Maybe. Same as: "(item item ..)?" - - item? - Zero or one instances of item ("maybe") - - item\* - Zero or more instances of item - - item+ - One or more instances of item - - -Example: - - float: "-"? DIGIT* "." DIGIT+ exp - | "-"? DIGIT+ exp - - exp: "-"? ("e" | "E") DIGIT+ - - DIGIT: /[0-9]/ - -### Terminals - -Terminals are defined just like rules, but cannot contain rules: - - NAME : list of items to match - -Example: - - IF: "if" - INTEGER : /[0-9]+/ - DECIMAL: INTEGER "." INTEGER - WHITESPACE: (" " | /\t/ )+ - -## Tree Construction - -Lark builds a tree automatically based on the structure of the grammar. Is also accepts some hints. - -In general, Lark will place each rule as a branch, and its matches as the children of the branch. - -Terminals are always values in the tree, never branches. - -In grammar rules, using item+ or item\* will result in a list of items. - -Example: - - expr: "(" expr ")" - | NAME+ - - NAME: /\w+/ - - %ignore " " - -Lark will parse "(((hello world)))" as: - - expr - expr - expr - "hello" - "world" - -The brackets do not appear in the tree by design. - -Terminals that won't appear in the tree are: - - - Unnamed literals (like "keyword" or "+") - - Terminals whose name starts with an underscore (like \_DIGIT) - -Terminals that *will* appear in the tree are: - - - Unnamed regular expressions (like /[0-9]/) - - Named terminals whose name starts with a letter (like DIGIT) - -## Shaping the tree - -a. Rules whose name begins with an underscore will be inlined into their containing rule. - -Example: - - start: "(" _greet ")" - _greet: /\w+/ /\w+/ - -Lark will parse "(hello world)" as: - - start - "hello" - "world" - - -b. Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child. - -Example: - - start: greet greet - ?greet: "(" /\w+/ ")" - | /\w+ /\w+/ - -Lark will parse "hello world (planet)" as: - - start - greet - "hello" - "world" - "planet" - -c. Rules that begin with an exclamation mark will keep all their terminals (they won't get filtered). - -d. Aliases - options in a rule can receive an alias. It will be then used as the branch name for the option. 
- -Example: - - start: greet greet - greet: "hello" -> hello - | "world" - -Lark will parse "hello world" as: - - start - hello - greet - -## Lark Options - -When initializing the Lark object, you can provide it with keyword options: - -- start - The start symbol (Default: "start") -- parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") - Note: "lalr" requires a lexer -- lexer - Decides whether or not to use a lexer stage - - None: Don't use a lexer - - "standard": Use a standard lexer - - "contextual": Stronger lexer (only works with parser="lalr") - - "auto" (default): Choose for me based on grammar and parser - -- transformer - Applies the transformer to every parse tree (only allowed with parser="lalr") -- postlex - Lexer post-processing (Default: None) - -To be supported: - -- debug -- cache\_grammar -- keep\_all\_tokens -- profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False) diff --git a/examples/README.md b/examples/README.md index 3fbe3ea..37076d5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,5 +12,6 @@ - [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) -- [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language +- [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language +- [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language - [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature diff --git a/examples/conf_nolex.py b/examples/conf_earley.py similarity index 73% rename from examples/conf_nolex.py rename to examples/conf_earley.py index 8634a46..71517f0 100644 --- a/examples/conf_nolex.py +++ b/examples/conf_earley.py @@ -1,16 +1,14 @@ # -# This example demonstrates scanless parsing using the dynamic-lexer earley frontend +# This example demonstrates parsing using the dynamic-lexer earley frontend # # Using a lexer for configuration files is tricky, because values don't # have to be surrounded by delimiters. Using a standard lexer for this just won't work. # # In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity. # -# Future versions of lark will make it easier to write these kinds of grammars. -# # Another approach is to use the contextual lexer with LALR. It is less powerful than Earley, # but it can handle some ambiguity when lexing and it's much faster. -# See examples/conf.py for an example of that approach. +# See examples/conf_lalr.py for an example of that approach. # @@ -19,14 +17,14 @@ from lark import Lark parser = Lark(r""" start: _NL? section+ section: "[" NAME "]" _NL item+ - item: NAME "=" VALUE _NL - VALUE: /./* + item: NAME "=" VALUE? 
_NL + VALUE: /./+ %import common.CNAME -> NAME %import common.NEWLINE -> _NL %import common.WS_INLINE %ignore WS_INLINE - """, lexer='dynamic') + """, parser="earley") def test(): sample_conf = """ @@ -34,6 +32,7 @@ def test(): a=Hello this="that",4 +empty= """ r = parser.parse(sample_conf) diff --git a/examples/conf.py b/examples/conf_lalr.py similarity index 77% rename from examples/conf.py rename to examples/conf_lalr.py index ac5a4a2..417d2af 100644 --- a/examples/conf.py +++ b/examples/conf_lalr.py @@ -1,16 +1,16 @@ # # This example demonstrates the power of the contextual lexer, by parsing a config file. # -# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily +# The tokens NAME and VALUE match the same input. A standard lexer would arbitrarily # choose one over the other, which would lead to a (confusing) parse error. -# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows +# However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows # which one of them to expect at each point during the parse. # The lexer then only matches the tokens that the parser expects. # The result is a correct parse, something that is impossible with a regular lexer. # # Another approach is to discard a lexer altogether and use the Earley algorithm. # It will handle more cases than the contextual lexer, but at the cost of performance. -# See examples/conf_nolex.py for an example of that approach. +# See examples/conf_earley.py for an example of that approach. # from lark import Lark @@ -25,13 +25,14 @@ parser = Lark(r""" %import common.WS_INLINE %ignore WS_INLINE - """, parser="lalr", lexer="contextual") + """, parser="lalr") sample_conf = """ [bla] a=Hello this="that",4 +empty= """ print(parser.parse(sample_conf).pretty()) diff --git a/lark/grammar.py b/lark/grammar.py index b555c34..bf12b10 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -46,20 +46,17 @@ class Rule(object): class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + def __init__(self, keep_all_tokens=False, expand1=False, filter_out=False, priority=None): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 - self.create_token = create_token # used for scanless postprocessing self.priority = priority self.filter_out = filter_out # remove this rule from the tree - # used for "token"-rules in scanless def __repr__(self): - return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + return 'RuleOptions(%r, %r, %r, %r)' % ( self.keep_all_tokens, self.expand1, - self.create_token, self.priority, self.filter_out ) diff --git a/lark/lark.py b/lark/lark.py index 8ab2227..4fc0062 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -23,9 +23,9 @@ class LarkOptions(object): Note: "lalr" requires a lexer lexer - Decides whether or not to use a lexer stage - None: Don't use a lexer (scanless, only works with parser="earley") "standard": Use a standard lexer "contextual": Stronger lexer (only works with parser="lalr") + "dynamic": Flexible and powerful (only with parser="earley") "auto" (default): Choose for me based on grammar and parser ambiguity - Decides how to handle ambiguity in the parse. 
Only relevant if parser="earley" @@ -131,7 +131,7 @@ class Lark: if self.options.lexer == 'auto': if self.options.parser == 'lalr': - self.options.lexer = 'standard' + self.options.lexer = 'contextual' elif self.options.parser == 'earley': self.options.lexer = 'dynamic' elif self.options.parser == 'cyk': @@ -139,7 +139,7 @@ class Lark: else: assert False, self.options.parser lexer = self.options.lexer - assert lexer in ('standard', 'contextual', 'dynamic', None) + assert lexer in ('standard', 'contextual', 'dynamic') if self.options.ambiguity == 'auto': if self.options.parser == 'earley': @@ -154,7 +154,7 @@ class Lark: self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF - tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start) + tokens, self.rules, self.ignore_tokens = self.grammar.compile() self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 6800801..be96b1b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -363,12 +363,6 @@ class PrepareLiterals(InlineTransformer): regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) -class SplitLiterals(InlineTransformer): - def pattern(self, p): - if isinstance(p, PatternStr) and len(p.value)>1: - return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value]) - return ST('pattern', [p]) - class TokenTreeToPattern(Transformer): def pattern(self, ps): p ,= ps @@ -405,15 +399,6 @@ class TokenTreeToPattern(Transformer): def alias(self, t): raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") -def _interleave(l, item): - for e in l: - yield e - if isinstance(e, Tree): - if e.data in ('literal', 'range'): - yield item - elif is_terminal(e): - yield item - def _choice_of_rules(rules): return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) @@ -423,62 +408,9 @@ class Grammar: self.rule_defs = rule_defs self.ignore = ignore - def _prepare_scanless_grammar(self, start): - # XXX Pretty hacky! There should be a better way to write this method.. - - rule_defs = deepcopy(self.rule_defs) - term_defs = self.token_defs - - # Implement the "%ignore" feature without a lexer.. - terms_to_ignore = {name:'__'+name for name in self.ignore} - if terms_to_ignore: - assert set(terms_to_ignore) <= {name for name, _t in term_defs} - - term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] - expr = Token('RULE', '__ignore') - for r, tree, _o in rule_defs: - for exp in tree.find_data('expansion'): - exp.children = list(_interleave(exp.children, expr)) - if r == start: - exp.children = [expr] + exp.children - for exp in tree.find_data('expr'): - exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr))) - - _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')]) - rule_defs.append(('__ignore', _ignore_tree, None)) - - # Convert all tokens to rules - new_terminal_names = {name: '__token_'+name for name, _t in term_defs} - - for name, tree, options in rule_defs: - for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): - for i, sym in enumerate(exp.children): - if sym in new_terminal_names: - exp.children[i] = Token(sym.type, new_terminal_names[sym]) - - for name, (tree, priority) in term_defs: # TODO transfer priority to rule? 
- if any(tree.find_data('alias')): - raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") - - if name.startswith('_'): - options = RuleOptions(filter_out=True, priority=-priority) - else: - options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority) - - name = new_terminal_names[name] - inner_name = name + '_inner' - rule_defs.append((name, _choice_of_rules([inner_name]), None)) - rule_defs.append((inner_name, tree, options)) - - return [], rule_defs - - - def compile(self, lexer=False, start=None): - if not lexer: - token_defs, rule_defs = self._prepare_scanless_grammar(start) - else: - token_defs = list(self.token_defs) - rule_defs = self.rule_defs + def compile(self): + token_defs = list(self.token_defs) + rule_defs = self.rule_defs # ================= # Compile Tokens @@ -495,8 +427,6 @@ class Grammar: # 1. Pre-process terminals transformer = PrepareLiterals() - if not lexer: - transformer *= SplitLiterals() transformer *= ExtractAnonTokens(tokens) # Adds to tokens # 2. Convert EBNF to BNF (and apply step 1) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 1d4e2b8..94fcdb9 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -18,17 +18,6 @@ class ExpandSingleChild: return self.node_builder(children) -class CreateToken: - "Used for fixing the results of scanless parsing" - - def __init__(self, token_name, node_builder): - self.node_builder = node_builder - self.token_name = token_name - - def __call__(self, children): - return self.node_builder( [Token(self.token_name, ''.join(children))] ) - - class PropagatePositions: def __init__(self, node_builder): self.node_builder = node_builder @@ -116,10 +105,8 @@ class ParseTreeBuilder: options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand_single_child = options.expand1 if options else False - create_token = options.create_token if options else False wrapper_chain = filter(None, [ - create_token and partial(CreateToken, create_token), (expand_single_child and not rule.alias) and ExpandSingleChild, maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous), self.propagate_positions and PropagatePositions, diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e4401c1..b7a9225 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -72,30 +72,6 @@ def tokenize_text(text): col_start_pos = i + ch.rindex('\n') yield Token('CHAR', ch, line=line, column=i - col_start_pos) -class Earley_NoLex: - def __init__(self, lexer_conf, parser_conf, options=None): - self._prepare_match(lexer_conf) - - self.parser = earley.Parser(parser_conf, self.match, - resolve_ambiguity=get_ambiguity_resolver(options)) - - - def match(self, term, text, index=0): - return self.regexps[term.name].match(text, index) - - def _prepare_match(self, lexer_conf): - self.regexps = {} - for t in lexer_conf.tokens: - regexp = t.pattern.to_regexp() - width = get_regexp_width(regexp) - if width != (1,1): - raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width)) - self.regexps[t.name] = re.compile(regexp) - - def parse(self, text): - token_stream = tokenize_text(text) - return self.parser.parse(token_stream) - class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): self.init_traditional_lexer(lexer_conf) @@ -190,9 +166,7 @@ def 
get_frontend(parser, lexer): else: raise ValueError('Unknown lexer: %s' % lexer) elif parser=='earley': - if lexer is None: - return Earley_NoLex - elif lexer=='standard': + if lexer=='standard': return Earley elif lexer=='dynamic': return XEarley diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 61ce94e..2e155f7 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -168,7 +168,7 @@ class TreeBuilderAtoms: print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') def main(fobj, start): - lark_inst = Lark(fobj, parser="lalr", start=start) + lark_inst = Lark(fobj, parser="lalr", lexer="standard", start=start) lexer_atoms = LexerAtoms(lark_inst.parser.lexer) parser_atoms = ParserAtoms(lark_inst.parser.parser) diff --git a/tests/__main__.py b/tests/__main__.py index eebf0d9..5a30a4e 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -19,10 +19,10 @@ from .test_parser import ( TestEarleyStandard, TestCykStandard, TestLalrContextual, - TestEarleyScanless, + # TestEarleyScanless, TestEarleyDynamic, - TestFullEarleyScanless, + # TestFullEarleyScanless, TestFullEarleyDynamic, TestParsers, diff --git a/tests/test_parser.py b/tests/test_parser.py index 5c68bec..76ad509 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -48,9 +48,6 @@ class TestParsers(unittest.TestCase): self.assertRaises(GrammarError, Lark, g, parser='lalr') - l = Lark(g, parser='earley', lexer=None) - self.assertRaises(ParseError, l.parse, 'a') - l = Lark(g, parser='earley', lexer='dynamic') self.assertRaises(ParseError, l.parse, 'a') @@ -155,7 +152,7 @@ class TestParsers(unittest.TestCase): def _make_full_earley_test(LEXER): class _TestFullEarley(unittest.TestCase): - def test_anon_in_scanless(self): + def test_anon(self): # Fails an Earley implementation without special handling for empty rules, # or re-processing of already completed rules. g = Lark(r"""start: B @@ -164,14 +161,14 @@ def _make_full_earley_test(LEXER): self.assertEqual( g.parse('abc').children[0], 'abc') - def test_earley_scanless(self): + def test_earley(self): g = Lark("""start: A "b" c A: "a"+ c: "abc" """, parser="earley", lexer=LEXER) x = g.parse('aaaababc') - def test_earley_scanless2(self): + def test_earley2(self): grammar = """ start: statement+ @@ -187,24 +184,19 @@ def _make_full_earley_test(LEXER): l.parse(program) - # XXX Fails for scanless mode - # XXX Decided not to fix, because - # a) It's a subtle bug - # b) Scanless is intended for deprecation - # - # def test_earley_scanless3(self): - # "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" + def test_earley3(self): + "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" - # grammar = """ - # start: A A - # A: "a"+ - # """ + grammar = """ + start: A A + A: "a"+ + """ - # l = Lark(grammar, parser='earley', lexer=LEXER) - # res = l.parse("aaa") - # self.assertEqual(res.children, ['aa', 'a']) + l = Lark(grammar, parser='earley', lexer=LEXER) + res = l.parse("aaa") + self.assertEqual(res.children, ['aa', 'a']) - def test_earley_scanless4(self): + def test_earley4(self): grammar = """ start: A A? A: "a"+ @@ -259,7 +251,6 @@ def _make_full_earley_test(LEXER): assert x.data == '_ambig', x assert len(x.children) == 2 - @unittest.skipIf(LEXER==None, "BUG in scanless parsing!") # TODO fix bug! 
def test_fruitflies_ambig(self): grammar = """ start: noun verb noun -> simple @@ -350,7 +341,7 @@ def _make_full_earley_test(LEXER): # assert x.data != '_ambig', x # assert len(x.children) == 1 - _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize() + _NAME = "TestFullEarley" + LEXER.capitalize() _TestFullEarley.__name__ = _NAME globals()[_NAME] = _TestFullEarley @@ -402,7 +393,6 @@ def _make_parser_test(LEXER, PARSER): """) g.parse(u'\xa3\u0101\u00a3') - @unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing") def test_unicode2(self): g = _Lark(r"""start: UNIA UNIB UNIA UNIC UNIA: /\xa3/ @@ -614,11 +604,7 @@ def _make_parser_test(LEXER, PARSER): self.assertSequenceEqual(x.children, ['HelloWorld']) - @unittest.skipIf(LEXER is None, "Known bug with scanless parsing") # TODO def test_token_collision2(self): - # NOTE: This test reveals a bug in token reconstruction in Scanless Earley - # I probably need to re-write grammar transformation - g = _Lark(""" !start: "starts" @@ -662,7 +648,6 @@ def _make_parser_test(LEXER, PARSER): x = g.parse('aaaab') x = g.parse('b') - @unittest.skipIf(LEXER in (None, 'dynamic'), "Known bug with scanless parsing") # TODO def test_token_not_anon(self): """Tests that "a" is matched as A, rather than an anonymous token. @@ -755,7 +740,6 @@ def _make_parser_test(LEXER, PARSER): """) x = g.parse('AB') - @unittest.skipIf(LEXER == None, "Scanless can't handle regexps") def test_regex_quote(self): g = r""" start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING @@ -866,7 +850,6 @@ def _make_parser_test(LEXER, PARSER): """ self.assertRaises( GrammarError, _Lark, g) - @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO def test_line_and_column(self): g = r"""!start: "A" bc "D" !bc: "B\nC" @@ -1054,7 +1037,6 @@ def _make_parser_test(LEXER, PARSER): - @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_ignore(self): grammar = r""" @@ -1081,7 +1063,6 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, []) - @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") def test_regex_escaping(self): g = _Lark("start: /[ab]/") g.parse('a') @@ -1188,7 +1169,7 @@ def _make_parser_test(LEXER, PARSER): - _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize() + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME globals()[_NAME] = _TestParser @@ -1199,13 +1180,13 @@ _TO_TEST = [ ('dynamic', 'earley'), ('standard', 'lalr'), ('contextual', 'lalr'), - (None, 'earley'), + # (None, 'earley'), ] for _LEXER, _PARSER in _TO_TEST: _make_parser_test(_LEXER, _PARSER) -for _LEXER in (None, 'dynamic'): +for _LEXER in ('dynamic',): _make_full_earley_test(_LEXER) if __name__ == '__main__': From 1839c324d3c34fe57f0b03711c6814f042b8e8da Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 5 May 2018 17:32:46 +0300 Subject: [PATCH 14/34] Small refactoring step --- lark/load_grammar.py | 26 +++++++++++++++++++------- lark/utils.py | 7 ++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index be96b1b..e87870f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,6 +13,7 @@ from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef from .grammar import RuleOptions, 
Rule, Terminal, NonTerminal +from .utils import classify from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -108,10 +109,14 @@ RULES = { '?atom': ['_LPAR expansions _RPAR', 'maybe', - 'name', + 'terminal', + 'nonterminal', 'literal', 'range'], + 'terminal': ['TOKEN'], + 'nonterminal': ['RULE'], + '?name': ['RULE', 'TOKEN'], 'maybe': ['_LBRA expansions _RBRA'], @@ -514,6 +519,12 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): return [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] +class PrepareGrammar(InlineTransformer): + def terminal(self, name): + return name + def nonterminal(self, name): + return name + class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] @@ -554,15 +565,16 @@ class GrammarLoader: raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) raise - # Extract grammar items + tree = PrepareGrammar().transform(tree) - token_defs = [c.children for c in tree.children if c.data=='token'] - rule_defs = [c.children for c in tree.children if c.data=='rule'] - statements = [c.children for c in tree.children if c.data=='statement'] - assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children) + # Extract grammar items + defs = classify(tree.children, lambda c: c.data, lambda c: c.children) + token_defs = defs.pop('token', []) + rule_defs = defs.pop('rule', []) + statements = defs.pop('statement', []) + assert not defs token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs] - token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs] # Execute statements diff --git a/lark/utils.py b/lark/utils.py index f606704..0018e49 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -17,14 +17,15 @@ def classify_bool(seq, pred): return true_elems, false_elems -def classify(seq, key=None): +def classify(seq, key=None, value=None): d = {} for item in seq: k = key(item) if (key is not None) else item + v = value(item) if (value is not None) else item if k in d: - d[k].append(item) + d[k].append(v) else: - d[k] = [item] + d[k] = [v] return d def bfs(initial, expand): From 5a6e60456026e9ab6a8feac967845170625a7001 Mon Sep 17 00:00:00 2001 From: Rob Rose Date: Sat, 5 May 2018 22:25:20 -0400 Subject: [PATCH 15/34] Added .gitignore for pyenv --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cdb93cd --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.python-version From c5e6cf0954d49b592b162a1870223965742da84a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 11:26:53 +0300 Subject: [PATCH 16/34] Refactoring to introduce Symbol instances before creating anons --- lark/load_grammar.py | 62 ++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index e87870f..6af12d0 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef -from .grammar import RuleOptions, Rule, Terminal, NonTerminal +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify from .tree import Tree, 
Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -108,11 +108,13 @@ RULES = { ], '?atom': ['_LPAR expansions _RPAR', - 'maybe', - 'terminal', - 'nonterminal', - 'literal', - 'range'], + 'maybe', + 'value'], + + 'value': ['terminal', + 'nonterminal', + 'literal', + 'range'], 'terminal': ['TOKEN'], 'nonterminal': ['RULE'], @@ -149,7 +151,7 @@ class EBNF_to_BNF(InlineTransformer): new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) self.i += 1 - t = Token('RULE', new_name, -1) + t = NonTerminal(Token('RULE', new_name, -1)) tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) self.new_rules.append((new_name, tree, self.rule_options)) self.rules_by_expr[expr] = t @@ -235,7 +237,7 @@ class RuleTreeToText(Transformer): def expansions(self, x): return x def expansion(self, symbols): - return [sym.value for sym in symbols], None + return symbols, None def alias(self, x): (expansion, _alias), alias = x assert _alias is None, (alias, expansion, '-', _alias) @@ -305,7 +307,7 @@ class ExtractAnonTokens(InlineTransformer): self.token_reverse[p] = tokendef self.tokens.append(tokendef) - return Token('TOKEN', token_name, -1) + return Terminal(Token('TOKEN', token_name, -1)) def _rfind(s, choices): @@ -349,7 +351,7 @@ def _literal_to_pattern(literal): s = _fix_escaping(x) - if v[0] == '"': + if literal.type == 'STRING': s = s.replace('\\\\', '\\') return { 'STRING': PatternStr, @@ -368,6 +370,7 @@ class PrepareLiterals(InlineTransformer): regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) + class TokenTreeToPattern(Transformer): def pattern(self, ps): p ,= ps @@ -404,6 +407,17 @@ class TokenTreeToPattern(Transformer): def alias(self, t): raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") + def value(self, v): + return v[0] + +class PrepareSymbols(Transformer): + def value(self, v): + v ,= v + if isinstance(v, Tree): + return v + return {'TOKEN': Terminal, + 'RULE': NonTerminal}[v.type](v.value) + def _choice_of_rules(rules): return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) @@ -432,6 +446,7 @@ class Grammar: # 1. Pre-process terminals transformer = PrepareLiterals() + transformer *= PrepareSymbols() transformer *= ExtractAnonTokens(tokens) # Adds to tokens # 2. 
Convert EBNF to BNF (and apply step 1) @@ -458,7 +473,7 @@ class Grammar: if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - expansion = [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + assert all(isinstance(x, Symbol) for x in expansion), expansion rule = Rule(NonTerminal(name), expansion, alias, options) compiled_rules.append(rule) @@ -489,14 +504,14 @@ def resolve_token_references(token_defs): while True: changed = False for name, (token_tree, _p) in token_defs: - for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')): - for i, item in enumerate(exp.children): - if isinstance(item, Token): - if item.type == 'RULE': - raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name)) - if item.type == 'TOKEN': - exp.children[i] = token_dict[item] - changed = True + for exp in token_tree.find_data('value'): + item ,= exp.children + if isinstance(item, Token): + if item.type == 'RULE': + raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name)) + if item.type == 'TOKEN': + exp.children[0] = token_dict[item] + changed = True if not changed: break @@ -525,6 +540,7 @@ class PrepareGrammar(InlineTransformer): def nonterminal(self, name): return name + class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] @@ -609,9 +625,11 @@ class GrammarLoader: t2 ,= t.children if t2.data=='expansion' and len(t2.children) == 1: item ,= t2.children - if isinstance(item, Token) and item.type == 'TOKEN': - ignore_names.append(item.value) - continue + if item.data == 'value': + item ,= item.children + if isinstance(item, Token) and item.type == 'TOKEN': + ignore_names.append(item.value) + continue name = '__IGNORE_%d'% len(ignore_names) ignore_names.append(name) From 7b32ffd83a9d682c5c71df49467a6a481702d3f8 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 12:05:11 +0300 Subject: [PATCH 17/34] Fixed token visibility rules (Issue #109) Anonymous tokens would become visible if they had the same value as named tokens. That's because they are merged for the lexer. But after this change, the rules for visibility are based on their use in the rule, and not their name or identity. 
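
A minimal sketch of the resulting behavior, assuming a Lark build that
includes this change (it mirrors the updated test_token_not_anon below):

    from lark import Lark

    # "a" appears only as an anonymous string in the rule, so it is
    # filtered out of the tree, even though the named terminal A has
    # the same value.
    parser = Lark('''start: "a" A
                     A: "a"
                  ''')
    tree = parser.parse("aa")
    assert len(tree.children) == 1        # the anonymous "a" was dropped
    assert tree.children[0].type == "A"   # only the named use survives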
--- lark/grammar.py | 14 ++++++-------- lark/load_grammar.py | 11 +++++++---- lark/parse_tree_builder.py | 11 ++++------- lark/parser_frontends.py | 9 ++------- tests/test_parser.py | 19 ++++++++----------- 5 files changed, 27 insertions(+), 37 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index bf12b10..37c2997 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -20,9 +20,10 @@ class Symbol(object): class Terminal(Symbol): is_term = True - @property - def filter_out(self): - return self.name.startswith('_') + def __init__(self, name, filter_out=False): + self.name = name + self.filter_out = filter_out + class NonTerminal(Symbol): is_term = False @@ -46,17 +47,14 @@ class Rule(object): class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, filter_out=False, priority=None): + def __init__(self, keep_all_tokens=False, expand1=False, priority=None): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.priority = priority - self.filter_out = filter_out # remove this rule from the tree - def __repr__(self): - return 'RuleOptions(%r, %r, %r, %r)' % ( + return 'RuleOptions(%r, %r, %r)' % ( self.keep_all_tokens, self.expand1, self.priority, - self.filter_out ) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 6af12d0..9ebacb1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -307,7 +307,7 @@ class ExtractAnonTokens(InlineTransformer): self.token_reverse[p] = tokendef self.tokens.append(tokendef) - return Terminal(Token('TOKEN', token_name, -1)) + return Terminal(Token('TOKEN', token_name, -1), filter_out=isinstance(p, PatternStr)) def _rfind(s, choices): @@ -415,8 +415,11 @@ class PrepareSymbols(Transformer): v ,= v if isinstance(v, Tree): return v - return {'TOKEN': Terminal, - 'RULE': NonTerminal}[v.type](v.value) + elif v.type == 'RULE': + return NonTerminal(v.value) + elif v.type == 'TOKEN': + return Terminal(v.value, filter_out=v.startswith('_')) + assert False def _choice_of_rules(rules): return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) @@ -532,7 +535,7 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): - return [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion] + return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion] class PrepareGrammar(InlineTransformer): def terminal(self, name): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 94fcdb9..59bbc86 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -75,8 +75,9 @@ class ChildFilterLALR(ChildFilter): def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') -def maybe_create_child_filter(expansion, filter_out, ambiguous): - to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): + to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) + if keep_all_tokens or not (sym.is_term and sym.filter_out)] if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) @@ -97,10 +98,6 @@ class ParseTreeBuilder: self.user_aliases = {} def _init_builders(self, rules): - filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} - filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out} 
- assert all(t.name.startswith('_') for t in filter_out) - for rule in rules: options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) @@ -108,7 +105,7 @@ class ParseTreeBuilder: wrapper_chain = filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), self.propagate_positions and PropagatePositions, ]) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index b7a9225..f322524 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,11 +7,6 @@ from .lexer import Lexer, ContextualLexer, Token from .common import GrammarError from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree -from .grammar import Terminal, NonTerminal - -def terminals(seq): - # return [Terminal(t) for t in seq] - return seq class WithLexer: def init_traditional_lexer(self, lexer_conf): @@ -23,8 +18,8 @@ class WithLexer: states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () self.lexer = ContextualLexer(lexer_conf.tokens, states, - ignore=terminals(lexer_conf.ignore), - always_accept=terminals(always_accept), + ignore=lexer_conf.ignore, + always_accept=always_accept, user_callbacks=lexer_conf.callbacks) def lex(self, text): diff --git a/tests/test_parser.py b/tests/test_parser.py index 76ad509..21a3dc6 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -649,28 +649,25 @@ def _make_parser_test(LEXER, PARSER): x = g.parse('b') def test_token_not_anon(self): - """Tests that "a" is matched as A, rather than an anonymous token. - - That means that "a" is not filtered out, despite being an 'immediate string'. - Whether or not this is the intuitive behavior, I'm not sure yet. - - Perhaps the right thing to do is report a collision (if such is relevant) - - -Erez + """Tests that "a" is matched as an anonymous token, and not A. 
""" g = _Lark("""start: "a" A: "a" """) x = g.parse('a') + self.assertEqual(len(x.children), 0, '"a" should be considered anonymous') - self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous') + g = _Lark("""start: "a" A + A: "a" """) + x = g.parse('aa') + self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous') self.assertEqual(x.children[0].type, "A") g = _Lark("""start: /a/ A: /a/ """) x = g.parse('a') - self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous') - self.assertEqual(x.children[0].type, "A") + self.assertEqual(len(x.children), 1) + self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/") @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_maybe(self): From 0d56b0cf303f71c64fa68c7a061c5e69cef788f6 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 12:13:22 +0300 Subject: [PATCH 18/34] Anon terminals no longer need to start with _ --- lark/load_grammar.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 9ebacb1..95a96f5 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -254,7 +254,7 @@ class CanonizeTree(InlineTransformer): tokenmods, value = args return tokenmods + [value] -class ExtractAnonTokens(InlineTransformer): +class PrepareAnonTerminals(InlineTransformer): "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them" def __init__(self, tokens): @@ -278,7 +278,7 @@ class ExtractAnonTokens(InlineTransformer): try: token_name = _TOKEN_NAMES[value] except KeyError: - if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set: + if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set: token_name = '%s%d' % (value.upper(), self.i) try: # Make sure we don't have unicode in our token names @@ -289,8 +289,6 @@ class ExtractAnonTokens(InlineTransformer): token_name = 'ANONSTR_%d' % self.i self.i += 1 - token_name = '__' + token_name - elif isinstance(p, PatternRE): if p in self.token_reverse: # Kind of a wierd placement.name token_name = self.token_reverse[p].name @@ -448,9 +446,7 @@ class Grammar: # ================= # 1. Pre-process terminals - transformer = PrepareLiterals() - transformer *= PrepareSymbols() - transformer *= ExtractAnonTokens(tokens) # Adds to tokens + transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(tokens) # Adds to tokens # 2. 
Convert EBNF to BNF (and apply step 1) ebnf_to_bnf = EBNF_to_BNF() From ea413fd648f219dd3336130563e5b8e725fee453 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 8 May 2018 12:24:01 +0300 Subject: [PATCH 19/34] Simplify PrepareAnonTerminals --- lark/load_grammar.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 95a96f5..3aa9827 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol -from .utils import classify +from .utils import classify, suppress from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST @@ -269,35 +269,32 @@ class PrepareAnonTerminals(InlineTransformer): if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags: raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) + token_name = None + if isinstance(p, PatternStr): try: # If already defined, use the user-defined token name token_name = self.token_reverse[p].name except KeyError: - # Try to assign an indicative anon-token name, otherwise use a numbered name + # Try to assign an indicative anon-token name try: token_name = _TOKEN_NAMES[value] except KeyError: if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set: - token_name = '%s%d' % (value.upper(), self.i) - try: - # Make sure we don't have unicode in our token names - token_name.encode('ascii') - except UnicodeEncodeError: - token_name = 'ANONSTR_%d' % self.i - else: - token_name = 'ANONSTR_%d' % self.i - self.i += 1 + with suppress(UnicodeEncodeError): + value.upper().encode('ascii') # Make sure we don't have unicode in our token names + token_name = value.upper() elif isinstance(p, PatternRE): if p in self.token_reverse: # Kind of a wierd placement.name token_name = self.token_reverse[p].name - else: - token_name = 'ANONRE_%d' % self.i - self.i += 1 else: assert False, p + if token_name is None: + token_name = '__ANON_%d' % self.i + self.i += 1 + if token_name not in self.token_set: assert p not in self.token_reverse self.token_set.add(token_name) From 2b4ef11ebf1770871d5b03f1f245701beab43072 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 10 May 2018 12:34:19 +0300 Subject: [PATCH 20/34] Columns now start at 1 --- lark/lexer.py | 4 ++-- lark/parsers/xearley.py | 4 ++-- tests/test_parser.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 19e1be4..51ccf6c 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -63,7 +63,7 @@ class LineCounter: self.newline_char = '\n' self.char_pos = 0 self.line = 1 - self.column = 0 + self.column = 1 self.line_start_pos = 0 def feed(self, token, test_newline=True): @@ -78,7 +78,7 @@ class LineCounter: self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos + self.column = self.char_pos - self.line_start_pos + 1 class _Lex: "Built to serve both Lexer and ContextualLexer" diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index c64bfee..5e8fb28 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -53,7 +53,7 @@ class Parser: match = self.term_matcher text_line = 1 - text_column 
= 0 + text_column = 1 def predict(nonterm, column): assert not nonterm.is_term, nonterm @@ -128,7 +128,7 @@ class Parser: if token == '\n': text_line += 1 - text_column = 0 + text_column = 1 else: text_column += 1 diff --git a/tests/test_parser.py b/tests/test_parser.py index 21a3dc6..6823f0c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -854,22 +854,22 @@ def _make_parser_test(LEXER, PARSER): l = _Lark(g) a, bc, d = l.parse("AB\nCD").children self.assertEqual(a.line, 1) - self.assertEqual(a.column, 0) + self.assertEqual(a.column, 1) bc ,= bc.children self.assertEqual(bc.line, 1) - self.assertEqual(bc.column, 1) + self.assertEqual(bc.column, 2) self.assertEqual(d.line, 2) - self.assertEqual(d.column, 1) + self.assertEqual(d.column, 2) if LEXER != 'dynamic': self.assertEqual(a.end_line, 1) - self.assertEqual(a.end_column, 1) + self.assertEqual(a.end_column, 2) self.assertEqual(bc.end_line, 2) - self.assertEqual(bc.end_column, 1) + self.assertEqual(bc.end_column, 2) self.assertEqual(d.end_line, 2) - self.assertEqual(d.end_column, 2) + self.assertEqual(d.end_column, 3) From 55e9d5679264e86c977180a6d48047174ca2ec3b Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 12 May 2018 23:39:23 +0300 Subject: [PATCH 21/34] Missed those at merge --- lark/load_grammar.py | 5 +++-- lark/transformers.py | 27 +++++++++++++++++++++++++++ tests/test_trees.py | 3 ++- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 5fee242..c710bf8 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -407,7 +407,7 @@ class TokenTreeToPattern(ChildrenTransformer): def value(self, v): return v[0] -class PrepareSymbols(Transformer): +class PrepareSymbols(ChildrenTransformer): def value(self, v): v ,= v if isinstance(v, Tree): @@ -532,7 +532,8 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion] -class PrepareGrammar(InlineTransformer): +@inline_args +class PrepareGrammar(ChildrenTransformer): def terminal(self, name): return name def nonterminal(self, name): diff --git a/lark/transformers.py b/lark/transformers.py index b9f63ce..d5f13a1 100644 --- a/lark/transformers.py +++ b/lark/transformers.py @@ -78,6 +78,8 @@ class Transformer_InPlace(Transformer): return self._transform(tree) class Visitor(Base): + "Bottom-up visitor" + def visit(self, tree): for subtree in tree.iter_subtrees(): self._call_userfunc(subtree) @@ -99,6 +101,31 @@ class Visitor_Recursive(Base): return tree +from functools import wraps +def visit_children_decor(func): + @wraps(func) + def inner(cls, tree): + values = cls.visit_children(tree) + return func(cls, values) + return inner + +class Interpreter(object): + "Top-down visitor" + + def visit(self, tree): + return getattr(self, tree.data)(tree) + + def visit_children(self, tree): + return [self.visit(child) if isinstance(child, Tree) else child + for child in tree.children] + + def __getattr__(self, name): + return self.__default__ + + def __default__(self, tree): + return self.visit_children(tree) + + def inline_args(obj): if inspect.isclass(obj) and issubclass(obj, ChildrenTransformer): class _NewTransformer(ChildrenInlineTransformer, obj): diff --git a/tests/test_trees.py b/tests/test_trees.py index 6017386..a41d3de 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -5,7 +5,8 @@ from unittest import TestCase import copy import pickle -from lark.tree import Tree, Interpreter, 
visit_children_decor
+from lark.tree import Tree
+from lark.transformers import Interpreter, visit_children_decor
 
 
 class TestTrees(TestCase):

From 1508dcd7c5f37f62527e9282050df59b237196e7 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Sat, 12 May 2018 23:54:22 +0300
Subject: [PATCH 22/34] Refactored inline_args with smart_decorator

---
 lark/utils.py | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/lark/utils.py b/lark/utils.py
index 0018e49..5e09e7d 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -55,34 +55,34 @@ from contextlib import contextmanager
 
 Str = type(u'')
 
-def inline_args(f):
-    # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType)
+def smart_decorator(f, create_decorator):
     if isinstance(f, types.FunctionType):
-        @functools.wraps(f)
-        def _f_func(self, args):
-            return f(self, *args)
-        return _f_func
+        return functools.wraps(f)(create_decorator(f, True))
+
     elif isinstance(f, (type, types.BuiltinFunctionType)):
-        @functools.wraps(f)
-        def _f_builtin(_self, args):
-            return f(*args)
-        return _f_builtin
+        return functools.wraps(f)(create_decorator(f, False))
+
     elif isinstance(f, types.MethodType):
-        @functools.wraps(f.__func__)
-        def _f(self, args):
-            return f.__func__(self, *args)
-        return _f
+        return functools.wraps(f.__func__)(create_decorator(f.__func__, True))
+
     elif isinstance(f, functools.partial):
         # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
-        # @functools.wraps(f)
-        def _f(self, args):
-            return f(*args)
-        return _f
+        return create_decorator(f.func, True)
+
     else:
-        @functools.wraps(f.__call__.__func__)
-        def _f(self, args):
-            return f.__call__.__func__(self, *args)
-        return _f
+        return create_decorator(f.__func__.__call__, True)
+
+
+def inline_args(f):
+    def create_decorator(_f, with_self):
+        if with_self:
+            def f(self, args):
+                return _f(self, *args)
+        else:
+            def f(args):
+                return _f(*args)
+        return f
+
+    return smart_decorator(f, create_decorator)
 
 
 try:

From 9daacb9082dea08fee0f81eb94f353fd5af559e2 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Sun, 13 May 2018 00:42:50 +0300
Subject: [PATCH 23/34] Refactored transformers, better code

---
 lark/__init__.py     |   1 -
 lark/load_grammar.py |  31 ++++++------
 lark/transformers.py | 113 ++++++++++++++++++++++++++++---------------
 lark/tree.py         |   2 -
 lark/utils.py        |  20 ++------
 tests/test_parser.py |   4 +-
 6 files changed, 98 insertions(+), 73 deletions(-)

diff --git a/lark/__init__.py b/lark/__init__.py
index a2a67b9..5613855 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -3,6 +3,5 @@ from .transformers import Transformer
 from .common import ParseError, GrammarError, UnexpectedToken
 from .lexer import UnexpectedInput, LexError
 from .lark import Lark
-from .utils import inline_args
 
 __version__ = "0.5.6"
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index c710bf8..49b4b8f 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -16,7 +16,7 @@ from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress
 
 from .tree import Tree, SlottedTree as ST
-from .transformers import Transformer, ChildrenTransformer, inline_args, Visitor
+from .transformers import Transformer, Visitor, children_args, children_args_inline
 
 __path__ = os.path.dirname(__file__)
 IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
@@ -138,8 +138,8 @@ RULES = {
 }
 
-@inline_args
-class EBNF_to_BNF(ChildrenTransformer):
+@children_args_inline
+class EBNF_to_BNF(Transformer): def __init__(self): self.new_rules = [] self.rules_by_expr = {} @@ -232,7 +232,8 @@ class SimplifyRule_Visitor(Visitor): tree.children = list(set(tree.children)) -class RuleTreeToText(ChildrenTransformer): +@children_args +class RuleTreeToText(Transformer): def expansions(self, x): return x def expansion(self, symbols): @@ -243,8 +244,8 @@ class RuleTreeToText(ChildrenTransformer): return expansion, alias.value -@inline_args -class CanonizeTree(ChildrenTransformer): +@children_args_inline +class CanonizeTree(Transformer): def maybe(self, expr): return ST('expr', [expr, Token('OP', '?', -1)]) @@ -254,8 +255,8 @@ class CanonizeTree(ChildrenTransformer): tokenmods, value = args return tokenmods + [value] -@inline_args -class PrepareAnonTerminals(ChildrenTransformer): +@children_args_inline +class PrepareAnonTerminals(Transformer): "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them" def __init__(self, tokens): @@ -354,8 +355,8 @@ def _literal_to_pattern(literal): 'REGEXP': PatternRE }[literal.type](s, flags) -@inline_args -class PrepareLiterals(ChildrenTransformer): +@children_args_inline +class PrepareLiterals(Transformer): def literal(self, literal): return ST('pattern', [_literal_to_pattern(literal)]) @@ -368,7 +369,8 @@ class PrepareLiterals(ChildrenTransformer): return ST('pattern', [PatternRE(regexp)]) -class TokenTreeToPattern(ChildrenTransformer): +@children_args +class TokenTreeToPattern(Transformer): def pattern(self, ps): p ,= ps return p @@ -407,7 +409,8 @@ class TokenTreeToPattern(ChildrenTransformer): def value(self, v): return v[0] -class PrepareSymbols(ChildrenTransformer): +@children_args +class PrepareSymbols(Transformer): def value(self, v): v ,= v if isinstance(v, Tree): @@ -532,8 +535,8 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion] -@inline_args -class PrepareGrammar(ChildrenTransformer): +@children_args_inline +class PrepareGrammar(Transformer): def terminal(self, name): return name def nonterminal(self, name): diff --git a/lark/transformers.py b/lark/transformers.py index d5f13a1..41a3980 100644 --- a/lark/transformers.py +++ b/lark/transformers.py @@ -1,7 +1,7 @@ import inspect from functools import wraps -from . 
import utils +from .utils import smart_decorator from .tree import Tree class Discard(Exception): @@ -13,46 +13,27 @@ class Base: return getattr(self, tree.data, self.__default__)(tree) def __default__(self, tree): + "Default operation on tree (for override)" return tree class Transformer(Base): def _transform_children(self, children): for c in children: try: - yield self._transform(c) if isinstance(c, Tree) else c + yield self._transform_tree(c) if isinstance(c, Tree) else c except Discard: pass - def _transform(self, tree): + def _transform_tree(self, tree): tree = Tree(tree.data, list(self._transform_children(tree.children))) return self._call_userfunc(tree) def transform(self, tree): - return self._transform(tree) + return self._transform_tree(tree) def __mul__(self, other): return TransformerChain(self, other) -class ChildrenTransformer(Transformer): - def _call_userfunc(self, tree): - # Assumes tree is already transformed - try: - f = getattr(self, tree.data) - except AttributeError: - return self.__default__(tree) - else: - return f(tree.children) - -class ChildrenInlineTransformer(Transformer): - def _call_userfunc(self, tree): - # Assumes tree is already transformed - try: - f = getattr(self, tree.data) - except AttributeError: - return self.__default__(tree) - else: - return f(*tree.children) - class TransformerChain(object): def __init__(self, *transformers): @@ -68,14 +49,22 @@ class TransformerChain(object): class Transformer_InPlace(Transformer): - def _transform(self, tree): + def _transform_tree(self, tree): # Cancel recursion return self._call_userfunc(tree) def transform(self, tree): for subtree in tree.iter_subtrees(): subtree.children = list(self._transform_children(subtree.children)) - return self._transform(tree) + return self._transform_tree(tree) + + +class Transformer_InPlaceRecursive(Transformer): + def _transform_tree(self, tree): + tree.children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree) + + class Visitor(Base): "Bottom-up visitor" @@ -85,11 +74,6 @@ class Visitor(Base): self._call_userfunc(subtree) return tree -class Transformer_InPlaceRecursive(Transformer): - def _transform(self, tree): - tree.children = list(self._transform_children(tree.children)) - return self._call_userfunc(tree) - class Visitor_Recursive(Base): def visit(self, tree): for child in tree.children: @@ -101,7 +85,6 @@ class Visitor_Recursive(Base): return tree -from functools import wraps def visit_children_decor(func): @wraps(func) def inner(cls, tree): @@ -126,11 +109,63 @@ class Interpreter(object): return self.visit_children(tree) -def inline_args(obj): - if inspect.isclass(obj) and issubclass(obj, ChildrenTransformer): - class _NewTransformer(ChildrenInlineTransformer, obj): - pass - return _NewTransformer - else: - return utils.inline_args(obj) +def _children_args__func(f): + @wraps(f) + def create_decorator(_f, with_self): + if with_self: + def f(self, tree): + return _f(self, tree.children) + else: + def f(args): + return _f(tree.children) + + return smart_decorator(f, create_decorator) + +def _children_args__class(cls): + def _call_userfunc(self, tree): + # Assumes tree is already transformed + try: + f = getattr(self, tree.data) + except AttributeError: + return self.__default__(tree) + else: + return f(tree.children) + cls._call_userfunc = _call_userfunc + return cls + + +def children_args(obj): + decorator = _children_args__class if issubclass(obj, Base) else _children_args__func + return decorator(obj) + + + +def 
_children_args_inline__func(f): + @wraps(f) + def create_decorator(_f, with_self): + if with_self: + def f(self, tree): + return _f(self, *tree.children) + else: + def f(args): + return _f(*tree.children) + + return smart_decorator(f, create_decorator) + + +def _children_args_inline__class(cls): + def _call_userfunc(self, tree): + # Assumes tree is already transformed + try: + f = getattr(self, tree.data) + except AttributeError: + return self.__default__(tree) + else: + return f(*tree.children) + cls._call_userfunc = _call_userfunc + return cls + +def children_args_inline(obj): + decorator = _children_args_inline__class if issubclass(obj, Base) else _children_args_inline__func + return decorator(obj) diff --git a/lark/tree.py b/lark/tree.py index a2b0bef..e20c18d 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -5,8 +5,6 @@ except ImportError: from copy import deepcopy -from .utils import inline_args - class Meta: pass diff --git a/lark/utils.py b/lark/utils.py index 5e09e7d..a4a63dd 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -50,22 +50,22 @@ except NameError: # Python 3 ###{standalone import types -import functools +from functools import wraps, partial from contextlib import contextmanager Str = type(u'') def smart_decorator(f, create_decorator): if isinstance(f, types.FunctionType): - return functools.wraps(create_decorator(f, True)) + return wraps(create_decorator(f, True)) elif isinstance(f, (type, types.BuiltinFunctionType)): - return functools.wraps(create_decorator(f, False)) + return wraps(create_decorator(f, False)) elif isinstance(f, types.MethodType): - return functools.wraps(create_decorator(f.__func__, True)) + return wraps(create_decorator(f.__func__, True)) - elif isinstance(f, functools.partial): + elif isinstance(f, partial): # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 return create_decorator(f.__func__, True) @@ -73,16 +73,6 @@ def smart_decorator(f, create_decorator): return create_decorator(f.__func__.__call__, True) -def inline_args(f): - def create_decorator(_f, with_self): - if with_self: - def f(self, args): - return _f(self, *args) - else: - def f(args): - return _f(*args) - - return smart_decorator(f, create_decorator) try: diff --git a/tests/test_parser.py b/tests/test_parser.py index e147595..6531cb6 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -21,8 +21,7 @@ from lark.lark import Lark from lark.common import GrammarError, ParseError, UnexpectedToken from lark.lexer import LexError, UnexpectedInput from lark.tree import Tree -from lark.transformers import ChildrenTransformer as Transformer -# from lark.tree import Transformer +from lark.transformers import Transformer, children_args __path__ = os.path.dirname(__file__) def _read(n, *args): @@ -94,6 +93,7 @@ class TestParsers(unittest.TestCase): self.assertEqual( r.children[0].data, "c" ) def test_embedded_transformer(self): + @children_args class T(Transformer): def a(self, children): return "" From 5e546f38a954a1b8032d7d28ea403e50504e6e05 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 13 May 2018 00:54:06 +0300 Subject: [PATCH 24/34] args decorators actually work now --- lark/__init__.py | 2 +- lark/load_grammar.py | 6 +++--- lark/parsers/earley.py | 2 +- lark/utils.py | 6 +++--- lark/{transformers.py => visitors.py} | 8 +++++--- tests/test_parser.py | 2 +- tests/test_trees.py | 2 +- 7 files changed, 15 insertions(+), 13 deletions(-) rename lark/{transformers.py => visitors.py} (93%) diff --git a/lark/__init__.py b/lark/__init__.py index 
5613855..b36b3fc 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,5 +1,5 @@ from .tree import Tree -from .transformers import Transformer +from .visitors import Transformer, Visitor, children_args, children_args_inline from .common import ParseError, GrammarError, UnexpectedToken from .lexer import UnexpectedInput, LexError from .lark import Lark diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 49b4b8f..6262f62 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -16,7 +16,7 @@ from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress from .tree import Tree, SlottedTree as ST -from .transformers import Transformer, Visitor, children_args, children_args_inline +from .visitors import Transformer, Visitor, children_args, children_args_inline __path__ = os.path.dirname(__file__) IMPORT_PATHS = [os.path.join(__path__, 'grammars')] @@ -255,7 +255,6 @@ class CanonizeTree(Transformer): tokenmods, value = args return tokenmods + [value] -@children_args_inline class PrepareAnonTerminals(Transformer): "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them" @@ -266,6 +265,7 @@ class PrepareAnonTerminals(Transformer): self.i = 0 + @children_args_inline def pattern(self, p): value = p.value if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags: @@ -409,8 +409,8 @@ class TokenTreeToPattern(Transformer): def value(self, v): return v[0] -@children_args class PrepareSymbols(Transformer): + @children_args def value(self, v): v ,= v if isinstance(v, Tree): diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 5f92fbb..d58b57c 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ # Email : erezshin@gmail.com from ..tree import Tree -from ..transformers import Transformer_InPlace +from ..visitors import Transformer_InPlace from ..common import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal diff --git a/lark/utils.py b/lark/utils.py index a4a63dd..6f603bb 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -57,13 +57,13 @@ Str = type(u'') def smart_decorator(f, create_decorator): if isinstance(f, types.FunctionType): - return wraps(create_decorator(f, True)) + return wraps(f)(create_decorator(f, True)) elif isinstance(f, (type, types.BuiltinFunctionType)): - return wraps(create_decorator(f, False)) + return wraps(f)(create_decorator(f, False)) elif isinstance(f, types.MethodType): - return wraps(create_decorator(f.__func__, True)) + return wraps(f)(create_decorator(f.__func__, True)) elif isinstance(f, partial): # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 diff --git a/lark/transformers.py b/lark/visitors.py similarity index 93% rename from lark/transformers.py rename to lark/visitors.py index 41a3980..a1167fa 100644 --- a/lark/transformers.py +++ b/lark/visitors.py @@ -1,4 +1,4 @@ -import inspect +from inspect import isclass from functools import wraps from .utils import smart_decorator @@ -119,6 +119,7 @@ def _children_args__func(f): else: def f(args): return _f(tree.children) + return f return smart_decorator(f, create_decorator) @@ -136,7 +137,7 @@ def _children_args__class(cls): def children_args(obj): - decorator = _children_args__class if issubclass(obj, Base) else _children_args__func + decorator = _children_args__class if isclass(obj) and issubclass(obj, Base) else _children_args__func return decorator(obj) @@ -150,6 
+151,7 @@ def _children_args_inline__func(f): else: def f(args): return _f(*tree.children) + return f return smart_decorator(f, create_decorator) @@ -167,5 +169,5 @@ def _children_args_inline__class(cls): return cls def children_args_inline(obj): - decorator = _children_args_inline__class if issubclass(obj, Base) else _children_args_inline__func + decorator = _children_args_inline__class if isclass(obj) and issubclass(obj, Base) else _children_args_inline__func return decorator(obj) diff --git a/tests/test_parser.py b/tests/test_parser.py index 6531cb6..25ce619 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -21,7 +21,7 @@ from lark.lark import Lark from lark.common import GrammarError, ParseError, UnexpectedToken from lark.lexer import LexError, UnexpectedInput from lark.tree import Tree -from lark.transformers import Transformer, children_args +from lark.visitors import Transformer, children_args __path__ = os.path.dirname(__file__) def _read(n, *args): diff --git a/tests/test_trees.py b/tests/test_trees.py index a41d3de..df3b9b6 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -6,7 +6,7 @@ import copy import pickle from lark.tree import Tree -from lark.transformers import Interpreter, visit_children_decor +from lark.visitors import Interpreter, visit_children_decor class TestTrees(TestCase): From 4864a1cf4dedb8b5fc4bea0f9f706b4ebd2d64f5 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 15 May 2018 10:59:20 +0300 Subject: [PATCH 25/34] More work --- examples/calc.py | 4 +- lark/parsers/earley.py | 7 ++- lark/parsers/resolve_ambig.py | 12 ++--- lark/visitors.py | 92 +++++++++++++++++++++-------------- tests/test_trees.py | 54 +++++++++++++++++++- 5 files changed, 119 insertions(+), 50 deletions(-) diff --git a/examples/calc.py b/examples/calc.py index cb7ef5d..e90b5cc 100644 --- a/examples/calc.py +++ b/examples/calc.py @@ -2,7 +2,7 @@ # This example shows how to write a basic calculator with variables. 
# -from lark import Lark, InlineTransformer +from lark import Lark, Transformer, children_args_inline try: input = raw_input # For Python2 compatibility @@ -34,7 +34,7 @@ calc_grammar = """ %ignore WS_INLINE """ -class CalculateTree(InlineTransformer): +class CalculateTree(SimpleTransformer): from operator import add, sub, mul, truediv as div, neg number = float diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index d58b57c..87dc41e 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -21,11 +21,10 @@ from ..grammar import NonTerminal class Derivation(Tree): - _hash = None - def __init__(self, rule, items=None): Tree.__init__(self, 'drv', items or []) - self.rule = rule + self.meta.rule = rule + self._hash = None def _pretty_label(self): # Nicer pretty for debugging the parser return self.rule.origin if self.rule else self.data @@ -236,4 +235,4 @@ class ApplyCallbacks(Transformer_InPlace): self.postprocess = postprocess def drv(self, tree): - return self.postprocess[tree.rule](tree.children) + return self.postprocess[tree.meta.rule](tree.children) diff --git a/lark/parsers/resolve_ambig.py b/lark/parsers/resolve_ambig.py index 7c482ae..0d3f17c 100644 --- a/lark/parsers/resolve_ambig.py +++ b/lark/parsers/resolve_ambig.py @@ -16,7 +16,7 @@ def _sum_priority(tree): for n in tree.iter_subtrees(): try: - p += n.rule.options.priority or 0 + p += n.meta.rule.options.priority or 0 except AttributeError: pass @@ -26,8 +26,8 @@ def _compare_priority(tree1, tree2): tree1.iter_subtrees() def _compare_drv(tree1, tree2): - rule1 = getattr(tree1, 'rule', None) - rule2 = getattr(tree2, 'rule', None) + rule1 = getattr(tree1.meta, 'rule', None) + rule2 = getattr(tree2.meta, 'rule', None) if None == rule1 == rule2: return compare(tree1, tree2) @@ -45,7 +45,7 @@ def _compare_drv(tree1, tree2): if c: return c - c = _compare_rules(tree1.rule, tree2.rule) + c = _compare_rules(tree1.meta.rule, tree2.meta.rule) if c: return c @@ -65,7 +65,7 @@ def _standard_resolve_ambig(tree): best = max(tree.children, key=key_f) assert best.data == 'drv' tree.set('drv', best.children) - tree.rule = best.rule # needed for applying callbacks + tree.meta.rule = best.meta.rule # needed for applying callbacks def standard_resolve_ambig(tree): for ambig in tree.find_data('_ambig'): @@ -93,7 +93,7 @@ def _antiscore_sum_resolve_ambig(tree): best = min(tree.children, key=_antiscore_sum_drv) assert best.data == 'drv' tree.set('drv', best.children) - tree.rule = best.rule # needed for applying callbacks + tree.meta.rule = best.meta.rule # needed for applying callbacks def antiscore_sum_resolve_ambig(tree): for ambig in tree.find_data('_ambig'): diff --git a/lark/visitors.py b/lark/visitors.py index a1167fa..d3853bf 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -1,4 +1,4 @@ -from inspect import isclass +from inspect import isclass, getmembers, getmro from functools import wraps from .utils import smart_decorator @@ -16,6 +16,30 @@ class Base: "Default operation on tree (for override)" return tree + @classmethod + def _apply_decorator(cls, decorator): + mro = getmro(cls) + assert mro[0] is cls + libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} + for name, value in getmembers(cls): + if name.startswith('_') or name in libmembers: + continue + + setattr(cls, name, decorator(value)) + return cls + + +class SimpleBase(Base): + def _call_userfunc(self, tree): + # Assumes tree is already transformed + try: + f = getattr(self, tree.data) + except AttributeError: + return 
self.__default__(tree) + else: + return f(tree.children) + + class Transformer(Base): def _transform_children(self, children): for c in children: @@ -35,6 +59,7 @@ class Transformer(Base): return TransformerChain(self, other) + class TransformerChain(object): def __init__(self, *transformers): self.transformers = transformers @@ -110,8 +135,22 @@ class Interpreter(object): -def _children_args__func(f): - @wraps(f) + + +def _apply_decorator(obj, decorator): + try: + _apply = obj._apply_decorator + except AttributeError: + return decorator(obj) + else: + return _apply(decorator) + + +def _children_args__func(func): + if getattr(func, '_children_args_decorated', False): + return func + + @wraps(func) def create_decorator(_f, with_self): if with_self: def f(self, tree): @@ -119,55 +158,34 @@ def _children_args__func(f): else: def f(args): return _f(tree.children) + f._children_args_decorated = True return f - return smart_decorator(f, create_decorator) - -def _children_args__class(cls): - def _call_userfunc(self, tree): - # Assumes tree is already transformed - try: - f = getattr(self, tree.data) - except AttributeError: - return self.__default__(tree) - else: - return f(tree.children) - cls._call_userfunc = _call_userfunc - return cls - + return smart_decorator(func, create_decorator) def children_args(obj): - decorator = _children_args__class if isclass(obj) and issubclass(obj, Base) else _children_args__func - return decorator(obj) + return _apply_decorator(obj, _children_args__func) -def _children_args_inline__func(f): - @wraps(f) +def _children_args_inline__func(func): + if getattr(func, '_children_args_decorated', False): + return func + + @wraps(func) def create_decorator(_f, with_self): if with_self: def f(self, tree): return _f(self, *tree.children) else: - def f(args): + def f(self, tree): + print ('##', _f, tree) return _f(*tree.children) + f._children_args_decorated = True return f - return smart_decorator(f, create_decorator) - + return smart_decorator(func, create_decorator) -def _children_args_inline__class(cls): - def _call_userfunc(self, tree): - # Assumes tree is already transformed - try: - f = getattr(self, tree.data) - except AttributeError: - return self.__default__(tree) - else: - return f(*tree.children) - cls._call_userfunc = _call_userfunc - return cls def children_args_inline(obj): - decorator = _children_args_inline__class if isclass(obj) and issubclass(obj, Base) else _children_args_inline__func - return decorator(obj) + return _apply_decorator(obj, _children_args_inline__func) diff --git a/tests/test_trees.py b/tests/test_trees.py index df3b9b6..b7796bf 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -6,7 +6,7 @@ import copy import pickle from lark.tree import Tree -from lark.visitors import Interpreter, visit_children_decor +from lark.visitors import Transformer, Interpreter, visit_children_decor, children_args_inline, children_args class TestTrees(TestCase): @@ -59,6 +59,58 @@ class TestTrees(TestCase): self.assertEqual(Interp3().visit(t), list('BCd')) + def test_transformer(self): + t = Tree('add', [Tree('sub', [Tree('i', ['3']), Tree('f', ['1.1'])]), Tree('i', ['1'])]) + + class T(Transformer): + i = children_args_inline(int) + f = children_args_inline(float) + + sub = lambda self, tree: tree.children[0] - tree.children[1] + + def add(self, tree): + return sum(tree.children) + + + res = T().transform(t) + self.assertEqual(res, 2.9) + + @children_args_inline + class T(Transformer): + i = int + f = float + sub = lambda self, a, b: a-b + + def 
add(self, a, b): + return a + b + + + res = T().transform(t) + self.assertEqual(res, 2.9) + + + @children_args_inline + class T(Transformer): + i = int + f = float + from operator import sub, add + + res = T().transform(t) + self.assertEqual(res, 2.9) + + + @children_args + class T(Transformer): + i = children_args_inline(int) + f = children_args_inline(float) + + sub = lambda self, values: values[0] - values[1] + + def add(self, values): + return sum(values) + + res = T().transform(t) + self.assertEqual(res, 2.9) if __name__ == '__main__': From 6bfc27c11d2d17cc6ea9ee6fab77ff75806b9d87 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 18 May 2018 15:14:57 +0300 Subject: [PATCH 26/34] New transformers near completion Nearley tool still needs fixing --- examples/calc.py | 5 +- examples/json_parser.py | 6 +- lark/__init__.py | 3 +- lark/load_grammar.py | 15 ++-- lark/parse_tree_builder.py | 16 +++- lark/parsers/earley.py | 11 +-- lark/parsers/resolve_ambig.py | 11 ++- lark/tree.py | 4 +- lark/visitors.py | 145 ++++++++++++++++++++-------------- tests/test_parser.py | 3 +- tests/test_trees.py | 30 ++----- 11 files changed, 140 insertions(+), 109 deletions(-) diff --git a/examples/calc.py b/examples/calc.py index e90b5cc..1102151 100644 --- a/examples/calc.py +++ b/examples/calc.py @@ -2,7 +2,7 @@ # This example shows how to write a basic calculator with variables. # -from lark import Lark, Transformer, children_args_inline +from lark import Lark, Transformer, visitor_args try: input = raw_input # For Python2 compatibility @@ -34,7 +34,8 @@ calc_grammar = """ %ignore WS_INLINE """ -class CalculateTree(SimpleTransformer): +@visitor_args(inline=True) +class CalculateTree(Transformer): from operator import add, sub, mul, truediv as div, neg number = float diff --git a/examples/json_parser.py b/examples/json_parser.py index 38a5b70..216af2c 100644 --- a/examples/json_parser.py +++ b/examples/json_parser.py @@ -7,7 +7,7 @@ import sys -from lark import Lark, inline_args, Transformer +from lark import Lark, Transformer, visitor_args json_grammar = r""" ?start: value @@ -34,14 +34,14 @@ json_grammar = r""" """ class TreeToJson(Transformer): - @inline_args + @visitor_args(inline=True) def string(self, s): return s[1:-1].replace('\\"', '"') array = list pair = tuple object = dict - number = inline_args(float) + number = visitor_args(inline=True)(float) null = lambda self, _: None true = lambda self, _: True diff --git a/lark/__init__.py b/lark/__init__.py index b36b3fc..850f702 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,5 +1,6 @@ from .tree import Tree -from .visitors import Transformer, Visitor, children_args, children_args_inline +from .visitors import Transformer, Visitor, visitor_args, Discard +from .visitors import InlineTransformer, inline_args # XXX Deprecated from .common import ParseError, GrammarError, UnexpectedToken from .lexer import UnexpectedInput, LexError from .lark import Lark diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 6262f62..5cf8e19 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -16,7 +16,7 @@ from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress from .tree import Tree, SlottedTree as ST -from .visitors import Transformer, Visitor, children_args, children_args_inline +from .visitors import Transformer, Visitor, visitor_args __path__ = os.path.dirname(__file__) IMPORT_PATHS = [os.path.join(__path__, 'grammars')] @@ -138,7 +138,7 @@ RULES = { } -@children_args_inline 
+@visitor_args(inline=True) class EBNF_to_BNF(Transformer): def __init__(self): self.new_rules = [] @@ -232,7 +232,6 @@ class SimplifyRule_Visitor(Visitor): tree.children = list(set(tree.children)) -@children_args class RuleTreeToText(Transformer): def expansions(self, x): return x @@ -244,7 +243,7 @@ class RuleTreeToText(Transformer): return expansion, alias.value -@children_args_inline +@visitor_args(inline=True) class CanonizeTree(Transformer): def maybe(self, expr): return ST('expr', [expr, Token('OP', '?', -1)]) @@ -265,7 +264,7 @@ class PrepareAnonTerminals(Transformer): self.i = 0 - @children_args_inline + @visitor_args(inline=True) def pattern(self, p): value = p.value if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags: @@ -355,7 +354,7 @@ def _literal_to_pattern(literal): 'REGEXP': PatternRE }[literal.type](s, flags) -@children_args_inline +@visitor_args(inline=True) class PrepareLiterals(Transformer): def literal(self, literal): return ST('pattern', [_literal_to_pattern(literal)]) @@ -369,7 +368,6 @@ class PrepareLiterals(Transformer): return ST('pattern', [PatternRE(regexp)]) -@children_args class TokenTreeToPattern(Transformer): def pattern(self, ps): p ,= ps @@ -410,7 +408,6 @@ class TokenTreeToPattern(Transformer): return v[0] class PrepareSymbols(Transformer): - @children_args def value(self, v): v ,= v if isinstance(v, Tree): @@ -535,7 +532,7 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion] -@children_args_inline +@visitor_args(inline=True) class PrepareGrammar(Transformer): def terminal(self, name): return name diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7c624e2..abca756 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -3,9 +3,10 @@ from .utils import suppress from .lexer import Token from .grammar import Rule from .tree import Tree +from .visitors import InlineTransformer # XXX Deprecated ###{standalone -from functools import partial +from functools import partial, wraps class ExpandSingleChild: @@ -95,6 +96,15 @@ def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): class Callback(object): pass + +def inline_args(func): + @wraps(func) + def f(children): + return func(*children) + return f + + + class ParseTreeBuilder: def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False): self.tree_class = tree_class @@ -130,6 +140,10 @@ class ParseTreeBuilder: user_callback_name = rule.alias or rule.origin.name try: f = getattr(transformer, user_callback_name) + assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer" + # XXX InlineTransformer is deprecated! 
+ if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): + f = inline_args(f) except AttributeError: f = partial(self.tree_class, user_callback_name) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 87dc41e..46a1271 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ # Email : erezshin@gmail.com from ..tree import Tree -from ..visitors import Transformer_InPlace +from ..visitors import Transformer_InPlace, visitor_args from ..common import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal @@ -114,9 +114,9 @@ class Column: if old_tree.data != '_ambig': new_tree = old_tree.copy() - new_tree.rule = old_tree.rule + new_tree.meta.rule = old_tree.meta.rule old_tree.set('_ambig', [new_tree]) - old_tree.rule = None # No longer a 'drv' node + old_tree.meta.rule = None # No longer a 'drv' node if item.tree.children[0] is old_tree: # XXX a little hacky! raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule) @@ -234,5 +234,6 @@ class ApplyCallbacks(Transformer_InPlace): def __init__(self, postprocess): self.postprocess = postprocess - def drv(self, tree): - return self.postprocess[tree.meta.rule](tree.children) + @visitor_args(meta=True) + def drv(self, children, meta): + return self.postprocess[meta.rule](children) diff --git a/lark/parsers/resolve_ambig.py b/lark/parsers/resolve_ambig.py index 0d3f17c..2470eb9 100644 --- a/lark/parsers/resolve_ambig.py +++ b/lark/parsers/resolve_ambig.py @@ -26,8 +26,15 @@ def _compare_priority(tree1, tree2): tree1.iter_subtrees() def _compare_drv(tree1, tree2): - rule1 = getattr(tree1.meta, 'rule', None) - rule2 = getattr(tree2.meta, 'rule', None) + try: + rule1 = tree1.meta.rule + except AttributeError: + rule1 = None + + try: + rule2 = tree2.meta.rule + except AttributeError: + rule2 = None if None == rule1 == rule2: return compare(tree1, tree2) diff --git a/lark/tree.py b/lark/tree.py index e20c18d..1490632 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -10,10 +10,10 @@ class Meta: ###{standalone class Tree(object): - def __init__(self, data, children): + def __init__(self, data, children, meta=None): self.data = data self.children = children - self._meta = None + self._meta = meta @property def meta(self): diff --git a/lark/visitors.py b/lark/visitors.py index d3853bf..950371f 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -8,39 +8,23 @@ class Discard(Exception): pass -class Base: - def _call_userfunc(self, tree): - return getattr(self, tree.data, self.__default__)(tree) +# Transformers - def __default__(self, tree): - "Default operation on tree (for override)" - return tree - - @classmethod - def _apply_decorator(cls, decorator): - mro = getmro(cls) - assert mro[0] is cls - libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} - for name, value in getmembers(cls): - if name.startswith('_') or name in libmembers: - continue - - setattr(cls, name, decorator(value)) - return cls - - -class SimpleBase(Base): - def _call_userfunc(self, tree): +class Transformer: + def _call_userfunc(self, data, children, meta): # Assumes tree is already transformed try: - f = getattr(self, tree.data) + f = getattr(self, data) except AttributeError: - return self.__default__(tree) + return self.__default__(data, children, meta) else: - return f(tree.children) - + if getattr(f, 'meta', False): + return f(children, meta) + elif getattr(f, 'inline', False): + return f(*children) + else: + return f(children) 
-class Transformer(Base): def _transform_children(self, children): for c in children: try: @@ -49,8 +33,8 @@ class Transformer(Base): pass def _transform_tree(self, tree): - tree = Tree(tree.data, list(self._transform_children(tree.children))) - return self._call_userfunc(tree) + children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree.data, children, tree.meta) def transform(self, tree): return self._transform_tree(tree) @@ -58,6 +42,32 @@ class Transformer(Base): def __mul__(self, other): return TransformerChain(self, other) + def __default__(self, data, children, meta): + "Default operation on tree (for override)" + return Tree(data, children, meta) + + @classmethod + def _apply_decorator(cls, decorator, **kwargs): + mro = getmro(cls) + assert mro[0] is cls + libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} + for name, value in getmembers(cls): + if name.startswith('_') or name in libmembers: + continue + + setattr(cls, name, decorator(value, **kwargs)) + return cls + + +class InlineTransformer(Transformer): # XXX Deprecated + def _call_userfunc(self, data, children, meta): + # Assumes tree is already transformed + try: + f = getattr(self, data) + except AttributeError: + return self.__default__(data, children, meta) + else: + return f(*children) class TransformerChain(object): @@ -75,7 +85,7 @@ class TransformerChain(object): class Transformer_InPlace(Transformer): def _transform_tree(self, tree): # Cancel recursion - return self._call_userfunc(tree) + return self._call_userfunc(tree.data, tree.children, tree.meta) def transform(self, tree): for subtree in tree.iter_subtrees(): @@ -87,11 +97,22 @@ class Transformer_InPlace(Transformer): class Transformer_InPlaceRecursive(Transformer): def _transform_tree(self, tree): tree.children = list(self._transform_children(tree.children)) - return self._call_userfunc(tree) + return self._call_userfunc(tree.data, tree.children, tree.meta) + +# Visitors -class Visitor(Base): +class VisitorBase: + def _call_userfunc(self, tree): + return getattr(self, tree.data, self.__default__)(tree) + + def __default__(self, tree): + "Default operation on tree (for override)" + return tree + + +class Visitor(VisitorBase): "Bottom-up visitor" def visit(self, tree): @@ -99,7 +120,7 @@ class Visitor(Base): self._call_userfunc(subtree) return tree -class Visitor_Recursive(Base): +class Visitor_Recursive(VisitorBase): def visit(self, tree): for child in tree.children: if isinstance(child, Tree): @@ -110,6 +131,7 @@ class Visitor_Recursive(Base): return tree + def visit_children_decor(func): @wraps(func) def inner(cls, tree): @@ -117,7 +139,8 @@ def visit_children_decor(func): return func(cls, values) return inner -class Interpreter(object): + +class Interpreter: "Top-down visitor" def visit(self, tree): @@ -136,56 +159,58 @@ class Interpreter(object): +# Decorators -def _apply_decorator(obj, decorator): +def _apply_decorator(obj, decorator, **kwargs): try: _apply = obj._apply_decorator except AttributeError: - return decorator(obj) + return decorator(obj, **kwargs) else: - return _apply(decorator) + return _apply(decorator, **kwargs) -def _children_args__func(func): - if getattr(func, '_children_args_decorated', False): - return func +def _inline_args__func(func): @wraps(func) def create_decorator(_f, with_self): if with_self: - def f(self, tree): - return _f(self, tree.children) + def f(self, children): + return _f(self, *children) else: - def f(args): - return _f(tree.children) - f._children_args_decorated = 
True + def f(self, children): + return _f(*children) return f return smart_decorator(func, create_decorator) -def children_args(obj): - return _apply_decorator(obj, _children_args__func) +def inline_args(obj): # XXX Deprecated + return _apply_decorator(obj, _inline_args__func) -def _children_args_inline__func(func): - if getattr(func, '_children_args_decorated', False): - return func - @wraps(func) +def _visitor_args_func_dec(func, inline=False, meta=False): + assert not (inline and meta) def create_decorator(_f, with_self): if with_self: - def f(self, tree): - return _f(self, *tree.children) + def f(self, *args, **kwargs): + return _f(self, *args, **kwargs) else: - def f(self, tree): - print ('##', _f, tree) - return _f(*tree.children) - f._children_args_decorated = True + def f(self, *args, **kwargs): + return _f(*args, **kwargs) return f - return smart_decorator(func, create_decorator) + f = smart_decorator(func, create_decorator) + f.inline = inline + f.meta = meta + return f + +def visitor_args(inline=False, meta=False): + if inline and meta: + raise ValueError("Visitor functions can either accept meta, or be inlined. Not both.") + def _visitor_args_dec(obj): + return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta) + return _visitor_args_dec -def children_args_inline(obj): - return _apply_decorator(obj, _children_args_inline__func) diff --git a/tests/test_parser.py b/tests/test_parser.py index 25ce619..f48f3bd 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -21,7 +21,7 @@ from lark.lark import Lark from lark.common import GrammarError, ParseError, UnexpectedToken from lark.lexer import LexError, UnexpectedInput from lark.tree import Tree -from lark.visitors import Transformer, children_args +from lark.visitors import Transformer __path__ = os.path.dirname(__file__) def _read(n, *args): @@ -93,7 +93,6 @@ class TestParsers(unittest.TestCase): self.assertEqual( r.children[0].data, "c" ) def test_embedded_transformer(self): - @children_args class T(Transformer): def a(self, children): return "" diff --git a/tests/test_trees.py b/tests/test_trees.py index b7796bf..af5a2a0 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -6,7 +6,7 @@ import copy import pickle from lark.tree import Tree -from lark.visitors import Transformer, Interpreter, visit_children_decor, children_args_inline, children_args +from lark.visitors import Transformer, Interpreter, visit_children_decor, visitor_args class TestTrees(TestCase): @@ -63,19 +63,18 @@ class TestTrees(TestCase): t = Tree('add', [Tree('sub', [Tree('i', ['3']), Tree('f', ['1.1'])]), Tree('i', ['1'])]) class T(Transformer): - i = children_args_inline(int) - f = children_args_inline(float) + i = visitor_args(inline=True)(int) + f = visitor_args(inline=True)(float) - sub = lambda self, tree: tree.children[0] - tree.children[1] - - def add(self, tree): - return sum(tree.children) + sub = lambda self, values: values[0] - values[1] + def add(self, values): + return sum(values) res = T().transform(t) self.assertEqual(res, 2.9) - @children_args_inline + @visitor_args(inline=True) class T(Transformer): i = int f = float @@ -89,7 +88,7 @@ class TestTrees(TestCase): self.assertEqual(res, 2.9) - @children_args_inline + @visitor_args(inline=True) class T(Transformer): i = int f = float @@ -99,19 +98,6 @@ class TestTrees(TestCase): self.assertEqual(res, 2.9) - @children_args - class T(Transformer): - i = children_args_inline(int) - f = children_args_inline(float) - - sub = lambda self, values: values[0] - 
values[1] - - def add(self, values): - return sum(values) - - res = T().transform(t) - self.assertEqual(res, 2.9) - if __name__ == '__main__': unittest.main() From ac0d49e7ab7c2089be6ac7974cce2630958d169e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 27 May 2018 00:04:11 +0300 Subject: [PATCH 27/34] Added %declare --- examples/indented_tree.py | 3 +-- lark/lexer.py | 2 +- lark/load_grammar.py | 56 +++++++++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/examples/indented_tree.py b/examples/indented_tree.py index b633cdd..0a132a1 100644 --- a/examples/indented_tree.py +++ b/examples/indented_tree.py @@ -18,11 +18,10 @@ tree_grammar = r""" %import common.CNAME -> NAME %import common.WS_INLINE + %declare _INDENT _DEDENT %ignore WS_INLINE _NL: /(\r?\n[\t ]*)+/ - _INDENT: "" - _DEDENT: "" """ class TreeIndenter(Indenter): diff --git a/lark/lexer.py b/lark/lexer.py index 51ccf6c..6502535 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -234,7 +234,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] + state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) lexer_by_tokens[key] = lexer diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 3aa9827..f5a0be8 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -22,7 +22,7 @@ IMPORT_PATHS = [os.path.join(__path__, 'grammars')] _RE_FLAGS = 'imslux' -_TOKEN_NAMES = { +_TERMINAL_NAMES = { '.' : 'DOT', ',' : 'COMMA', ':' : 'COLON', @@ -62,7 +62,7 @@ _TOKEN_NAMES = { } # Grammar Parser -TOKENS = { +TERMINALS = { '_LPAR': r'\(', '_RPAR': r'\)', '_LBRA': r'\[', @@ -73,7 +73,7 @@ TOKENS = { '_DOT': r'\.', 'TILDE': '~', 'RULE': '!?[_?]?[a-z][_a-z0-9]*', - 'TOKEN': '_?[A-Z][_A-Z0-9]*', + 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', @@ -81,6 +81,7 @@ TOKENS = { 'COMMENT': r'//[^\n]*', '_TO': '->', '_IGNORE': r'%ignore', + '_DECLARE': r'%declare', '_IMPORT': r'%import', 'NUMBER': r'\d+', } @@ -116,22 +117,24 @@ RULES = { 'literal', 'range'], - 'terminal': ['TOKEN'], + 'terminal': ['TERMINAL'], 'nonterminal': ['RULE'], - '?name': ['RULE', 'TOKEN'], + '?name': ['RULE', 'TERMINAL'], 'maybe': ['_LBRA expansions _RBRA'], 'range': ['STRING _DOT _DOT STRING'], - 'token': ['TOKEN _COLON expansions _NL', - 'TOKEN _DOT NUMBER _COLON expansions _NL'], - 'statement': ['ignore', 'import'], + 'token': ['TERMINAL _COLON expansions _NL', + 'TERMINAL _DOT NUMBER _COLON expansions _NL'], + 'statement': ['ignore', 'import', 'declare'], 'ignore': ['_IGNORE expansions _NL'], + 'declare': ['_DECLARE _declare_args _NL'], 'import': ['_IMPORT import_args _NL', - '_IMPORT import_args _TO TOKEN'], + '_IMPORT import_args _TO TERMINAL _NL'], 'import_args': ['_import_args'], '_import_args': ['name', '_import_args _DOT name'], + '_declare_args': ['name', '_declare_args name'], 'literal': ['REGEXP', 'STRING'], } @@ -278,7 +281,7 @@ class PrepareAnonTerminals(InlineTransformer): except KeyError: # Try to assign an indicative anon-token name try: - token_name = _TOKEN_NAMES[value] + token_name = _TERMINAL_NAMES[value] except KeyError: if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set: with suppress(UnicodeEncodeError): @@ 
-302,7 +305,7 @@ class PrepareAnonTerminals(InlineTransformer): self.token_reverse[p] = tokendef self.tokens.append(tokendef) - return Terminal(Token('TOKEN', token_name, -1), filter_out=isinstance(p, PatternStr)) + return Terminal(Token('TERMINAL', token_name, -1), filter_out=isinstance(p, PatternStr)) def _rfind(s, choices): @@ -376,7 +379,7 @@ class TokenTreeToPattern(Transformer): return items[0] if len({i.flags for i in items}) > 1: raise GrammarError("Lark doesn't support joining tokens with conflicting flags!") - return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags) + return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ()) def expansions(self, exps): if len(exps) == 1: @@ -412,7 +415,7 @@ class PrepareSymbols(Transformer): return v elif v.type == 'RULE': return NonTerminal(v.value) - elif v.type == 'TOKEN': + elif v.type == 'TERMINAL': return Terminal(v.value, filter_out=v.startswith('_')) assert False @@ -435,8 +438,15 @@ class Grammar: # Convert token-trees to strings/regexps transformer = PrepareLiterals() * TokenTreeToPattern() + for name, (token_tree, priority) in token_defs: + if token_tree is None: # Terminal added through %declare + continue + expansions = list(token_tree.find_data('expansion')) + if len(expansions) == 1 and not expansions[0].children: + raise GrammarError("Terminals cannot be empty (%s)" % name) + tokens = [TokenDef(name, transformer.transform(token_tree), priority) - for name, (token_tree, priority) in token_defs] + for name, (token_tree, priority) in token_defs if token_tree] # ================= # Compile Rules @@ -500,12 +510,14 @@ def resolve_token_references(token_defs): while True: changed = False for name, (token_tree, _p) in token_defs: + if token_tree is None: # Terminal added through %declare + continue for exp in token_tree.find_data('value'): item ,= exp.children if isinstance(item, Token): if item.type == 'RULE': - raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name)) - if item.type == 'TOKEN': + raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) + if item.type == 'TERMINAL': exp.children[0] = token_dict[item] changed = True if not changed: @@ -539,7 +551,7 @@ class PrepareGrammar(InlineTransformer): class GrammarLoader: def __init__(self): - tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] + tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()] rules = [options_from_rule(name, x) for name, x in RULES.items()] rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs] @@ -591,6 +603,7 @@ class GrammarLoader: # Execute statements ignore = [] + declared = [] for (stmt,) in statements: if stmt.data == 'ignore': t ,= stmt.children @@ -603,6 +616,9 @@ class GrammarLoader: token_options = dict(g.token_defs)[dotted_path[-1]] assert isinstance(token_options, tuple) and len(token_options)==2 token_defs.append([name.value, token_options]) + elif stmt.data == 'declare': + for t in stmt.children: + token_defs.append([t.value, (None, None)]) else: assert False, stmt @@ -613,7 +629,7 @@ class GrammarLoader: raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) # Handle ignore tokens - # XXX A slightly hacky solution. Recognition of %ignore TOKEN as separate comes from the lexer's + # XXX A slightly hacky solution. 
Recognition of %ignore TERMINAL as separate comes from the lexer's # inability to handle duplicate tokens (two names, one value) ignore_names = [] for t in ignore: @@ -623,7 +639,7 @@ class GrammarLoader: item ,= t2.children if item.data == 'value': item ,= item.children - if isinstance(item, Token) and item.type == 'TOKEN': + if isinstance(item, Token) and item.type == 'TERMINAL': ignore_names.append(item.value) continue @@ -656,7 +672,7 @@ class GrammarLoader: for name, expansions, _o in rules: used_symbols = {t for x in expansions.find_data('expansion') - for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))} + for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} for sym in used_symbols: if is_terminal(sym): if sym not in token_names: From 34cd792ffc1f96d144a599c14070e8f27262f645 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 27 May 2018 00:54:32 +0300 Subject: [PATCH 28/34] Fixed Python grammars, and a bug in newline detection --- examples/python2.g | 11 +++++------ examples/python3.g | 12 +++++------- examples/python_parser.py | 6 +++--- lark/lexer.py | 2 +- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/examples/python2.g b/examples/python2.g index 986350d..b0d5e14 100644 --- a/examples/python2.g +++ b/examples/python2.g @@ -149,10 +149,6 @@ string: STRING | LONG_STRING COMMENT: /#[^\n]*/ _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ -%ignore /[\t \f]+/ // WS -%ignore /\\[\t \f]*\r?\n/ // LINE_CONT -%ignore COMMENT - STRING : /[ubf]?r?("(?!"").*?(? NAME IMAG_NUMBER: (_INT | FLOAT) ("j"|"J") -_DEDENT: "" -_INDENT: "" + +%ignore /[\t \f]+/ // WS +%ignore /\\[\t \f]*\r?\n/ // LINE_CONT +%ignore COMMENT +%declare _INDENT _DEDENT diff --git a/examples/python3.g b/examples/python3.g index 1c01e75..398e1ee 100644 --- a/examples/python3.g +++ b/examples/python3.g @@ -127,7 +127,7 @@ AWAIT: "await" | "True" -> const_true | "False" -> const_false -?testlist_comp: (test|star_expr) ( comp_for | ("," (test|star_expr))+ [","] | ",") +?testlist_comp: (test|star_expr) [comp_for | ("," (test|star_expr))+ [","] | ","] subscriptlist: subscript ("," subscript)* [","] subscript: test | [test] ":" [test] [sliceop] sliceop: ":" [test] @@ -170,10 +170,6 @@ COMMENT: /#[^\n]*/ _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ -%ignore /[\t \f]+/ // WS -%ignore /\\[\t \f]*\r?\n/ // LINE_CONT -%ignore COMMENT - STRING : /[ubf]?r?("(?!"").*?(?" 
-_INDENT: "" +%ignore /[\t \f]+/ // WS +%ignore /\\[\t \f]*\r?\n/ // LINE_CONT +%ignore COMMENT +%declare _INDENT _DEDENT diff --git a/examples/python_parser.py b/examples/python_parser.py index ddbd5c4..0f9f30b 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -14,8 +14,8 @@ from lark.indenter import Indenter class PythonIndenter(Indenter): NL_type = '_NEWLINE' - OPEN_PAREN_types = ['__LPAR', '__LSQB', '__LBRACE'] - CLOSE_PAREN_types = ['__RPAR', '__RSQB', '__RBRACE'] + OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE'] + CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE'] INDENT_type = '_INDENT' DEDENT_type = '_DEDENT' tab_len = 8 @@ -78,6 +78,6 @@ def test_earley_equals_lalr(): if __name__ == '__main__': test_python_lib() - test_earley_equals_lalr() + # test_earley_equals_lalr() # python_parser3.parse(_read(sys.argv[1]) + '\n') diff --git a/lark/lexer.py b/lark/lexer.py index 6502535..4f668f6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -178,7 +178,7 @@ def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) def _regexp_has_newline(r): - return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) + return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r) class Lexer: def __init__(self, tokens, ignore=(), user_callbacks={}): From a91eec7f2b4c73054f439d6f1eae7c317f65205e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 27 May 2018 12:00:43 +0300 Subject: [PATCH 29/34] .gitignore should never have been added --- .gitignore | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index cdb93cd..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.python-version From 5cb7f2cb787a82161855f0b782095fe073801c78 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 27 May 2018 12:05:00 +0300 Subject: [PATCH 30/34] Added %declare to examples/lark.lark --- examples/lark.lark | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/lark.lark b/examples/lark.lark index 1fbf592..e561dbe 100644 --- a/examples/lark.lark +++ b/examples/lark.lark @@ -11,6 +11,7 @@ priority: "." NUMBER statement: "%ignore" expansions _NL -> ignore | "%import" import_args ["->" TOKEN] _NL -> import + | "%declare" name+ -> declare import_args: name ("." name)* From 6d76a4ce8de0231cca71b22b7a4d1acf5ad7aa4f Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 18 Jun 2018 11:30:32 +0300 Subject: [PATCH 31/34] visitor_args -> v_args --- examples/calc.py | 4 ++-- examples/json_parser.py | 6 +++--- lark/__init__.py | 2 +- lark/load_grammar.py | 13 +++++++------ lark/parsers/earley.py | 4 ++-- lark/visitors.py | 2 +- tests/test_trees.py | 10 +++++----- 7 files changed, 21 insertions(+), 20 deletions(-) diff --git a/examples/calc.py b/examples/calc.py index 1102151..a187571 100644 --- a/examples/calc.py +++ b/examples/calc.py @@ -2,7 +2,7 @@ # This example shows how to write a basic calculator with variables. 
# -from lark import Lark, Transformer, visitor_args +from lark import Lark, Transformer, v_args try: input = raw_input # For Python2 compatibility @@ -34,7 +34,7 @@ calc_grammar = """ %ignore WS_INLINE """ -@visitor_args(inline=True) +@v_args(inline=True) class CalculateTree(Transformer): from operator import add, sub, mul, truediv as div, neg number = float diff --git a/examples/json_parser.py b/examples/json_parser.py index 216af2c..a0c2d3b 100644 --- a/examples/json_parser.py +++ b/examples/json_parser.py @@ -7,7 +7,7 @@ import sys -from lark import Lark, Transformer, visitor_args +from lark import Lark, Transformer, v_args json_grammar = r""" ?start: value @@ -34,14 +34,14 @@ json_grammar = r""" """ class TreeToJson(Transformer): - @visitor_args(inline=True) + @v_args(inline=True) def string(self, s): return s[1:-1].replace('\\"', '"') array = list pair = tuple object = dict - number = visitor_args(inline=True)(float) + number = v_args(inline=True)(float) null = lambda self, _: None true = lambda self, _: True diff --git a/lark/__init__.py b/lark/__init__.py index 850f702..0146664 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,5 +1,5 @@ from .tree import Tree -from .visitors import Transformer, Visitor, visitor_args, Discard +from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated from .common import ParseError, GrammarError, UnexpectedToken from .lexer import UnexpectedInput, LexError diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 5cf8e19..42b27df 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -16,7 +16,8 @@ from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress from .tree import Tree, SlottedTree as ST -from .visitors import Transformer, Visitor, visitor_args +from .visitors import Transformer, Visitor, v_args +inline_args = v_args(inline=True) __path__ = os.path.dirname(__file__) IMPORT_PATHS = [os.path.join(__path__, 'grammars')] @@ -138,7 +139,7 @@ RULES = { } -@visitor_args(inline=True) +@inline_args class EBNF_to_BNF(Transformer): def __init__(self): self.new_rules = [] @@ -243,7 +244,7 @@ class RuleTreeToText(Transformer): return expansion, alias.value -@visitor_args(inline=True) +@inline_args class CanonizeTree(Transformer): def maybe(self, expr): return ST('expr', [expr, Token('OP', '?', -1)]) @@ -264,7 +265,7 @@ class PrepareAnonTerminals(Transformer): self.i = 0 - @visitor_args(inline=True) + @inline_args def pattern(self, p): value = p.value if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags: @@ -354,7 +355,7 @@ def _literal_to_pattern(literal): 'REGEXP': PatternRE }[literal.type](s, flags) -@visitor_args(inline=True) +@inline_args class PrepareLiterals(Transformer): def literal(self, literal): return ST('pattern', [_literal_to_pattern(literal)]) @@ -532,7 +533,7 @@ def options_from_rule(name, *x): def symbols_from_strcase(expansion): return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion] -@visitor_args(inline=True) +@inline_args class PrepareGrammar(Transformer): def terminal(self, name): return name diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 46a1271..65e0ea5 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ # Email : erezshin@gmail.com from ..tree import Tree -from ..visitors import Transformer_InPlace, visitor_args +from ..visitors import Transformer_InPlace, v_args 
from ..common import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal @@ -234,6 +234,6 @@ class ApplyCallbacks(Transformer_InPlace): def __init__(self, postprocess): self.postprocess = postprocess - @visitor_args(meta=True) + @v_args(meta=True) def drv(self, children, meta): return self.postprocess[meta.rule](children) diff --git a/lark/visitors.py b/lark/visitors.py index 950371f..2f87c25 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -206,7 +206,7 @@ def _visitor_args_func_dec(func, inline=False, meta=False): f.meta = meta return f -def visitor_args(inline=False, meta=False): +def v_args(inline=False, meta=False): if inline and meta: raise ValueError("Visitor functions can either accept meta, or be inlined. Not both.") def _visitor_args_dec(obj): diff --git a/tests/test_trees.py b/tests/test_trees.py index af5a2a0..1debd89 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -6,7 +6,7 @@ import copy import pickle from lark.tree import Tree -from lark.visitors import Transformer, Interpreter, visit_children_decor, visitor_args +from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args class TestTrees(TestCase): @@ -63,8 +63,8 @@ class TestTrees(TestCase): t = Tree('add', [Tree('sub', [Tree('i', ['3']), Tree('f', ['1.1'])]), Tree('i', ['1'])]) class T(Transformer): - i = visitor_args(inline=True)(int) - f = visitor_args(inline=True)(float) + i = v_args(inline=True)(int) + f = v_args(inline=True)(float) sub = lambda self, values: values[0] - values[1] @@ -74,7 +74,7 @@ class TestTrees(TestCase): res = T().transform(t) self.assertEqual(res, 2.9) - @visitor_args(inline=True) + @v_args(inline=True) class T(Transformer): i = int f = float @@ -88,7 +88,7 @@ class TestTrees(TestCase): self.assertEqual(res, 2.9) - @visitor_args(inline=True) + @v_args(inline=True) class T(Transformer): i = int f = float From c77934f6a24e28ae5c1716fa90c47ce46235fd8d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 18 Jun 2018 11:32:07 +0300 Subject: [PATCH 32/34] Fixed nearley --- lark/tools/nearley.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index eead26d..a7fd259 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -160,7 +160,7 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path): emit('class TransformNearley(Transformer):') for alias in n2l.alias_js_code: emit(" %s = var.get('%s').to_python()" % (alias, alias)) - emit(" __default__ = lambda self, n, c: c if c else None") + emit(" __default__ = lambda self, n, c, m: c if c else None") emit() emit('parser = Lark(grammar, start="n_%s")' % start) From c0cf1b3176dd72bfe5450faa95e6ee9cfc0bb73e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 18 Jun 2018 11:57:53 +0300 Subject: [PATCH 33/34] Added some docstrings, removed is_terminal from common --- lark/common.py | 3 --- lark/lark.py | 2 ++ lark/lexer.py | 2 +- lark/load_grammar.py | 5 ++++- lark/tree.py | 7 ++++++- lark/visitors.py | 33 +++++++++++++++++++++++++++++++-- 6 files changed, 44 insertions(+), 8 deletions(-) diff --git a/lark/common.py b/lark/common.py index 84a4139..78ef205 100644 --- a/lark/common.py +++ b/lark/common.py @@ -7,9 +7,6 @@ Py36 = (sys.version_info[:2] >= (3, 6)) ###{standalone -def is_terminal(sym): - return sym.isupper() - class GrammarError(Exception): pass diff --git a/lark/lark.py b/lark/lark.py index 4fc0062..bb15a2f 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -207,6 

     def lex(self, text):
+        "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
         if not hasattr(self, 'lexer'):
             self.lexer = self._build_lexer()
         stream = self.lexer.lex(text)
@@ -216,6 +217,7 @@ class Lark:
         return stream

     def parse(self, text):
+        "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
         return self.parser.parse(text)

         # if self.profiler:
diff --git a/lark/lexer.py b/lark/lexer.py
index cf7ad30..e332e22 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -3,7 +3,7 @@
 import re

 from .utils import Str, classify
-from .common import is_terminal, PatternStr, PatternRE, TokenDef
+from .common import PatternStr, PatternRE, TokenDef

 ###{standalone
 class LexError(Exception):
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index bbfedf9..bd6fa36 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -11,7 +11,7 @@
 from .lexer import Token, UnexpectedInput
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
 from .parsers.lalr_parser import UnexpectedToken
-from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
+from .common import GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol

 from .utils import classify, suppress
@@ -24,6 +24,9 @@ IMPORT_PATHS = [os.path.join(__path__, 'grammars')]

 _RE_FLAGS = 'imslux'

+def is_terminal(sym):
+    return sym.isupper()
+
 _TERMINAL_NAMES = {
     '.' : 'DOT',
     ',' : 'COMMA',
diff --git a/lark/tree.py b/lark/tree.py
index 1490632..000823e 100644
--- a/lark/tree.py
+++ b/lark/tree.py
@@ -45,6 +45,7 @@ class Tree(object):
 ###}

     def expand_kids_by_index(self, *indices):
+        "Expand (inline) children at the given indices"
         for i in sorted(indices, reverse=True):   # reverse so that changing tail won't affect indices
             kid = self.children[i]
             self.children[i:i+1] = kid.children
@@ -62,9 +63,11 @@ class Tree(object):
         return hash((self.data, tuple(self.children)))

     def find_pred(self, pred):
+        "Find all nodes where pred(tree) == True"
         return filter(pred, self.iter_subtrees())

     def find_data(self, data):
+        "Find all nodes where tree.data == data"
         return self.find_pred(lambda t: t.data == data)

     def scan_values(self, pred):
@@ -108,10 +111,12 @@ class Tree(object):
         self.children = children

 class SlottedTree(Tree):
-    __slots__ = 'data', 'children', 'rule'
+    __slots__ = 'data', 'children', 'rule', '_meta'


 def pydot__tree_to_png(tree, filename):
+    "Creates a colorful image that represents the tree (data+children, without meta)"
+
     import pydot
     graph = pydot.Dot(graph_type='digraph', rankdir="LR")

diff --git a/lark/visitors.py b/lark/visitors.py
index 2f87c25..752df69 100644
--- a/lark/visitors.py
+++ b/lark/visitors.py
@@ -11,6 +11,14 @@ class Discard(Exception):

 # Transformers

 class Transformer:
+    """Visits the tree recursively, starting with the leaves and finally the root (bottom-up)
+
+    Calls its methods (provided by user via inheritance) according to tree.data
+    The returned value replaces the old one in the structure.
+
+    Can be used to implement map or reduce.
+    """
+
     def _call_userfunc(self, data, children, meta):
         # Assumes tree is already transformed
         try:
@@ -84,6 +92,7 @@ class TransformerChain(object):


 class Transformer_InPlace(Transformer):
+    "Non-recursive. Changes the tree in-place instead of returning new instances"
     def _transform_tree(self, tree):  # Cancel recursion
         return self._call_userfunc(tree.data, tree.children, tree.meta)

@@ -95,6 +104,7 @@ class Transformer_InPlace(Transformer):


 class Transformer_InPlaceRecursive(Transformer):
+    "Recursive. Changes the tree in-place instead of returning new instances"
     def _transform_tree(self, tree):
         tree.children = list(self._transform_children(tree.children))
         return self._call_userfunc(tree.data, tree.children, tree.meta)
@@ -113,7 +123,12 @@ class VisitorBase:


 class Visitor(VisitorBase):
-    "Bottom-up visitor"
+    """Bottom-up visitor, non-recursive
+
+    Visits the tree, starting with the leaves and finally the root (bottom-up)
+    Calls its methods (provided by user via inheritance) according to tree.data
+    """
+
     def visit(self, tree):
         for subtree in tree.iter_subtrees():
@@ -121,6 +136,12 @@ class Visitor(VisitorBase):
         return tree

 class Visitor_Recursive(VisitorBase):
+    """Bottom-up visitor, recursive
+
+    Visits the tree, starting with the leaves and finally the root (bottom-up)
+    Calls its methods (provided by user via inheritance) according to tree.data
+    """
+
     def visit(self, tree):
         for child in tree.children:
             if isinstance(child, Tree):
@@ -133,6 +154,7 @@ class Visitor_Recursive(VisitorBase):


 def visit_children_decor(func):
+    "See Interpreter"
     @wraps(func)
     def inner(cls, tree):
         values = cls.visit_children(tree)
@@ -141,8 +163,14 @@ def visit_children_decor(func):


 class Interpreter:
-    "Top-down visitor"
+    """Top-down visitor, recursive
+
+    Visits the tree, starting with the root and finally the leaves (top-down)
+    Calls its methods (provided by user via inheritance) according to tree.data
+
+    Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
+    The user has to explicitly call visit_children, or use the @visit_children_decor decorator.
+    """

     def visit(self, tree):
         return getattr(self, tree.data)(tree)
@@ -207,6 +235,7 @@ def _visitor_args_func_dec(func, inline=False, meta=False):
     return f

 def v_args(inline=False, meta=False):
+    "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
     if inline and meta:
         raise ValueError("Visitor functions can either accept meta, or be inlined. Not both.")
     def _visitor_args_dec(obj):

From 5c6df8e82536afc066ba970c8319342192e07d14 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Mon, 18 Jun 2018 15:14:31 +0300
Subject: [PATCH 34/34] Moved and restructured exceptions

* All exceptions are now under exceptions.py

* UnexpectedInput is now a superclass of UnexpectedToken and
  UnexpectedCharacters, all of which support the get_context() and
  match_examples() methods.
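
In practice this means one handler can serve both lexer and parser errors. A minimal sketch (the toy grammar is invented for illustration; the reworked examples/error_reporting_lalr.py below demonstrates match_examples() in full):

    from lark import Lark, UnexpectedInput

    parser = Lark('start: "a" "b"')
    text = 'a!'
    try:
        parser.parse(text)
    except UnexpectedInput as u:     # catches UnexpectedCharacters and UnexpectedToken alike
        print(u.get_context(text))   # excerpt of the input with a '^' marker under the error
        print(u.line, u.column)
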
---
 examples/error_reporting_lalr.py | 17 +++----
 lark/__init__.py                 |  3 +-
 lark/common.py                   | 57 ---------------------
 lark/exceptions.py               | 85 ++++++++++++++++++++++++++++++++
 lark/lexer.py                    | 27 +++-------
 lark/load_grammar.py             |  9 ++--
 lark/parse_tree_builder.py       |  2 +-
 lark/parser_frontends.py         |  2 +-
 lark/parsers/cyk.py              |  2 +-
 lark/parsers/earley.py           |  6 +--
 lark/parsers/grammar_analysis.py |  2 +-
 lark/parsers/lalr_analysis.py    |  2 +-
 lark/parsers/lalr_parser.py      |  4 +-
 lark/parsers/xearley.py          |  4 +-
 lark/tree.py                     | 15 ++++++
 tests/test_parser.py             |  3 +-
 16 files changed, 131 insertions(+), 109 deletions(-)
 create mode 100644 lark/exceptions.py

diff --git a/examples/error_reporting_lalr.py b/examples/error_reporting_lalr.py
index a1055fd..0e355af 100644
--- a/examples/error_reporting_lalr.py
+++ b/examples/error_reporting_lalr.py
@@ -2,7 +2,7 @@
 # This demonstrates example-driven error reporting with the LALR parser
 #

-from lark import Lark, UnexpectedToken
+from lark import Lark, UnexpectedInput

 from .json_parser import json_grammar   # Using the grammar from the json_parser example

@@ -32,11 +32,11 @@ class JsonTrailingComma(JsonSyntaxError):
 def parse(json_text):
     try:
         j = json_parser.parse(json_text)
-    except UnexpectedToken as ut:
-        exc_class = ut.match_examples(json_parser.parse, {
-            JsonMissingValue: ['{"foo": }'],
+    except UnexpectedInput as u:
+        exc_class = u.match_examples(json_parser.parse, {
             JsonMissingOpening: ['{"foo": ]}',
-                                 '{"foor": }}'],
+                                 '{"foor": }}',
+                                 '{"foo": }'],
             JsonMissingClosing: ['{"foo": [}',
                                  '{',
                                  '{"a": 1',
@@ -55,15 +55,10 @@ def parse(json_text):
         })
         if not exc_class:
             raise
-        raise exc_class(ut.get_context(json_text), ut.line, ut.column)
+        raise exc_class(u.get_context(json_text), u.line, u.column)


 def test():
-    try:
-        parse('{"key":')
-    except JsonMissingValue:
-        pass
-
     try:
         parse('{"key": "value"')
     except JsonMissingClosing:
diff --git a/lark/__init__.py b/lark/__init__.py
index 0146664..2ff54e5 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -1,8 +1,7 @@
 from .tree import Tree
 from .visitors import Transformer, Visitor, v_args, Discard
 from .visitors import InlineTransformer, inline_args   # XXX Deprecated
-from .common import ParseError, GrammarError, UnexpectedToken
-from .lexer import UnexpectedInput, LexError
+from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
 from .lark import Lark

 __version__ = "0.5.6"
diff --git a/lark/common.py b/lark/common.py
index 78ef205..698a3ec 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -7,63 +7,6 @@
 Py36 = (sys.version_info[:2] >= (3, 6))

 ###{standalone

-class GrammarError(Exception):
-    pass
-
-class ParseError(Exception):
-    pass
-
-class UnexpectedToken(ParseError):
-    def __init__(self, token, expected, seq, index, considered_rules=None, state=None):
-        self.token = token
-        self.expected = expected
-        self.line = getattr(token, 'line', '?')
-        self.column = getattr(token, 'column', '?')
-        self.considered_rules = considered_rules
-        self.state = state
-
-        try:
-            context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
-        except AttributeError:
-            context = seq[index:index+5]
-        except TypeError:
-            context = ""
-        message = ("Unexpected token %r at line %s, column %s.\n"
-                   "Expected: %s\n"
-                   "Context: %s" % (token, self.line, self.column, expected, context))
-
-        super(UnexpectedToken, self).__init__(message)
-
-    def match_examples(self, parse_fn, examples):
-        """ Given a parser instance and a dictionary mapping some label with
-            some malformed syntax examples, it'll return the label for the
-            example that bests matches the current error.
-        """
-        assert self.state, "Not supported for this exception"
-
-        candidate = None
-        for label, example in examples.items():
-            assert not isinstance(example, STRING_TYPE)
-
-            for malformed in example:
-                try:
-                    parse_fn(malformed)
-                except UnexpectedToken as ut:
-                    if ut.state == self.state:
-                        if ut.token == self.token:  # Try exact match first
-                            return label
-                        elif not candidate:
-                            candidate = label
-
-        return candidate
-
-    def get_context(self, text, span=10):
-        pos = self.token.pos_in_stream
-        start = max(pos - span, 0)
-        end = pos + span
-        before = text[start:pos].rsplit('\n', 1)[-1]
-        after = text[pos:end].split('\n', 1)[0]
-        return before + after + '\n' + ' ' * len(before) + '^\n'

 ###}

diff --git a/lark/exceptions.py b/lark/exceptions.py
new file mode 100644
index 0000000..7bf1a78
--- /dev/null
+++ b/lark/exceptions.py
@@ -0,0 +1,85 @@
+from .utils import STRING_TYPE
+
+class LarkError(Exception):
+    pass
+
+class GrammarError(LarkError):
+    pass
+
+class ParseError(LarkError):
+    pass
+
+class LexError(LarkError):
+    pass
+
+class UnexpectedInput(LarkError):
+    def get_context(self, text, span=10):
+        pos = self.pos_in_stream
+        start = max(pos - span, 0)
+        end = pos + span
+        before = text[start:pos].rsplit('\n', 1)[-1]
+        after = text[pos:end].split('\n', 1)[0]
+        return before + after + '\n' + ' ' * len(before) + '^\n'
+
+    def match_examples(self, parse_fn, examples):
+        """ Given a parser instance and a dictionary mapping some label with
+            some malformed syntax examples, it'll return the label for the
+            example that best matches the current error.
+        """
+        assert self.state is not None, "Not supported for this exception"
+
+        candidate = None
+        for label, example in examples.items():
+            assert not isinstance(example, STRING_TYPE)
+
+            for malformed in example:
+                try:
+                    parse_fn(malformed)
+                except UnexpectedInput as ut:
+                    if ut.state == self.state:
+                        try:
+                            if ut.token == self.token:  # Try exact match first
+                                return label
+                        except AttributeError:
+                            pass
+                        if not candidate:
+                            candidate = label
+
+        return candidate
+
+
+class UnexpectedCharacters(LexError, UnexpectedInput):
+    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None):
+        context = seq[lex_pos:lex_pos+10]
+        message = "No token defined for '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
+        if allowed:
+            message += '\n\nExpecting: %s\n' % allowed
+
+        super(UnexpectedCharacters, self).__init__(message)
+
+        self.line = line
+        self.column = column
+        self.context = context
+        self.allowed = allowed
+        self.considered_tokens = considered_tokens
+        self.pos_in_stream = lex_pos
+        self.state = state
+
+
+class UnexpectedToken(ParseError, UnexpectedInput):
+    def __init__(self, token, expected, considered_rules=None, state=None):
+        self.token = token
+        self.expected = expected    # XXX str shouldn't be necessary
+        self.line = getattr(token, 'line', '?')
+        self.column = getattr(token, 'column', '?')
+        self.considered_rules = considered_rules
+        self.state = state
+        self.pos_in_stream = token.pos_in_stream
+
+        message = ("Unexpected token %r at line %s, column %s.\n"
+                   "Expected: %s\n"
+                   % (token, self.line, self.column, ', '.join(self.expected)))
+
+        super(UnexpectedToken, self).__init__(message)
+
diff --git a/lark/lexer.py b/lark/lexer.py
index e332e22..ed81d37 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -4,26 +4,9 @@
 import re

 from .utils import Str, classify
-from .common import PatternStr, PatternRE, TokenDef
+from .exceptions import UnexpectedCharacters

 ###{standalone
-class LexError(Exception):
-    pass
-
-class UnexpectedInput(LexError):
-    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None):
-        context = seq[lex_pos:lex_pos+5]
-        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
-        if allowed:
-            message += '\n\nExpecting: %s\n' % allowed
-
-        super(UnexpectedInput, self).__init__(message)
-
-        self.line = line
-        self.column = column
-        self.context = context
-        self.allowed = allowed
-        self.considered_rules = considered_rules
-
 class Token(Str):
     __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

@@ -84,8 +67,9 @@ class LineCounter:

 class _Lex:
     "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer):
+    def __init__(self, lexer, state=None):
         self.lexer = lexer
+        self.state = state

     def lex(self, stream, newline_types, ignore_types):
         newline_types = list(newline_types)
@@ -118,7 +102,7 @@ class _Lex:
                     break
             else:
                 if line_ctr.char_pos < len(stream):
-                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, state=self.state)
                 break

 class UnlessCallback:
@@ -251,9 +235,10 @@ class ContextualLexer:
             self.parser_state = state

     def lex(self, stream):
-        l = _Lex(self.lexers[self.parser_state])
+        l = _Lex(self.lexers[self.parser_state], self.parser_state)
         for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
             yield x
             l.lexer = self.lexers[self.parser_state]
+            l.state = self.parser_state

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index bd6fa36..56524d7 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -6,14 +6,15 @@
 import re

 from ast import literal_eval
 from copy import deepcopy

-from .lexer import Token, UnexpectedInput
+from .lexer import Token
+
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
-from .parsers.lalr_parser import UnexpectedToken
-from .common import GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
+from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress
+from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken

 from .tree import Tree, SlottedTree as ST
 from .visitors import Transformer, Visitor, v_args
@@ -576,7 +577,7 @@ class GrammarLoader:

         try:
             tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
-        except UnexpectedInput as e:
+        except UnexpectedCharacters as e:
             raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
         except UnexpectedToken as e:
             context = e.get_context(grammar_text)
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index abca756..e3e14ee 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -1,4 +1,4 @@
-from .common import GrammarError
+from .exceptions import GrammarError
 from .utils import suppress
 from .lexer import Token
 from .grammar import Rule
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index f322524..08e2d0e 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -4,7 +4,7 @@
 from .utils import get_regexp_width
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import Lexer, ContextualLexer, Token
-from .common import GrammarError
+from .exceptions import GrammarError
 from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
 from .tree import Tree

diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py
index e2bcd83..d65d485 100644
--- a/lark/parsers/cyk.py
+++ b/lark/parsers/cyk.py
@@ -8,7 +8,7 @@
 from collections import defaultdict
 import itertools

-from ..common import ParseError
+from ..exceptions import ParseError
 from ..lexer import Token
 from ..tree import Tree
 from ..grammar import Terminal as T, NonTerminal as NT, Symbol
diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index 65e0ea5..4ff26b2 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -15,7 +15,7 @@
 from ..tree import Tree
 from ..visitors import Transformer_InPlace, v_args
-from ..common import ParseError, UnexpectedToken
+from ..exceptions import ParseError, UnexpectedToken
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal

@@ -197,8 +197,8 @@ class Parser:
             next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))

         if not next_set:
-            expect = {i.expect for i in column.to_scan}
-            raise UnexpectedToken(token, expect, stream, set(column.to_scan))
+            expect = {i.expect.name for i in column.to_scan}
+            raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan))

         return next_set

diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index f49e4bc..3568414 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -1,6 +1,6 @@
 from ..utils import bfs, fzset, classify

-from ..common import GrammarError
+from ..exceptions import GrammarError
 from ..grammar import Rule, Terminal, NonTerminal


diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index 6903be9..6eec0a1 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -10,7 +10,7 @@
 import logging
 from collections import defaultdict

 from ..utils import classify, classify_bool, bfs, fzset
-from ..common import GrammarError
+from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal

diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 164a227..8fa56f5 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -2,7 +2,7 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from ..common import UnexpectedToken
+from ..exceptions import UnexpectedToken

 from .lalr_analysis import LALR_Analyzer, Shift

@@ -46,7 +46,7 @@ class _Parser:
                 return states[state][key]
             except KeyError:
                 expected = states[state].keys()
-                raise UnexpectedToken(token, expected, seq, i, state=state)
+                raise UnexpectedToken(token, expected, state=state)   # TODO filter out rules from expected

         def reduce(rule):
             size = len(rule.expansion)
diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py
index 5e8fb28..02698fb 100644
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -20,8 +20,8 @@

 from collections import defaultdict

-from ..common import ParseError
-from ..lexer import Token, UnexpectedInput
+from ..exceptions import ParseError, UnexpectedInput
+from ..lexer import Token
 from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal, Terminal
diff --git a/lark/tree.py b/lark/tree.py
index 000823e..5a29c0f 100644
--- a/lark/tree.py
+++ b/lark/tree.py
@@ -110,6 +110,21 @@ class Tree(object):
         self.data = data
         self.children = children

+    # XXX Deprecated! Here for backwards compatibility <0.6.0
+    @property
+    def line(self):
+        return self.meta.line
+    @property
+    def column(self):
+        return self.meta.column
+    @property
+    def end_line(self):
+        return self.meta.end_line
+    @property
+    def end_column(self):
+        return self.meta.end_column
+
+
 class SlottedTree(Tree):
     __slots__ = 'data', 'children', 'rule', '_meta'

diff --git a/tests/test_parser.py b/tests/test_parser.py
index f48f3bd..36cb142 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -18,8 +18,7 @@ from io import (
 logging.basicConfig(level=logging.INFO)

 from lark.lark import Lark
-from lark.common import GrammarError, ParseError, UnexpectedToken
-from lark.lexer import LexError, UnexpectedInput
+from lark.exceptions import GrammarError, ParseError, UnexpectedToken, LexError, UnexpectedInput
 from lark.tree import Tree
 from lark.visitors import Transformer
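
The deprecated Tree.line/column properties above simply delegate to tree.meta, so positions should now be read from the meta object. A short sketch of the preferred access pattern (this assumes the propagate_positions option fills in the meta attributes at this point in the series; the grammar is invented for illustration):

    from lark import Lark

    parser = Lark("""
        start: WORD
        %import common.WORD
    """, propagate_positions=True)

    tree = parser.parse("hello")
    print(tree.meta.line, tree.meta.column)   # preferred: positions live on tree.meta
    print(tree.line, tree.column)             # same values, via the compat properties above
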