From 5946e78ed246bde9a31ca14bc523d290e7c8f8f3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 6 May 2017 17:04:19 +0300 Subject: [PATCH 01/17] Bugfix for Scanless Earley. Thanks Ken! --- lark/load_grammar.py | 2 +- tests/test_parser.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 4b8e7fe..460af8a 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -313,7 +313,7 @@ class PrepareLiterals(InlineTransformer): class SplitLiterals(InlineTransformer): def pattern(self, p): if isinstance(p, PatternStr) and len(p.value)>1: - return T('expansion', [T('pattern', [PatternStr(ch)]) for ch in p.value]) + return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value]) return T('pattern', [p]) class TokenTreeToPattern(Transformer): diff --git a/tests/test_parser.py b/tests/test_parser.py index 6063096..4a2171c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -568,6 +568,14 @@ def _make_parser_test(LEXER, PARSER): tree = l.parse('AB,a') self.assertEqual(tree.children, ['AB']) + def test_token_flags3(self): + l = _Lark("""!start: ABC+ + ABC: "abc"i + """ + ) + tree = l.parse('aBcAbC') + self.assertEqual(tree.children, ['aBc', 'AbC']) + def test_token_flags2(self): g = """!start: ("a"i | /a/ /b/?)+ """ From e20b54be2fa9e6594a10bdd9cc8c36a3ffe46bc0 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 6 May 2017 18:00:51 +0300 Subject: [PATCH 02/17] Bugfix for issue #6. Thanks DaRasch! 
--- lark/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lark/utils.py b/lark/utils.py index f363234..91b8a24 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -63,11 +63,17 @@ def inline_args(f): def _f_builtin(_self, args): return f(*args) return _f_builtin - else: - @functools.wraps(f) + elif isinstance(f, types.MethodType): + @functools.wraps(f.__func__) def _f(self, args): return f.__func__(self, *args) return _f + else: + @functools.wraps(f.__call__.__func__) + def _f(self, args): + return f.__call__.__func__(self, *args) + return _f + try: From 64d141e486ef8b783861e2af53482bb786d9bfff Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 22 May 2017 12:26:24 +0300 Subject: [PATCH 03/17] BUGFIX: Fixes a subtle bug in the LALR(1) parser. See the new test for details. --- lark/parsers/lalr_parser.py | 6 +++--- tests/test_parser.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 60d4e9c..1420345 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -34,7 +34,7 @@ class Parser(object): raise UnexpectedToken(token, expected, seq, i) - def reduce(rule, size): + def reduce(rule, size, end=False): if size: s = value_stack[-size:] del state_stack[-size:] @@ -44,7 +44,7 @@ class Parser(object): res = self.callbacks[rule](s) - if len(state_stack) == 1 and rule.origin == self.analysis.start_symbol: + if end and len(state_stack) == 1 and rule.origin == self.analysis.start_symbol: return res _action, new_state = get_action(rule.origin) @@ -73,7 +73,7 @@ class Parser(object): while True: _action, rule = get_action('$end') assert _action == 'reduce' - res = reduce(*rule) + res = reduce(*rule, end=True) if res: assert state_stack == [self.analysis.init_state_idx] and not value_stack, len(state_stack) return res diff --git a/tests/test_parser.py b/tests/test_parser.py index 4a2171c..20fdb87 100644 --- 
a/tests/test_parser.py +++ b/tests/test_parser.py @@ -584,6 +584,22 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, ['a', 'A']) + def test_reduce_cycle(self): + """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. + It seems that the correct solution is to explicitly distinguish finalization in the reduce() function. + """ + + l = _Lark(""" + term: A + | term term + + A: "a" + + """, start='term') + + tree = l.parse("aa") + self.assertEqual(len(tree.children), 2) + From f1aede9acd9014dcce86f744101dcd36d361029c Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 24 May 2017 16:12:07 +0300 Subject: [PATCH 04/17] README: Added 'pytreeview' to projects using lark --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4440211..31e8b3b 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,8 @@ Lark has no dependencies. ### Projects using Lark - - [mappyfile](https://github.com/geographika/mappyfile) - A pure Python MapFile parser for working with MapServer + - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration + - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer Using Lark? Send me a message and I'll add your project! From a588a70a7a691af7bef235f38835777c414e91b0 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 27 May 2017 01:55:49 +0300 Subject: [PATCH 05/17] Added the experimental "propagate_positions" feature (only for standard lexer for now). 
--- lark/lark.py | 4 +++- lark/lexer.py | 16 +++++++++++++--- lark/parse_tree_builder.py | 27 ++++++++++++++++++++++++++- lark/utils.py | 20 ++++++++++++++++++++ 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index b839650..3d8cbcd 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -39,6 +39,7 @@ class LarkOptions(object): postlex - Lexer post-processing (Default: None) start - The start symbol (Default: start) profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) + propagate_positions - Experimental. Don't use yet. """ __doc__ += OPTIONS_DOC def __init__(self, options_dict): @@ -55,6 +56,7 @@ class LarkOptions(object): self.start = o.pop('start', 'start') self.profile = o.pop('profile', False) self.ambiguity = o.pop('ambiguity', 'auto') + self.propagate_positions = o.pop('propagate_positions', False) assert self.parser in ('earley', 'lalr', None) @@ -160,7 +162,7 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class) + self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class, self.options.propagate_positions) rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer) if self.profiler: for f in dir(callback): diff --git a/lark/lexer.py b/lark/lexer.py index 053ce32..63e306f 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -155,17 +155,27 @@ class Lexer(object): if m: value = m.group(0) type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: + to_yield = type_ not in ignore_types + + if to_yield: t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) if t.type in self.callback: t = self.callback[t.type](t) - yield t + end_col = t.column + len(value) if type_ in newline_types: newlines = value.count(self.newline_char) if newlines: line += newlines - col_start_pos = lex_pos + 
value.rindex(self.newline_char) + last_newline_index = value.rindex(self.newline_char) + 1 + col_start_pos = lex_pos + last_newline_index + end_col = len(value) - last_newline_index + + if to_yield: + t.end_line = line + t.end_col = end_col + yield t + lex_pos += len(value) break else: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 547deab..b3bc522 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,4 +1,5 @@ from .common import is_terminal, GrammarError +from .utils import suppress from .lexer import Token class Callback(object): @@ -42,10 +43,31 @@ def create_rule_handler(expansion, usermethod, keep_all_tokens, filter_out): # else, if no filtering required.. return usermethod +def propagate_positions_wrapper(f): + def _f(args): + res = f(args) + + if args: + for a in args: + with suppress(AttributeError): + res.line = a.line + res.column = a.column + break + + for a in reversed(args): + with suppress(AttributeError): + res.end_line = a.end_line + res.end_col = a.end_col + break + + return res + + return _f class ParseTreeBuilder: - def __init__(self, tree_class): + def __init__(self, tree_class, propagate_positions=False): self.tree_class = tree_class + self.propagate_positions = propagate_positions def _create_tree_builder_function(self, name): tree_class = self.tree_class @@ -92,6 +114,9 @@ class ParseTreeBuilder: alias_handler = create_rule_handler(expansion, f, keep_all_tokens, filter_out) + if self.propagate_positions: + alias_handler = propagate_positions_wrapper(alias_handler) + callback_name = 'autoalias_%s_%s' % (_origin, '_'.join(expansion)) if hasattr(callback, callback_name): raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) diff --git a/lark/utils.py b/lark/utils.py index 91b8a24..d984400 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,6 +1,7 @@ import functools import types from collections import deque +from contextlib import contextmanager 
class fzset(frozenset): def __repr__(self): @@ -88,5 +89,24 @@ except NameError: return -1 +try: + from contextlib import suppress # Python 3 +except ImportError: + @contextmanager + def suppress(*excs): + '''Catch and dismiss the provided exception + + >>> x = 'hello' + >>> with suppress(IndexError): + ... x = x[10] + >>> x + 'hello' + ''' + try: + yield + except excs: + pass + + From 40d732ddf5601fa840ee749922b12a0f7b5b48af Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 27 May 2017 17:19:36 +0300 Subject: [PATCH 06/17] Added Contribute section to README --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 31e8b3b..183a162 100644 --- a/README.md +++ b/README.md @@ -252,6 +252,22 @@ Lark offers both Earley and LALR(1), which means you can choose between the most Lark uses the [MIT license](LICENSE). +## Contribute + +Lark is currently accepting pull-requests. + +There are many ways you can help the project: + +* Improve the performance of Lark's parsing algorithm +* Implement macros for grammars (important for grammar composition) +* Write new grammars for Lark's library +* Write & improve the documentation +* Write a blog post introducing Lark to your audience + +If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process. + ## Contact -If you have any questions or want to contribute, you can email me at erezshin at gmail com. +If you have any questions or want my assistance, you can email me at erezshin at gmail com. + +I'm also available for contract work. 
From 88242f10d7b2e8416681238c563f96e5fa1c7fea Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 28 May 2017 16:09:41 +0300 Subject: [PATCH 07/17] Tiny fix in lexer --- lark/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lexer.py b/lark/lexer.py index 63e306f..86d976f 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -159,10 +159,10 @@ class Lexer(object): if to_yield: t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) + end_col = t.column + len(value) if t.type in self.callback: t = self.callback[t.type](t) - end_col = t.column + len(value) if type_ in newline_types: newlines = value.count(self.newline_char) if newlines: From 686e796a62438e0a504a992cd172e21cbf7e7e3f Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 28 May 2017 19:50:18 +0300 Subject: [PATCH 08/17] Small improvements to Earley --- examples/python3.g | 9 +++++---- lark/parsers/earley.py | 8 +++++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/python3.g b/examples/python3.g index 279c268..a382d0b 100644 --- a/examples/python3.g +++ b/examples/python3.g @@ -116,11 +116,12 @@ AWAIT: "await" | atom_expr "." NAME -> getattr | atom -?atom: "(" [yield_expr|testlist_comp] ")" - | "[" [testlist_comp] "]" - | "{" [dictorsetmaker] "}" +?atom: "(" [yield_expr|testlist_comp] ")" -> tuple + | "[" [testlist_comp] "]" -> list + | "{" [dictorsetmaker] "}" -> dict | NAME -> var - | number | string+ | "..." + | number | string+ + | "..." -> ellipsis | "None" -> const_none | "True" -> const_true | "False" -> const_false diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index dfe13c7..6014fee 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -107,7 +107,7 @@ class Column: new_tree.rule = old_tree.rule old_tree.set('_ambig', [new_tree]) if item.tree.children[0] is old_tree: # XXX a little hacky! - raise ParseError("Infinite recursion in grammar!") + raise ParseError("Infinite recursion in grammar! 
(Rule %s)" % item.rule) old_tree.children.append(item.tree) else: self.completed[item] = item @@ -228,6 +228,12 @@ def _compare_drv(tree1, tree2): if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): return compare(tree1, tree2) + try: + rule1, rule2 = tree1.rule, tree2.rule + except AttributeError: + # Probably trees that don't take part in this parse (better way to distinguish?) + return compare(tree1, tree2) + c = _compare_rules(tree1.rule, tree2.rule) if c: return c From 1da851516ceb3e8f374d9649b23256640fa11b77 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 29 May 2017 00:05:54 +0300 Subject: [PATCH 09/17] Added option: Keep all tokens --- lark/lark.py | 10 ++++++---- lark/parse_tree_builder.py | 5 +++-- tests/test_parser.py | 6 ++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 3d8cbcd..488ed5c 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -63,8 +63,6 @@ class LarkOptions(object): if self.parser == 'earley' and self.transformer: raise ValueError('Cannot specify an auto-transformer when using the Earley algorithm.' 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. 
lalr)') - if self.keep_all_tokens: - raise NotImplementedError("keep_all_tokens: Not implemented yet!") if o: raise ValueError("Unknown options: %s" % o.keys()) @@ -121,7 +119,7 @@ class Lark: assert isinstance(grammar, STRING_TYPE) - if self.options.cache_grammar or self.options.keep_all_tokens: + if self.options.cache_grammar: raise NotImplementedError("Not available yet") assert not self.options.profile, "Feature temporarily disabled" @@ -142,8 +140,12 @@ class Lark: assert self.options.parser == 'earley' assert self.options.ambiguity in ('resolve', 'explicit', 'auto') + # Parse the grammar file and compose the grammars (TODO) self.grammar = load_grammar(grammar, source) + + # Compile the EBNF grammar into BNF tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start) + self.ignore_tokens = self.grammar.extra['ignore'] self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex) @@ -162,7 +164,7 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class, self.options.propagate_positions) + self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer) if self.profiler: for f in dir(callback): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index b3bc522..0c21bfb 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -65,9 +65,10 @@ def propagate_positions_wrapper(f): return _f class ParseTreeBuilder: - def __init__(self, tree_class, propagate_positions=False): + def __init__(self, tree_class, propagate_positions=False, keep_all_tokens=False): self.tree_class = tree_class self.propagate_positions = propagate_positions + self.always_keep_all_tokens = keep_all_tokens def 
_create_tree_builder_function(self, name): tree_class = self.tree_class @@ -88,7 +89,7 @@ class ParseTreeBuilder: filter_out.add(origin) for origin, (expansions, options) in rules.items(): - keep_all_tokens = options.keep_all_tokens if options else False + keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand1 = options.expand1 if options else False create_token = options.create_token if options else False diff --git a/tests/test_parser.py b/tests/test_parser.py index 20fdb87..55c010e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -538,6 +538,12 @@ def _make_parser_test(LEXER, PARSER): g.parse("+2e-9") self.assertRaises(ParseError, g.parse, "+2e-9e") + def test_keep_all_tokens(self): + l = _Lark("""start: "a"+ """, keep_all_tokens=True) + tree = l.parse('aaa') + self.assertEqual(tree.children, ['a', 'a', 'a']) + + def test_token_flags(self): l = _Lark("""!start: "a"i+ """ From 9570918005420ec287f8b11449eb2750da40f4b3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 17 Jul 2017 17:04:42 +0300 Subject: [PATCH 10/17] Improved: efficiency of iter_subtrees(), customizability of pretty() --- lark/tree.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lark/tree.py b/lark/tree.py index 8d58af6..2bfa6d4 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -10,11 +10,14 @@ class Tree(object): def __repr__(self): return 'Tree(%s, %s)' % (self.data, self.children) + def _pretty_label(self): + return self.data + def _pretty(self, level, indent_str): if len(self.children) == 1 and not isinstance(self.children[0], Tree): return [ indent_str*level, self.data, '\t', '%s' % self.children[0], '\n'] - l = [ indent_str*level, self.data, '\n' ] + l = [ indent_str*level, self._pretty_label(), '\n' ] for n in self.children: if isinstance(n, Tree): l += n._pretty(level+1, indent_str) @@ -62,10 +65,14 @@ class Tree(object): yield c def iter_subtrees(self): + visited = set() q = [self] while q: 
subtree = q.pop() + if id(subtree) in visited: + continue # already been here from another branch + visited.add(id(subtree)) yield subtree q += [c for c in subtree.children if isinstance(c, Tree)] From 48efa6500a177a1fffde71f40287cc6b3ce9c8c0 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 17 Jul 2017 17:11:43 +0300 Subject: [PATCH 11/17] Improvements to the Earley parser ambiguity resolution --- lark/parsers/earley.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 6014fee..b808ff1 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -29,6 +29,9 @@ class Derivation(Tree): Tree.__init__(self, 'drv', items or []) self.rule = rule + def _pretty_label(self): # Nicer pretty for debugging the parser + return self.rule.origin if self.rule else self.data + END_TOKEN = EndToken() class Item(object): @@ -106,8 +109,11 @@ class Column: new_tree = old_tree.copy() new_tree.rule = old_tree.rule old_tree.set('_ambig', [new_tree]) + old_tree.rule = None # No longer a 'drv' node + if item.tree.children[0] is old_tree: # XXX a little hacky! raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule) + old_tree.children.append(item.tree) else: self.completed[item] = item @@ -234,6 +240,14 @@ def _compare_drv(tree1, tree2): # Probably trees that don't take part in this parse (better way to distinguish?) return compare(tree1, tree2) + # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, + # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be + # computationally inefficient. So we handle it here. 
+ if tree1.data == '_ambig': + _resolve_ambig(tree1) + if tree2.data == '_ambig': + _resolve_ambig(tree2) + c = _compare_rules(tree1.rule, tree2.rule) if c: return c @@ -247,12 +261,19 @@ def _compare_drv(tree1, tree2): return compare(len(tree1.children), len(tree2.children)) +def _resolve_ambig(tree): + assert tree.data == '_ambig' + + best = min(tree.children, key=cmp_to_key(_compare_drv)) + assert best.data == 'drv' + tree.set('drv', best.children) + tree.rule = best.rule # needed for applying callbacks + + assert tree.data != '_ambig' + class ResolveAmbig(Visitor_NoRecurse): def _ambig(self, tree): - best = min(tree.children, key=cmp_to_key(_compare_drv)) - assert best.data == 'drv' - tree.set('drv', best.children) - tree.rule = best.rule # needed for applying callbacks + _resolve_ambig(tree) # RULES = [ From 107c596aea0f3d998dd02cc43e995e89544b6964 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 17 Jul 2017 17:17:05 +0300 Subject: [PATCH 12/17] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index e22a247..d91a460 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,4 +3,4 @@ from .common import ParseError, GrammarError from .lark import Lark from .utils import inline_args -__version__ = "0.2.7" +__version__ = "0.2.8" From 70fa3c6ea9d5009bfea5f24f1312b80f1f73b3c2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 24 Jul 2017 20:11:38 +0300 Subject: [PATCH 13/17] Tiny fix for issue #14 Version bump --- lark/__init__.py | 2 +- lark/parsers/earley.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lark/__init__.py b/lark/__init__.py index d91a460..92f1c78 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,4 +3,4 @@ from .common import ParseError, GrammarError from .lark import Lark from .utils import inline_args -__version__ = "0.2.8" +__version__ = "0.2.9" diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 
b808ff1..3c02332 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -224,7 +224,8 @@ class ApplyCallbacks(Transformer_NoRecurse): return Tree(rule.origin, children) def _compare_rules(rule1, rule2): - assert rule1.origin == rule2.origin + if rule1.origin != rule2.origin: + return 0 c = compare( len(rule1.expansion), len(rule2.expansion)) if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here c = -c From ee8dd8b3f3486b0c18ada7e4e933dce06d189661 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 26 Jul 2017 10:17:27 +0300 Subject: [PATCH 14/17] Added a test suggested by James McLaughlin --- tests/test_parser.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index 55c010e..348993a 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -379,6 +379,20 @@ def _make_parser_test(LEXER, PARSER): x = g.parse('Hello HelloWorld') self.assertSequenceEqual(x.children, ['HelloWorld']) + def test_token_collision2(self): + # NOTE: This test reveals a bug in token reconstruction in Scanless Earley + # I probably need to re-write grammar transformation + + g = _Lark(""" + !start: "starts" + + %import common.LCASE_LETTER + """) + + x = g.parse("starts") + self.assertSequenceEqual(x.children, ['starts']) + + # def test_string_priority(self): # g = _Lark("""start: (A | /a?bb/)+ # A: "a" """) From 188386cf04da278ff9efac81b26e9ec127e8aebd Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 26 Jul 2017 10:18:50 +0300 Subject: [PATCH 15/17] Fixed main bug in test_token_collision2. 
--- lark/common.py | 17 +++++++++++++---- lark/lexer.py | 6 ++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/lark/common.py b/lark/common.py index 2c940bd..f9b0990 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ import re +import sre_parse class GrammarError(Exception): pass @@ -57,9 +58,9 @@ class Pattern(object): # Pattern Hashing assumes all subclasses have a different priority! def __hash__(self): - return hash((self.priority, self.value)) + return hash((type(self), self.value)) def __eq__(self, other): - return self.priority == other.priority and self.value == other.value + return type(self) == type(other) and self.value == other.value def _get_flags(self): if self.flags: @@ -71,13 +72,21 @@ class PatternStr(Pattern): def to_regexp(self): return self._get_flags() + re.escape(self.value) - priority = 0 + @property + def min_width(self): + return len(self.value) + max_width = min_width class PatternRE(Pattern): def to_regexp(self): return self._get_flags() + self.value - priority = 1 + @property + def min_width(self): + return sre_parse.parse(self.to_regexp()).getwidth()[0] + @property + def max_width(self): + return sre_parse.parse(self.to_regexp()).getwidth()[1] class TokenDef(object): def __init__(self, name, pattern): diff --git a/lark/lexer.py b/lark/lexer.py index 86d976f..4e6d5b9 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,7 +1,6 @@ ## Lexer Implementation import re -import sre_parse from .utils import Str, classify from .common import is_terminal, PatternStr, PatternRE, TokenDef @@ -120,8 +119,7 @@ class Lexer(object): except: raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) - width = sre_parse.parse(t.pattern.to_regexp()).getwidth() - if width[0] == 0: + if t.pattern.min_width == 0: raise LexError("Lexer does not allow zero-width tokens. 
(%s: %s)" % (t.name, t.pattern)) token_names = {t.name for t in tokens} @@ -133,7 +131,7 @@ class Lexer(object): self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] self.ignore_types = [t for t in ignore] - tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True) + tokens.sort(key=lambda x:x.pattern.max_width, reverse=True) tokens, self.callback = _create_unless(tokens) assert all(self.callback.values()) From da7eaa219d0b4be914eab0a2ca632d2615b478a5 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 26 Jul 2017 10:19:54 +0300 Subject: [PATCH 16/17] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 92f1c78..5337ecb 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,4 +3,4 @@ from .common import ParseError, GrammarError from .lark import Lark from .utils import inline_args -__version__ = "0.2.9" +__version__ = "0.2.10" From 4eec9244038eafaf31c1cda595e18321c8f9730b Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 4 Aug 2017 02:42:31 +0300 Subject: [PATCH 17/17] Added prioritization to Earley. Use rule.1 etc. Highest priority will be selected in case of ambiguity. 
--- lark/common.py | 2 +- lark/load_grammar.py | 20 +++++++++++++++----- lark/parse_tree_builder.py | 2 +- lark/parser_frontends.py | 4 ++-- lark/parsers/earley.py | 5 +++++ lark/parsers/grammar_analysis.py | 11 ++++++----- lark/parsers/lalr_parser.py | 1 + tests/test_parser.py | 24 ++++++++++++++++++++++++ 8 files changed, 55 insertions(+), 14 deletions(-) diff --git a/lark/common.py b/lark/common.py index f9b0990..f1b6784 100644 --- a/lark/common.py +++ b/lark/common.py @@ -41,7 +41,7 @@ class LexerConf: class ParserConf: def __init__(self, rules, callback, start): - assert all(len(r)==3 for r in rules) + assert all(len(r) == 4 for r in rules) self.rules = rules self.callback = callback self.start = start diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 460af8a..21c5a8b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -75,6 +75,7 @@ TOKENS = { '_TO': '->', '_IGNORE': r'%ignore', '_IMPORT': r'%import', + 'NUMBER': '\d+', } RULES = { @@ -82,7 +83,8 @@ RULES = { '_list': ['_item', '_list _item'], '_item': ['rule', 'token', 'statement', '_NL'], - 'rule': ['RULE _COLON expansions _NL'], + 'rule': ['RULE _COLON expansions _NL', + 'RULE _DOT NUMBER _COLON expansions _NL'], 'expansions': ['alias', 'expansions _OR alias', 'expansions _NL _OR alias'], @@ -470,21 +472,29 @@ class Grammar: class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False): + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.create_token = create_token # used for scanless postprocessing + self.priority = priority self.filter_out = filter_out # remove this rule from the tree # used for "token"-rules in scanless @classmethod - def from_rule(cls, name, expansions): + def from_rule(cls, name, *x): + if len(x) > 1: + priority, expansions = x + priority = int(priority) + else: + expansions ,= x + 
priority = None + keep_all_tokens = name.startswith('!') name = name.lstrip('!') expand1 = name.startswith('?') name = name.lstrip('?') - return name, expansions, cls(keep_all_tokens, expand1) + return name, expansions, cls(keep_all_tokens, expand1, priority=priority) @@ -605,7 +615,7 @@ class GrammarLoader: raise GrammarError("Token '%s' defined more than once" % name) token_names.add(name) - rules = [RuleOptions.from_rule(name, x) for name, x in rule_defs] + rules = [RuleOptions.from_rule(*x) for x in rule_defs] rule_names = set() for name, _x, _o in rules: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 0c21bfb..601372e 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -123,6 +123,6 @@ class ParseTreeBuilder: raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) setattr(callback, callback_name, alias_handler) - new_rules.append(( _origin, expansion, callback_name )) + new_rules.append(( _origin, expansion, callback_name, options )) return new_rules, callback diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 1646726..a9066f5 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -129,7 +129,7 @@ class Earley_NoLex: def __init__(self, lexer_conf, parser_conf, options=None): self.token_by_name = {t.name:t for t in lexer_conf.tokens} - rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] + rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] resolve_ambiguity = (options.ambiguity=='resolve') if options else True self.parser = earley.Parser(rules, @@ -156,7 +156,7 @@ class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): WithLexer.__init__(self, lexer_conf) - rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] + rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] resolve_ambiguity = 
(options.ambiguity=='resolve') if options else True self.parser = earley.Parser(rules, diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 3c02332..dbcbda3 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -224,6 +224,11 @@ class ApplyCallbacks(Transformer_NoRecurse): return Tree(rule.origin, children) def _compare_rules(rule1, rule2): + if rule1.options and rule2.options: + if rule1.options.priority is not None and rule2.options.priority is not None: + assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2) + return -compare(rule1.options.priority, rule2.options.priority) + if rule1.origin != rule2.origin: return 0 c = compare( len(rule1.expansion), len(rule2.expansion)) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index f08a8bd..7dff9ce 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -7,10 +7,11 @@ class Rule(object): origin : a symbol expansion : a list of symbols """ - def __init__(self, origin, expansion, alias=None): + def __init__(self, origin, expansion, alias=None, options=None): self.origin = origin self.expansion = expansion self.alias = alias + self.options = options def __repr__(self): return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) @@ -111,12 +112,12 @@ class GrammarAnalyzer(object): self.debug = debug rule_tuples = list(rule_tuples) rule_tuples.append(('$root', [start_symbol, '$end'])) - rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples] + rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples] self.rules = set() - self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples} - for origin, exp, alias in rule_tuples: - r = Rule( origin, exp, alias ) + self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples} + for origin, exp, alias, options in rule_tuples: + r = Rule( origin, exp, alias, options ) 
self.rules.add(r) self.rules_by_origin[origin].append(r) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 1420345..bd519d1 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -9,6 +9,7 @@ from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT class Parser(object): def __init__(self, parser_conf): + assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" self.analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) self.analysis.compute_lookahead() self.callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) diff --git a/tests/test_parser.py b/tests/test_parser.py index 348993a..d7f6928 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -621,6 +621,30 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(len(tree.children), 2) + @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules") + def test_earley_prioritization(self): + "Tests effect of priority on result" + + grammar = """ + start: a | b + a.1: "a" + b.2: "a" + """ + + l = Lark(grammar, parser='earley', lexer='standard') + res = l.parse("a") + self.assertEqual(res.children[0].data, 'b') + + grammar = """ + start: a | b + a.2: "a" + b.1: "a" + """ + + l = Lark(grammar, parser='earley', lexer='standard') + res = l.parse("a") + self.assertEqual(res.children[0].data, 'a') + _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize()