From 6ea4588bcf4ac66e546022160ebc3cae1035e07b Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 27 Jun 2018 16:31:02 +0300 Subject: [PATCH] Dynamic lexer now returns the maximum match only. Complete lexing behavior moved to "dynamic_complete" Changed dynamic lexer behavior to only match terminals to their maximum length (i.e. greedy match), emulating the standard lexer. The original dynamic lexer behavior, that attempts to match all appearances of a terminal, has been moved to the "dynamic_complete" lexer. For example, when applying a terminal "a"+ to the text "aaa": - dynamic: ["aaa"] - dynamic_complete: ["a", "aa", "aaa"] --- lark/lark.py | 9 +++++---- lark/parser_frontends.py | 12 ++++++++++-- lark/parsers/xearley.py | 18 ++++++++++-------- tests/test_parser.py | 5 ++++- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index bb15a2f..210ff6b 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -14,6 +14,7 @@ from .lexer import Lexer from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import get_frontend + class LarkOptions(object): """Specifies the options for Lark @@ -26,6 +27,8 @@ class LarkOptions(object): "standard": Use a standard lexer "contextual": Stronger lexer (only works with parser="lalr") "dynamic": Flexible and powerful (only with parser="earley") + "dynamic_complete": Same as dynamic, but tries *every* variation + of tokenizing possible. (only with parser="earley") "auto" (default): Choose for me based on grammar and parser ambiguity - Decides how to handle ambiguity in the parse. 
Only relevant if parser="earley" @@ -139,7 +142,7 @@ class Lark: else: assert False, self.options.parser lexer = self.options.lexer - assert lexer in ('standard', 'contextual', 'dynamic') + assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') if self.options.ambiguity == 'auto': if self.options.parser == 'earley': @@ -213,8 +216,7 @@ class Lark: stream = self.lexer.lex(text) if self.options.postlex: return self.options.postlex.process(stream) - else: - return stream + return stream def parse(self, text): "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." @@ -231,4 +233,3 @@ class Lark: # else: # l = list(self.lex(text)) # return self.parser.parse(l) - diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 08e2d0e..4c604a2 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -83,7 +83,7 @@ class Earley(WithLexer): class XEarley: - def __init__(self, lexer_conf, parser_conf, options=None): + def __init__(self, lexer_conf, parser_conf, options=None, **kw): self.token_by_name = {t.name:t for t in lexer_conf.tokens} self._prepare_match(lexer_conf) @@ -92,7 +92,8 @@ class XEarley: self.match, resolve_ambiguity=get_ambiguity_resolver(options), ignore=lexer_conf.ignore, - predict_all=options.earley__predict_all + predict_all=options.earley__predict_all, + **kw ) def match(self, term, text, index=0): @@ -115,6 +116,11 @@ class XEarley: def parse(self, text): return self.parser.parse(text) +class XEarley_CompleteLex(XEarley): + def __init__(self, *args, **kw): + XEarley.__init__(self, *args, complete_lex=True, **kw) + + class CYK(WithLexer): @@ -165,6 +171,8 @@ def get_frontend(parser, lexer): return Earley elif lexer=='dynamic': return XEarley + elif lexer=='dynamic_complete': + return XEarley_CompleteLex elif lexer=='contextual': raise ValueError('The Earley parser does not support the contextual parser') else: diff --git a/lark/parsers/xearley.py 
b/lark/parsers/xearley.py index 02698fb..57e4a4a 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -28,13 +28,15 @@ from ..grammar import NonTerminal, Terminal from .earley import ApplyCallbacks, Item, Column + class Parser: - def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False): self.analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity self.ignore = [Terminal(t) for t in ignore] self.predict_all = predict_all + self.complete_lex = complete_lex self.FIRST = self.analysis.FIRST self.postprocess = {} @@ -101,12 +103,13 @@ class Parser: t = Token(item.expect.name, m.group(0), i, text_line, text_column) delayed_matches[m.end()].append(item.advance(t)) - s = m.group(0) - for j in range(1, len(s)): - m = match(item.expect, s[:-j]) - if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[i+m.end()].append(item.advance(t)) + if self.complete_lex: + s = m.group(0) + for j in range(1, len(s)): + m = match(item.expect, s[:-j]) + if m: + t = Token(item.expect.name, m.group(0), i, text_line, text_column) + delayed_matches[i+m.end()].append(item.advance(t)) next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) next_set.add(delayed_matches[i+1]) @@ -132,7 +135,6 @@ class Parser: else: text_column += 1 - predict_and_complete(column) # Parse ended. 
Now build a parse tree diff --git a/tests/test_parser.py b/tests/test_parser.py index 36cb142..c0a809f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -18,7 +18,7 @@ from io import ( logging.basicConfig(level=logging.INFO) from lark.lark import Lark -from lark.exceptions import GrammarError, ParseError, UnexpectedToken, LexError, UnexpectedInput +from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput from lark.tree import Tree from lark.visitors import Transformer @@ -184,6 +184,7 @@ def _make_full_earley_test(LEXER): l.parse(program) + @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") def test_earley3(self): "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" @@ -289,6 +290,7 @@ def _make_full_earley_test(LEXER): self.assertEqual(res, expected) + @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") def test_explicit_ambiguity2(self): grammar = r""" start: NAME+ @@ -1175,6 +1177,7 @@ _TO_TEST = [ ('standard', 'earley'), ('standard', 'cyk'), ('dynamic', 'earley'), + ('dynamic_complete', 'earley'), ('standard', 'lalr'), ('contextual', 'lalr'), # (None, 'earley'),