Changed dynamic lexer behavior to only match terminals to their maximum length (i.e. greedy match), emulating the standard lexer. The original dynamic lexer behavior, which attempts to match all appearances of a terminal, has been moved to the "dynamic_complete" lexer. For example, when applying a terminal "a"+ to the text "aaa":
- dynamic: ["aaa"]
- dynamic_complete: ["a", "aa", "aaa"]
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.6.0
@@ -14,6 +14,7 @@ from .lexer import Lexer | |||||
from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
from .parser_frontends import get_frontend | from .parser_frontends import get_frontend | ||||
class LarkOptions(object): | class LarkOptions(object): | ||||
"""Specifies the options for Lark | """Specifies the options for Lark | ||||
@@ -26,6 +27,8 @@ class LarkOptions(object): | |||||
"standard": Use a standard lexer | "standard": Use a standard lexer | ||||
"contextual": Stronger lexer (only works with parser="lalr") | "contextual": Stronger lexer (only works with parser="lalr") | ||||
"dynamic": Flexible and powerful (only with parser="earley") | "dynamic": Flexible and powerful (only with parser="earley") | ||||
"dynamic_complete": Same as dynamic, but tries *every* variation | |||||
of tokenizing possible. (only with parser="earley") | |||||
"auto" (default): Choose for me based on grammar and parser | "auto" (default): Choose for me based on grammar and parser | ||||
ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" | ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" | ||||
@@ -139,7 +142,7 @@ class Lark: | |||||
else: | else: | ||||
assert False, self.options.parser | assert False, self.options.parser | ||||
lexer = self.options.lexer | lexer = self.options.lexer | ||||
assert lexer in ('standard', 'contextual', 'dynamic') | |||||
assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') | |||||
if self.options.ambiguity == 'auto': | if self.options.ambiguity == 'auto': | ||||
if self.options.parser == 'earley': | if self.options.parser == 'earley': | ||||
@@ -213,8 +216,7 @@ class Lark: | |||||
stream = self.lexer.lex(text) | stream = self.lexer.lex(text) | ||||
if self.options.postlex: | if self.options.postlex: | ||||
return self.options.postlex.process(stream) | return self.options.postlex.process(stream) | ||||
else: | |||||
return stream | |||||
return stream | |||||
def parse(self, text): | def parse(self, text): | ||||
"Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." | "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." | ||||
@@ -231,4 +233,3 @@ class Lark: | |||||
# else: | # else: | ||||
# l = list(self.lex(text)) | # l = list(self.lex(text)) | ||||
# return self.parser.parse(l) | # return self.parser.parse(l) | ||||
@@ -83,7 +83,7 @@ class Earley(WithLexer): | |||||
class XEarley: | class XEarley: | ||||
def __init__(self, lexer_conf, parser_conf, options=None): | |||||
def __init__(self, lexer_conf, parser_conf, options=None, **kw): | |||||
self.token_by_name = {t.name:t for t in lexer_conf.tokens} | self.token_by_name = {t.name:t for t in lexer_conf.tokens} | ||||
self._prepare_match(lexer_conf) | self._prepare_match(lexer_conf) | ||||
@@ -92,7 +92,8 @@ class XEarley: | |||||
self.match, | self.match, | ||||
resolve_ambiguity=get_ambiguity_resolver(options), | resolve_ambiguity=get_ambiguity_resolver(options), | ||||
ignore=lexer_conf.ignore, | ignore=lexer_conf.ignore, | ||||
predict_all=options.earley__predict_all | |||||
predict_all=options.earley__predict_all, | |||||
**kw | |||||
) | ) | ||||
def match(self, term, text, index=0): | def match(self, term, text, index=0): | ||||
@@ -115,6 +116,11 @@ class XEarley: | |||||
def parse(self, text): | def parse(self, text): | ||||
return self.parser.parse(text) | return self.parser.parse(text) | ||||
class XEarley_CompleteLex(XEarley):
    """XEarley frontend that enables the original dynamic-lexer behavior:
    try *every* possible way of tokenizing the input (complete_lex=True),
    instead of the greedy maximum-length match used by plain XEarley.
    """
    def __init__(self, *args, **kw):
        # BUG FIX: the original called `super(self).__init__(...)`, which is
        # invalid — super() takes the class (not the instance) as its first
        # argument, so that call raises TypeError the moment this frontend is
        # instantiated. Use the explicit two-argument form for Python 2/3
        # compatibility (this codebase still uses `class LarkOptions(object)`).
        super(XEarley_CompleteLex, self).__init__(*args, complete_lex=True, **kw)
class CYK(WithLexer): | class CYK(WithLexer): | ||||
@@ -165,6 +171,8 @@ def get_frontend(parser, lexer): | |||||
return Earley | return Earley | ||||
elif lexer=='dynamic': | elif lexer=='dynamic': | ||||
return XEarley | return XEarley | ||||
elif lexer=='dynamic_complete': | |||||
return XEarley_CompleteLex | |||||
elif lexer=='contextual': | elif lexer=='contextual': | ||||
raise ValueError('The Earley parser does not support the contextual parser') | raise ValueError('The Earley parser does not support the contextual parser') | ||||
else: | else: | ||||
@@ -28,13 +28,15 @@ from ..grammar import NonTerminal, Terminal | |||||
from .earley import ApplyCallbacks, Item, Column | from .earley import ApplyCallbacks, Item, Column | ||||
class Parser: | class Parser: | ||||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): | |||||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False): | |||||
self.analysis = GrammarAnalyzer(parser_conf) | self.analysis = GrammarAnalyzer(parser_conf) | ||||
self.parser_conf = parser_conf | self.parser_conf = parser_conf | ||||
self.resolve_ambiguity = resolve_ambiguity | self.resolve_ambiguity = resolve_ambiguity | ||||
self.ignore = [Terminal(t) for t in ignore] | self.ignore = [Terminal(t) for t in ignore] | ||||
self.predict_all = predict_all | self.predict_all = predict_all | ||||
self.complete_lex = complete_lex | |||||
self.FIRST = self.analysis.FIRST | self.FIRST = self.analysis.FIRST | ||||
self.postprocess = {} | self.postprocess = {} | ||||
@@ -101,12 +103,13 @@ class Parser: | |||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | t = Token(item.expect.name, m.group(0), i, text_line, text_column) | ||||
delayed_matches[m.end()].append(item.advance(t)) | delayed_matches[m.end()].append(item.advance(t)) | ||||
s = m.group(0) | |||||
for j in range(1, len(s)): | |||||
m = match(item.expect, s[:-j]) | |||||
if m: | |||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||||
delayed_matches[i+m.end()].append(item.advance(t)) | |||||
if self.complete_lex: | |||||
s = m.group(0) | |||||
for j in range(1, len(s)): | |||||
m = match(item.expect, s[:-j]) | |||||
if m: | |||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||||
delayed_matches[i+m.end()].append(item.advance(t)) | |||||
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | ||||
next_set.add(delayed_matches[i+1]) | next_set.add(delayed_matches[i+1]) | ||||
@@ -132,7 +135,6 @@ class Parser: | |||||
else: | else: | ||||
text_column += 1 | text_column += 1 | ||||
predict_and_complete(column) | predict_and_complete(column) | ||||
# Parse ended. Now build a parse tree | # Parse ended. Now build a parse tree | ||||
@@ -18,7 +18,7 @@ from io import ( | |||||
logging.basicConfig(level=logging.INFO) | logging.basicConfig(level=logging.INFO) | ||||
from lark.lark import Lark | from lark.lark import Lark | ||||
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, LexError, UnexpectedInput | |||||
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput | |||||
from lark.tree import Tree | from lark.tree import Tree | ||||
from lark.visitors import Transformer | from lark.visitors import Transformer | ||||
@@ -184,6 +184,7 @@ def _make_full_earley_test(LEXER): | |||||
l.parse(program) | l.parse(program) | ||||
@unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") | |||||
def test_earley3(self): | def test_earley3(self): | ||||
"Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" | "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" | ||||
@@ -289,6 +290,7 @@ def _make_full_earley_test(LEXER): | |||||
self.assertEqual(res, expected) | self.assertEqual(res, expected) | ||||
@unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") | |||||
def test_explicit_ambiguity2(self): | def test_explicit_ambiguity2(self): | ||||
grammar = r""" | grammar = r""" | ||||
start: NAME+ | start: NAME+ | ||||
@@ -1175,6 +1177,7 @@ _TO_TEST = [ | |||||
('standard', 'earley'), | ('standard', 'earley'), | ||||
('standard', 'cyk'), | ('standard', 'cyk'), | ||||
('dynamic', 'earley'), | ('dynamic', 'earley'), | ||||
('dynamic_complete', 'earley'), | |||||
('standard', 'lalr'), | ('standard', 'lalr'), | ||||
('contextual', 'lalr'), | ('contextual', 'lalr'), | ||||
# (None, 'earley'), | # (None, 'earley'), | ||||