Patch: add contextual-lexer support to the LALR parser (lark-parser/lark, v0.5.1-era); also touches the python_parser example.
@@ -3,6 +3,7 @@ | |||||
import re | import re | ||||
from .utils import Str, classify | from .utils import Str, classify | ||||
from .common import is_terminal | |||||
class LexError(Exception): | class LexError(Exception): | ||||
pass | pass | ||||
@@ -169,3 +170,64 @@ class Lexer(object): | |||||
if lex_pos < len(stream): | if lex_pos < len(stream): | ||||
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) | raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) | ||||
break | break | ||||
class ContextualLexer:
    """A lexer that chooses its terminal set from the parser's current state.

    For every parser state we build a sub-lexer that only knows the terminals
    acceptable in that state (plus the ignored terminals), so tokenization
    follows the grammar context instead of matching every terminal everywhere.
    """

    def __init__(self, tokens, states, ignore=()):
        # tokens: token definitions with unique .name attributes.
        # states: maps parser-state id -> iterable of accepted symbol names.
        # ignore: names of terminals to match but not emit.
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name
            tokens_by_name[t.name] = t

        # States that accept the same token set share a single Lexer instance.
        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = list(accepts)   # For python3
                accepts += ignore
                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
                #     accepts.append('_NEWLINE')   # XXX hack for now
                # Keep only real terminals; '$end' is a parser-only marker.
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        # Lexer over ALL tokens; used below as the source of shared settings.
        self.root_lexer = Lexer(tokens, ignore=ignore)

    def lex(self, stream, parser):
        """Tokenize `stream`, re-selecting the per-state sub-lexer from
        `parser.state` before every match.

        Yields Token objects; raises UnexpectedInput when no terminal of the
        current state matches at the current position.
        """
        lex_pos = 0
        line = 1
        col_start_pos = 0
        # Newline/ignore handling is uniform across states, so take it once
        # from the root lexer rather than per sub-lexer.
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            # NOTE(review): parser.state is expected to advance as the consumer
            # feeds yielded tokens to the parser — confirm the parser sets it.
            lexer = self.lexers[parser.state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in lexer.callback:
                            # User callback may transform (or replace) the token.
                            t = lexer.callback[t.type](t)
                        # Yielding here drives the parser one step, which may
                        # change parser.state before the next iteration.
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # No pattern matched: either genuine error, or clean EOF.
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
@@ -1,7 +1,7 @@ | |||||
import re | import re | ||||
import sre_parse | import sre_parse | ||||
from .lexer import Lexer | |||||
from .lexer import Lexer, ContextualLexer | |||||
from .parsers.lalr_analysis import GrammarAnalyzer | from .parsers.lalr_analysis import GrammarAnalyzer | ||||
from .common import is_terminal, GrammarError | from .common import is_terminal, GrammarError | ||||
@@ -31,6 +31,25 @@ class LALR(WithLexer): | |||||
tokens = list(self.lex(text)) | tokens = list(self.lex(text)) | ||||
return self.parser.parse(tokens) | return self.parser.parse(tokens) | ||||
class LALR_ContextualLexer:
    """LALR(1) parsing engine wired to a contextual lexer.

    The lexer consults the parser's current state, so only terminals the
    parser can actually accept in that state are considered for matching.
    """

    def __init__(self, lexer_conf, parser_conf):
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf

        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.analyzer.analyze()

        # For each parser state, the set of symbols it has transitions on —
        # this is exactly what the contextual lexer needs per state.
        accepts_by_state = {
            state: transitions.keys()
            for state, transitions in self.analyzer.states_idx.items()
        }
        self.lexer = ContextualLexer(lexer_conf.tokens, accepts_by_state,
                                     ignore=lexer_conf.ignore)

    def parse(self, text):
        # Fresh parser per parse; set_state=True makes it publish its current
        # state so the lexer generator can follow along token by token.
        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
        token_stream = self.lexer.lex(text, parser)
        return parser.parse(token_stream, True)
class Earley(WithLexer): | class Earley(WithLexer): | ||||
def __init__(self, lexer_conf, parser_conf): | def __init__(self, lexer_conf, parser_conf): | ||||
WithLexer.__init__(self, lexer_conf) | WithLexer.__init__(self, lexer_conf) | ||||
@@ -82,4 +101,4 @@ class Earley_NoLex: | |||||
assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' | assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' | ||||
return res[0] | return res[0] | ||||
# Maps the user-facing `parser=` option name to its engine implementation.
ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
@@ -7,13 +7,14 @@ class Parser(object): | |||||
self.analysis = analysis | self.analysis = analysis | ||||
self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) | self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) | ||||
for rule in analysis.rules} | for rule in analysis.rules} | ||||
self.state = self.analysis.init_state_idx | |||||
def parse(self, seq): | |||||
def parse(self, stream, set_state=False): # XXX no set_state | |||||
stream = iter(stream) | |||||
states_idx = self.analysis.states_idx | states_idx = self.analysis.states_idx | ||||
state_stack = [self.analysis.init_state_idx] | state_stack = [self.analysis.init_state_idx] | ||||
value_stack = [] | value_stack = [] | ||||
i = 0 | |||||
def get_action(key): | def get_action(key): | ||||
state = state_stack[-1] | state = state_stack[-1] | ||||
@@ -21,13 +22,8 @@ class Parser(object): | |||||
return states_idx[state][key] | return states_idx[state][key] | ||||
except KeyError: | except KeyError: | ||||
expected = states_idx[state].keys() | expected = states_idx[state].keys() | ||||
try: | |||||
token = seq[i] | |||||
except IndexError: | |||||
assert key == '$end' | |||||
token = seq[-1] | |||||
raise UnexpectedToken(token, expected, seq, i) | |||||
raise UnexpectedToken(token, expected, [], 0) | |||||
def reduce(rule, size): | def reduce(rule, size): | ||||
if size: | if size: | ||||
@@ -48,15 +44,20 @@ class Parser(object): | |||||
value_stack.append(res) | value_stack.append(res) | ||||
# Main LALR-parser loop | # Main LALR-parser loop | ||||
while i < len(seq): | |||||
action, arg = get_action(seq[i].type) | |||||
if action == ACTION_SHIFT: | |||||
state_stack.append(arg) | |||||
value_stack.append(seq[i]) | |||||
i+= 1 | |||||
else: | |||||
reduce(*arg) | |||||
try: | |||||
token = next(stream) | |||||
while True: | |||||
action, arg = get_action(token.type) | |||||
if action == ACTION_SHIFT: | |||||
state_stack.append(arg) | |||||
value_stack.append(token) | |||||
if set_state: self.state = arg | |||||
token = next(stream) | |||||
else: | |||||
reduce(*arg) | |||||
except StopIteration: | |||||
pass | |||||
while True: | while True: | ||||
_action, rule = get_action('$end') | _action, rule = get_action('$end') | ||||
@@ -5,7 +5,7 @@ import logging | |||||
from .test_trees import TestTrees | from .test_trees import TestTrees | ||||
# from .test_selectors import TestSelectors | # from .test_selectors import TestSelectors | ||||
from .test_parser import TestLalr, TestEarley, TestParsers | |||||
from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers | |||||
# from .test_grammars import TestPythonG, TestConfigG | # from .test_grammars import TestPythonG, TestConfigG | ||||
logging.basicConfig(level=logging.INFO) | logging.basicConfig(level=logging.INFO) | ||||
@@ -356,11 +356,10 @@ def _make_parser_test(PARSER): | |||||
_TestParser.__name__ = _NAME | _TestParser.__name__ = _NAME | ||||
globals()[_NAME] = _TestParser | globals()[_NAME] = _TestParser | ||||
# Generate one TestCase class per parser engine (registered via globals()
# inside _make_parser_test, so unittest discovery picks them up).
for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
    _make_parser_test(PARSER)
if __name__ == '__main__': | if __name__ == '__main__': | ||||
unittest.main() | unittest.main() | ||||