@@ -20,6 +20,7 @@ class LexerConf(Serialize):
 class ParserConf:
     def __init__(self, rules, callbacks, start):
+        assert isinstance(start, list)
         self.rules = rules
         self.callbacks = callbacks
         self.start = start
@@ -52,7 +52,7 @@ class UnexpectedInput(LarkError):
 class UnexpectedCharacters(LexError, UnexpectedInput):
-    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None):
+    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
         message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
         self.line = line
@@ -65,6 +65,8 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
             message += '\n\n' + self.get_context(seq)
         if allowed:
             message += '\nExpecting: %s\n' % allowed
+        if token_history:
+            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)

         super(UnexpectedCharacters, self).__init__(message)
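The new `token_history` parameter only changes how the error message is assembled. A minimal standalone rehearsal of that assembly (plain strings stand in for lark's `Token` objects, and the `get_context` part is omitted):

```python
# Not lark's actual class -- just the string formatting the hunk above performs.
def build_message(seq, lex_pos, line, column, allowed=None, token_history=None):
    message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
    if allowed:
        message += '\nExpecting: %s\n' % allowed
    if token_history:
        message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
    return message

print(build_message('ab?c', 2, 1, 3, allowed={'NAME'}, token_history=['ab']))
```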
@@ -85,6 +85,9 @@ class LarkOptions(Serialize):
             options[name] = value

+        if isinstance(options['start'], str):
+            options['start'] = [options['start']]
+
         self.__dict__['options'] = options
         assert self.parser in ('earley', 'lalr', 'cyk', None)
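With this normalization, `start='a'` and `start=['a']` become equivalent at option-parsing time, which is what lets `ParserConf` assert that `start` is always a list. A quick sketch of the invariant (hypothetical helper name, same logic as the hunk):

```python
def normalize_start(options):
    # Mirrors the added lines: wrap a lone string in a list so downstream
    # code can treat options['start'] uniformly as a list.
    if isinstance(options['start'], str):
        options['start'] = [options['start']]
    return options

assert normalize_start({'start': 'a'}) == {'start': ['a']}
assert normalize_start({'start': ['a', 'b']}) == {'start': ['a', 'b']}
```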
@@ -149,6 +149,7 @@ class _Lex:
         newline_types = frozenset(newline_types)
         ignore_types = frozenset(ignore_types)
         line_ctr = LineCounter()
+        last_token = None

         while line_ctr.char_pos < len(stream):
             lexer = self.lexer
@@ -166,6 +167,7 @@ class _Lex:
                     t = lexer.callback[t.type](t)
                     if not isinstance(t, Token):
                         raise ValueError("Callbacks must return a token (returned %r)" % t)
+                last_token = t
                 yield t
             else:
                 if type_ in lexer.callback:
@@ -180,7 +182,7 @@ class _Lex:
                         break
             else:
                 allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
-                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state)
+                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

 class UnlessCallback:
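The `last_token and [last_token]` expression is a compact way to pass either `None` (no token lexed yet) or a one-element history list. A standalone check of the idiom:

```python
last_token = None
assert (last_token and [last_token]) is None     # falsy operand short-circuits

last_token = 'TOKEN'
assert (last_token and [last_token]) == ['TOKEN']
```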
@@ -554,7 +554,8 @@ class Grammar:
                           for s in r.expansion
                           if isinstance(s, NonTerminal)
                           and s != r.origin}
-            compiled_rules = [r for r in compiled_rules if r.origin.name==start or r.origin in used_rules]
+            used_rules |= {NonTerminal(s) for s in start}
+            compiled_rules = [r for r in compiled_rules if r.origin in used_rules]

             if len(compiled_rules) == c:
                 break
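The pruning loop keeps a rule only if it is reachable; previously reachability was seeded by the single start name, now by every declared start symbol. A standalone sketch, with plain strings standing in for lark's `NonTerminal`/`Rule` objects:

```python
def prune(rules, start):
    # rules: list of (origin, expansion) pairs; terminals are simply absent.
    while True:
        c = len(rules)
        used = {s for origin, expansion in rules for s in expansion if s != origin}
        used |= set(start)   # the changed line: seed with every start symbol
        rules = [r for r in rules if r[0] in used]
        if len(rules) == c:
            break
    return rules

rules = [('a', []), ('b', ['c']), ('c', []), ('dead', [])]
assert prune(rules, ['a', 'b']) == [('a', []), ('b', ['c']), ('c', [])]
```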
@@ -690,7 +691,7 @@ class GrammarLoader:
         callback = ParseTreeBuilder(rules, ST).create_callback()
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

-        parser_conf = ParserConf(rules, callback, 'start')
+        parser_conf = ParserConf(rules, callback, ['start'])
         self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

         self.canonize_tree = CanonizeTree()
@@ -89,7 +89,7 @@ class Parser(object):
         self.orig_rules = {rule: rule for rule in rules}
         rules = [self._to_rule(rule) for rule in rules]
         self.grammar = to_cnf(Grammar(rules))
-        self.start = NT(start)
+        self.start = NT(start[0])

     def _to_rule(self, lark_rule):
         """Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
@@ -274,7 +274,7 @@ class Parser:
         assert i == len(columns)-1

     def parse(self, stream, start_symbol=None):
-        start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
+        start_symbol = NonTerminal(start_symbol or self.parser_conf.start[0])

         columns = [set()]
         to_scan = set()    # The scan buffer. 'Q' in E.Scott's paper.
@@ -109,7 +109,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug

-        rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])]
+        rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(s), Terminal('$END')]) for s in parser_conf.start]
         self.rules_by_origin = classify(rules, lambda r: r.origin)

         if len(rules) != len(set(rules)):
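Each start symbol now contributes its own synthetic root rule (`$root -> <start> $END`), which is what later yields one accepting state per start symbol. A tuple-based sketch of the augmentation:

```python
def augment(rules, start_symbols):
    # Mirrors the hunk: one '$root -> s $END' rule per start symbol.
    return rules + [('$root', [s, '$END']) for s in start_symbols]

assert augment([('a', ['X'])], ['a', 'b']) == [
    ('a', ['X']),
    ('$root', ['a', '$END']),
    ('$root', ['b', '$END']),
]
```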
@@ -29,10 +29,10 @@ Shift = Action('Shift')
 Reduce = Action('Reduce')

 class ParseTable:
-    def __init__(self, states, start_state, end_state):
+    def __init__(self, states, start_state, end_states):
         self.states = states
         self.start_state = start_state
-        self.end_state = end_state
+        self.end_states = end_states

     def serialize(self, memo):
         tokens = Enumerator()
@@ -48,7 +48,7 @@ class ParseTable:
             'tokens': tokens.reversed(),
             'states': states,
             'start_state': self.start_state,
-            'end_state': self.end_state,
+            'end_states': self.end_states,
         }

     @classmethod
@@ -59,7 +59,7 @@ class ParseTable:
                        for token, (action, arg) in actions.items()}
                   for state, actions in data['states'].items()
         }
-        return cls(states, data['start_state'], data['end_state'])
+        return cls(states, data['start_state'], data['end_states'])

 class IntParseTable(ParseTable):
@@ -77,8 +77,8 @@ class IntParseTable(ParseTable):
         start_state = state_to_idx[parse_table.start_state]
-        end_state = state_to_idx[parse_table.end_state]
-        return cls(int_states, start_state, end_state)
+        end_states = [state_to_idx[s] for s in parse_table.end_states]
+        return cls(int_states, start_state, end_states)

 ###}
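`IntParseTable` renumbers symbolic states as dense integers; the only change is that every end state (one per start symbol) now goes through the same mapping. A quick illustration:

```python
states = {'s0': {}, 's1': {}, 's2': {}}
# Dict keys keep insertion order (Python 3.7+), so indices are stable.
state_to_idx = {s: i for i, s in enumerate(states)}

# One end state per start symbol, all remapped the same way.
end_states = [state_to_idx[s] for s in ['s1', 's2']]
assert end_states == [1, 2]
```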
@@ -130,9 +130,7 @@ class LALR_Analyzer(GrammarAnalyzer):
         for _ in bfs([self.start_state], step):
             pass

-        self.end_state ,= self.end_states
-
-        self._parse_table = ParseTable(self.states, self.start_state, self.end_state)
+        self._parse_table = ParseTable(self.states, self.start_state, self.end_states)

         if self.debug:
             self.parse_table = self._parse_table
@@ -40,7 +40,7 @@ class _Parser:
     def __init__(self, parse_table, callbacks):
         self.states = parse_table.states
         self.start_state = parse_table.start_state
-        self.end_state = parse_table.end_state
+        self.end_states = parse_table.end_states
         self.callbacks = callbacks

     def parse(self, seq, set_state=None):
@@ -81,7 +81,7 @@ class _Parser:
         for token in stream:
             while True:
                 action, arg = get_action(token)
-                assert arg != self.end_state
+                assert arg not in self.end_states

                 if action is Shift:
                     state_stack.append(arg)
@@ -95,7 +95,7 @@ class _Parser:
         while True:
             _action, arg = get_action(token)
             if _action is Shift:
-                assert arg == self.end_state
+                assert arg in self.end_states
                 val ,= value_stack
                 return val
             else:
@@ -1523,6 +1523,15 @@ def _make_parser_test(LEXER, PARSER):
             parser3 = Lark.deserialize(d, namespace, m)
             self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) )

+        def test_multi_start(self):
+            parser = _Lark('''
+                a: "x"
+                b: "x" "b"?
+                ''', start=['a', 'b'])
+
+            # parser.parse('acab')
+            # parser.parse('bcab')
+
         _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()