diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index fba567b..6fd3139 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -2,8 +2,10 @@ from typing import ( TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional, - Literal, Protocol, Tuple, Iterable, + Literal, Protocol, Tuple, Iterable, ) + +from .parsers.lalr_puppet import ParserPuppet from .visitors import Transformer from .lexer import Token, Lexer, TerminalDef from .tree import Tree @@ -12,6 +14,7 @@ from .load_grammar import Grammar _T = TypeVar('_T') + class PostLex(Protocol): def process(self, stream: Iterator[Token]) -> Iterator[Token]: @@ -46,6 +49,7 @@ class PackageResource(object): def __init__(self, pkg_name: str, path: str): ... + class FromPackageLoader: def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... @@ -87,6 +91,9 @@ class Lark: def parse(self, text: str, start: Optional[str] = None, on_error: Callable[[UnexpectedInput], bool] = None) -> Tree: ... + def get_puppet(self, text: str = None, start: Optional[str] = None) -> ParserPuppet: + ... + @classmethod def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T: ... diff --git a/lark-stubs/parsers/lalr_puppet.pyi b/lark-stubs/parsers/lalr_puppet.pyi index 7820dbd..d50ff15 100644 --- a/lark-stubs/parsers/lalr_puppet.pyi +++ b/lark-stubs/parsers/lalr_puppet.pyi @@ -9,9 +9,20 @@ class ParserPuppet(object): Accessible via `UnexpectedToken.puppet` (raised by the parser on token error) """ + parser: Any + parser_state: Any + lexer_state: Any + + def feed_token(self, token: Token) -> Any: ... + def exhaust_lexer(self) -> None: ... + + def feed_eof(self, last_token: Token = None) -> Any: ... + def copy(self) -> ParserPuppet: ... + + def as_immutable(self) -> ImmutableParserPuppet: ... def pretty(self) -> str: ... @@ -20,3 +31,13 @@ class ParserPuppet(object): def accepts(self) -> Set[str]: ... def resume_parse(self) -> Tree: ... 
+ + +class ImmutableParserPuppet(ParserPuppet): + result: Any = None + + def feed_token(self, token: Token) -> ImmutableParserPuppet: ... + + def exhaust_lexer(self) -> ImmutableParserPuppet: ... + + def feed_eof(self, last_token: Token = None) -> ImmutableParserPuppet: ... diff --git a/lark/lark.py b/lark/lark.py index ba98d16..5ea6b5c 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -531,6 +531,9 @@ class Lark(Serialize): def get_terminal(self, name): "Get information about a terminal" return self._terminals_dict[name] + + def get_puppet(self, text=None, start=None): + return self.parser.get_puppet(text, start=start) def parse(self, text, start=None, on_error=None): """Parse the given text, according to the options provided. diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index d98805c..d334947 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -89,18 +89,29 @@ class ParsingFrontend(Serialize): if lexer_conf.postlex: self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) - - - def parse(self, text, start=None, on_error=None): + + def _verify_start(self, start=None): if start is None: start = self.parser_conf.start if len(start) > 1: raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) start ,= start + elif start not in self.parser_conf.start: + raise ConfigurationError("Unknown start rule %s. 
Must be one of %r" % (start, self.parser_conf.start)) + return start + def parse(self, text, start=None, on_error=None): + start = self._verify_start(start) stream = text if self.skip_lexer else LexerThread(self.lexer, text) kw = {} if on_error is None else {'on_error': on_error} return self.parser.parse(stream, start, **kw) + + def get_puppet(self, text=None, start=None): + start = self._verify_start(start) + if self.parser_conf.parser_type != 'lalr': + raise ConfigurationError("Can only create a Puppet for parser='lalr' at the moment.") + stream = text if self.skip_lexer else LexerThread(self.lexer, text) + return self.parser.get_puppet(stream, start) def get_frontend(parser, lexer): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 6fc76ea..a7a4074 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -32,6 +32,10 @@ class LALR_Parser(Serialize): def serialize(self, memo): return self._parse_table.serialize(memo) + + def get_puppet(self, lexer, start): + return self.parser.get_puppet(lexer, start) + def parse(self, lexer, start, on_error=None): try: @@ -158,10 +162,16 @@ class _Parser(object): self.callbacks = callbacks self.debug = debug + def get_puppet(self, lexer, start, value_stack=None, state_stack=None): + parse_conf = ParseConf(self.parse_table, self.callbacks, start) + parser_state = ParserState(parse_conf, lexer, state_stack, value_stack) + return ParserPuppet(self, parser_state, parser_state.lexer) + def parse(self, lexer, start, value_stack=None, state_stack=None): parse_conf = ParseConf(self.parse_table, self.callbacks, start) parser_state = ParserState(parse_conf, lexer, state_stack, value_stack) return self.parse_from_state(parser_state) + def parse_from_state(self, state): # Main LALR-parser loop diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 93ba287..8e5a315 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -23,6 +23,19 @@ class 
ParserPuppet(object): Note that ``token`` has to be an instance of ``Token``. """ return self.parser_state.feed_token(token, token.type == '$END') + + def exhaust_lexer(self): + """Try to feed the rest of the lexer state into the puppet. + + Note that this modifies the puppet in place and does not feed an '$END' Token""" + for token in self.lexer_state.lex(self.parser_state): + self.parser_state.feed_token(token) + + def feed_eof(self, last_token=None): + """Feed a '$END' Token. Borrows from 'last_token' if given.""" + eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1) + return self.feed_token(eof) + def __copy__(self): """Create a new puppet with a separate state. @@ -93,4 +106,13 @@ class ImmutableParserPuppet(ParserPuppet): def feed_token(self, token): c = copy(self) c.result = ParserPuppet.feed_token(c, token) - return c \ No newline at end of file + return c + + def exhaust_lexer(self): + """Try to feed the rest of the lexer state into the puppet. 
+ + Note that this returns a new ImmutableParserPuppet and does not feed an '$END' Token""" + res = copy(self) + for token in res.lexer_state.lex(res.parser_state): + res.parser_state.feed_token(token) + return res diff --git a/tests/test_parser.py b/tests/test_parser.py index 3d18b20..90ee1ee 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2395,6 +2395,42 @@ def _make_parser_test(LEXER, PARSER): """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + @unittest.skipIf(PARSER!='lalr', "Puppet is only implemented for LALR at the moment") + def test_parser_puppet(self): + + g = _Lark(r''' + start: A+ B* + A: "a" + B: "b" + ''') + + puppet = g.get_puppet() + + self.assertRaises(UnexpectedToken, puppet.feed_eof) + self.assertRaises(TypeError, puppet.exhaust_lexer) + puppet.feed_token(Token('A', 'a')) + res = puppet.feed_eof() + self.assertEqual(res, Tree('start', ['a'])) + + puppet = g.get_puppet("ab") + + puppet.exhaust_lexer() + + puppet_copy = puppet.copy() + self.assertEqual(puppet_copy.parser_state, puppet.parser_state) + self.assertEqual(puppet_copy.lexer_state.state, puppet.lexer_state.state) + self.assertIsNot(puppet_copy.parser_state, puppet.parser_state) + self.assertIsNot(puppet_copy.lexer_state.state, puppet.lexer_state.state) + self.assertIsNot(puppet_copy.lexer_state.state.line_ctr, puppet.lexer_state.state.line_ctr) + + res = puppet.feed_eof(puppet.lexer_state.state.last_token) + self.assertEqual(res, Tree('start', ['a', 'b'])) + self.assertRaises(UnexpectedToken, puppet.feed_eof) + + self.assertRaises(UnexpectedToken, puppet_copy.feed_token, Token('A', 'a')) + puppet_copy.feed_token(Token('B', 'b')) + res = puppet_copy.feed_eof() + self.assertEqual(res, Tree('start', ['a', 'b', 'b'])) @unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now") def test_error_with_puppet(self): @@ -2408,14 +2444,6 @@ def _make_parser_test(LEXER, PARSER): # Skip comma return True elif e.token.type ==
'SIGNED_NUMBER': - # Make a copy and ensure it is properly made - puppet_copy = e.puppet.copy() - assert puppet_copy.parser_state == e.puppet.parser_state - assert puppet_copy.lexer_state.state == e.puppet.lexer_state.state - assert puppet_copy.parser_state is not e.puppet.parser_state - assert puppet_copy.lexer_state.state is not e.puppet.lexer_state.state - assert puppet_copy.lexer_state.state.line_ctr is not e.puppet.lexer_state.state.line_ctr - # Try to feed a comma and retry the number e.puppet.feed_token(Token('COMMA', ',')) e.puppet.feed_token(e.token)