| @@ -2,6 +2,7 @@ from typing import List, Tuple, Union, Callable, Dict, Optional | |||||
| from lark import Tree | from lark import Tree | ||||
| from lark.grammar import RuleOptions | from lark.grammar import RuleOptions | ||||
| from lark.exceptions import UnexpectedInput | |||||
| class Grammar: | class Grammar: | ||||
| @@ -24,3 +25,6 @@ class GrammarBuilder: | |||||
| def validate(self) -> None: ... | def validate(self) -> None: ... | ||||
| def build(self) -> Grammar: ... | def build(self) -> Grammar: ... | ||||
# Stub signature for load_grammar.find_grammar_errors: returns one
# (exception, human-readable message) pair per offending grammar line.
def find_grammar_errors(text: str, start: str = 'start') -> List[Tuple[UnexpectedInput, str]]: ...
| @@ -1,5 +1,5 @@ | |||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||
| from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config | |||||
| from lark.exceptions import ConfigurationError, assert_config | |||||
| import sys, os, pickle, hashlib | import sys, os, pickle, hashlib | ||||
| from io import open | from io import open | ||||
| @@ -518,35 +518,7 @@ class Lark(Serialize): | |||||
| result of the transformation. Otherwise, returns a Tree instance. | result of the transformation. Otherwise, returns a Tree instance. | ||||
| """ | """ | ||||
| try: | |||||
| return self.parser.parse(text, start=start) | |||||
| except UnexpectedInput as e: | |||||
| if on_error is None: | |||||
| raise | |||||
| while True: | |||||
| if isinstance(e, UnexpectedCharacters): | |||||
| s = e.puppet.lexer_state.state | |||||
| p = s.line_ctr.char_pos | |||||
| if not on_error(e): | |||||
| raise e | |||||
| if isinstance(e, UnexpectedCharacters): | |||||
| # If user didn't change the character position, then we should | |||||
| if p == s.line_ctr.char_pos: | |||||
| s.line_ctr.feed(s.text[p:p+1]) | |||||
| try: | |||||
| return e.puppet.resume_parse() | |||||
| except UnexpectedToken as e2: | |||||
| if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: | |||||
| # Prevent infinite loop | |||||
| raise e2 | |||||
| e = e2 | |||||
| except UnexpectedCharacters as e2: | |||||
| e = e2 | |||||
| return self.parser.parse(text, start=start, on_error=on_error) | |||||
| @property | @property | ||||
| def source(self): | def source(self): | ||||
| @@ -8,7 +8,7 @@ import pkgutil | |||||
| from ast import literal_eval | from ast import literal_eval | ||||
| from numbers import Integral | from numbers import Integral | ||||
| from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start | |||||
| from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique | |||||
| from .lexer import Token, TerminalDef, PatternStr, PatternRE | from .lexer import Token, TerminalDef, PatternStr, PatternRE | ||||
| from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
| @@ -16,7 +16,7 @@ from .parser_frontends import ParsingFrontend | |||||
| from .common import LexerConf, ParserConf | from .common import LexerConf, ParserConf | ||||
| from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol | from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol | ||||
| from .utils import classify, suppress, dedup_list, Str | from .utils import classify, suppress, dedup_list, Str | ||||
| from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken | |||||
| from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError | |||||
| from .tree import Tree, SlottedTree as ST | from .tree import Tree, SlottedTree as ST | ||||
| from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive | from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive | ||||
| @@ -853,6 +853,54 @@ def _parse_grammar(text, name, start='start'): | |||||
| return PrepareGrammar().transform(tree) | return PrepareGrammar().transform(tree) | ||||
def _error_repr(error):
    """Return a one-line, human-readable description of a grammar parse error."""
    if not isinstance(error, UnexpectedToken):
        return str(error)
    # Prefer the curated, example-matched message when one applies.
    error2 = _translate_parser_exception(_get_parser().parse, error)
    if error2:
        return error2
    expected = ', '.join(error.accepts or error.expected)
    return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), expected)
def _search_puppet(puppet, predicate):
    """Breadth-first search over parser states reachable from ``puppet``.

    Returns a ``(token_path, state)`` pair for the first reachable state
    satisfying ``predicate``, where ``token_path`` is the tuple of token
    types fed to get there.  Returns None when no such state exists.
    """
    def _successors(node):
        prefix, state = node
        for token_type in state.choices():
            probe = Token(token_type, '')
            try:
                next_state = state.feed_token(probe)
            except ParseError:
                # Token is illegal in this state; prune the branch.
                continue
            yield prefix + (token_type,), next_state

    for prefix, state in bfs_all_unique([((), puppet)], _successors):
        if predicate(state):
            return prefix, state
def find_grammar_errors(text, start='start'):
    """Parse ``text`` as a Lark grammar, collecting every syntax error.

    Returns a list of ``(exception, message)`` pairs — at most one per
    offending source line, in line order.
    """
    collected = []

    def on_error(e):
        collected.append((e, _error_repr(e)))

        # Recover by feeding whatever token types lead the parser to a state
        # that accepts a newline, then feed the newline — this skips the rest
        # of the broken line so later lines can still be checked.
        token_path, _ = _search_puppet(e.puppet.as_immutable(), lambda p: '_NL' in p.choices())
        for tok_type in token_path:
            e.puppet.feed_token(Token(tok_type, ''))
        e.puppet.feed_token(Token('_NL', '\n'))
        return True

    _get_parser().parse(text + '\n', start, on_error=on_error)

    by_line = classify(collected, lambda item: item[0].line)
    first_per_line = [group[0] for group in by_line.values()]  # already sorted

    for err, _msg in first_per_line:
        err.puppet = None  # don't leak parser internals to the caller
    return first_per_line
| def _get_mangle(prefix, aliases, base_mangle=None): | def _get_mangle(prefix, aliases, base_mangle=None): | ||||
| def mangle(s): | def mangle(s): | ||||
| if s in aliases: | if s in aliases: | ||||
| @@ -101,18 +101,16 @@ class ParsingFrontend(Serialize): | |||||
| self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) | self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) | ||||
def parse(self, text, start=None, on_error=None):
    """Lex (unless skip_lexer) and parse ``text``.

    ``on_error`` is forwarded to the underlying parser only when given, so
    parsers that don't support it keep working unchanged.
    """
    if start is None:
        start = self.parser_conf.start
        if len(start) > 1:
            raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
        start ,= start

    if self.skip_lexer:
        stream = text
    else:
        stream = LexerThread(self.lexer, text)

    extra = {}
    if on_error is not None:
        extra['on_error'] = on_error
    return self.parser.parse(stream, start, **extra)
| def get_frontend(parser, lexer): | def get_frontend(parser, lexer): | ||||
| @@ -9,6 +9,7 @@ from ..utils import Serialize | |||||
| from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable | from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable | ||||
| from .lalr_puppet import ParserPuppet | from .lalr_puppet import ParserPuppet | ||||
| from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken | |||||
| ###{standalone | ###{standalone | ||||
| @@ -32,8 +33,35 @@ class LALR_Parser(Serialize): | |||||
| def serialize(self, memo): | def serialize(self, memo): | ||||
| return self._parse_table.serialize(memo) | return self._parse_table.serialize(memo) | ||||
def parse(self, lexer, start, on_error=None):
    """Run the LALR parser; with ``on_error``, attempt error recovery.

    ``on_error`` receives each UnexpectedInput exception.  A falsy return
    re-raises it; a truthy return resumes parsing from the failure point
    via the parser puppet.
    """
    try:
        return self.parser.parse(lexer, start)
    except UnexpectedInput as err:
        if on_error is None:
            raise
        while True:
            if isinstance(err, UnexpectedCharacters):
                lexer_state = err.puppet.lexer_state.state
                pos_before = lexer_state.line_ctr.char_pos
            if not on_error(err):
                raise err
            if isinstance(err, UnexpectedCharacters):
                # If the handler didn't advance past the bad character,
                # consume it ourselves so resuming can make progress.
                if pos_before == lexer_state.line_ctr.char_pos:
                    lexer_state.line_ctr.feed(lexer_state.text[pos_before:pos_before + 1])
            try:
                return err.puppet.resume_parse()
            except UnexpectedToken as err2:
                if (isinstance(err, UnexpectedToken)
                        and err.token.type == err2.token.type == '$END'
                        and err.puppet == err2.puppet):
                    # Same end-of-input failure in the same parser state:
                    # recovery made no progress — prevent an infinite loop.
                    raise err2
                err = err2
            except UnexpectedCharacters as err2:
                err = err2
| class ParseConf(object): | class ParseConf(object): | ||||
| @@ -318,6 +318,14 @@ def bfs(initial, expand): | |||||
| visited.add(next_node) | visited.add(next_node) | ||||
| open_q.append(next_node) | open_q.append(next_node) | ||||
def bfs_all_unique(initial, expand):
    """Breadth-first traversal that assumes nodes never repeat.

    Unlike ``bfs``, no ``visited`` set is kept: the caller guarantees that
    ``expand`` can never produce the same node twice across the whole walk.

    Yields nodes in BFS order, starting with those in ``initial``.
    """
    # deque() accepts any iterable directly — no intermediate list() copy.
    open_q = deque(initial)
    while open_q:
        node = open_q.popleft()
        yield node
        open_q.extend(expand(node))
| def _serialize(value, memo): | def _serialize(value, memo): | ||||
| if isinstance(value, Serialize): | if isinstance(value, Serialize): | ||||
| @@ -4,7 +4,7 @@ import sys | |||||
| from unittest import TestCase, main | from unittest import TestCase, main | ||||
| from lark import Lark, Token, Tree | from lark import Lark, Token, Tree | ||||
| from lark.load_grammar import GrammarError, GRAMMAR_ERRORS | |||||
| from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors | |||||
| from lark.load_grammar import FromPackageLoader | from lark.load_grammar import FromPackageLoader | ||||
| @@ -160,6 +160,41 @@ class TestGrammar(TestCase): | |||||
| x = p.parse('12 capybaras') | x = p.parse('12 capybaras') | ||||
| self.assertEqual(x.children, ['12', 'capybaras']) | self.assertEqual(x.children, ['12', 'capybaras']) | ||||
| def test_find_grammar_errors(self): | |||||
| text = """ | |||||
| a: rule | |||||
| b rule | |||||
| c: rule | |||||
| B.: "hello" f | |||||
| D: "okay" | |||||
| """ | |||||
| assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5] | |||||
| text = """ | |||||
| a: rule | |||||
| b rule | |||||
| | ok | |||||
| c: rule | |||||
| B.: "hello" f | |||||
| D: "okay" | |||||
| """ | |||||
| assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6] | |||||
| text = """ | |||||
| a: rule @#$#@$@&& | |||||
| b: rule | |||||
| | ok | |||||
| c: rule | |||||
| B: "hello" f @ | |||||
| D: "okay" | |||||
| """ | |||||
| x = find_grammar_errors(text) | |||||
| assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| main() | main() | ||||