Browse Source

Merge branch 'find_grammar_errors'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.3
Erez Sh 4 years ago
parent
commit
516d66cc30
7 changed files with 134 additions and 41 deletions
  1. +4
    -0
      lark-stubs/load_grammar.pyi
  2. +2
    -30
      lark/lark.py
  3. +50
    -2
      lark/load_grammar.py
  4. +4
    -6
      lark/parser_frontends.py
  5. +30
    -2
      lark/parsers/lalr_parser.py
  6. +8
    -0
      lark/utils.py
  7. +36
    -1
      tests/test_grammar.py

+ 4
- 0
lark-stubs/load_grammar.pyi View File

@@ -2,6 +2,7 @@ from typing import List, Tuple, Union, Callable, Dict, Optional


from lark import Tree from lark import Tree
from lark.grammar import RuleOptions from lark.grammar import RuleOptions
from lark.exceptions import UnexpectedInput




class Grammar: class Grammar:
@@ -24,3 +25,6 @@ class GrammarBuilder:
def validate(self) -> None: ... def validate(self) -> None: ...


def build(self) -> Grammar: ... def build(self) -> Grammar: ...


# Scans a grammar source string and returns (exception, human-readable message)
# pairs for every syntax error found, instead of raising on the first one.
def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: ...

+ 2
- 30
lark/lark.py View File

@@ -1,5 +1,5 @@
from __future__ import absolute_import from __future__ import absolute_import
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config
from lark.exceptions import ConfigurationError, assert_config


import sys, os, pickle, hashlib import sys, os, pickle, hashlib
from io import open from io import open
@@ -518,35 +518,7 @@ class Lark(Serialize):
result of the transformation. Otherwise, returns a Tree instance. result of the transformation. Otherwise, returns a Tree instance.


""" """

try:
return self.parser.parse(text, start=start)
except UnexpectedInput as e:
if on_error is None:
raise

while True:
if isinstance(e, UnexpectedCharacters):
s = e.puppet.lexer_state.state
p = s.line_ctr.char_pos

if not on_error(e):
raise e

if isinstance(e, UnexpectedCharacters):
# If user didn't change the character position, then we should
if p == s.line_ctr.char_pos:
s.line_ctr.feed(s.text[p:p+1])

try:
return e.puppet.resume_parse()
except UnexpectedToken as e2:
if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
# Prevent infinite loop
raise e2
e = e2
except UnexpectedCharacters as e2:
e = e2
return self.parser.parse(text, start=start, on_error=on_error)


@property @property
def source(self): def source(self):


+ 50
- 2
lark/load_grammar.py View File

@@ -8,7 +8,7 @@ import pkgutil
from ast import literal_eval from ast import literal_eval
from numbers import Integral from numbers import Integral


from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique
from .lexer import Token, TerminalDef, PatternStr, PatternRE from .lexer import Token, TerminalDef, PatternStr, PatternRE


from .parse_tree_builder import ParseTreeBuilder from .parse_tree_builder import ParseTreeBuilder
@@ -16,7 +16,7 @@ from .parser_frontends import ParsingFrontend
from .common import LexerConf, ParserConf from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list, Str from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError


from .tree import Tree, SlottedTree as ST from .tree import Tree, SlottedTree as ST
from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive
@@ -853,6 +853,54 @@ def _parse_grammar(text, name, start='start'):
return PrepareGrammar().transform(tree) return PrepareGrammar().transform(tree)




def _error_repr(error):
    """Render a parse error as a short, human-readable one-line message."""
    if not isinstance(error, UnexpectedToken):
        return str(error)
    # Prefer the curated grammar-error message when one matches this parser state.
    translated = _translate_parser_exception(_get_parser().parse, error)
    if translated:
        return translated
    options = ', '.join(error.accepts or error.expected)
    return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), options)

def _search_puppet(puppet, predicate):
    """Breadth-first search over token choices starting from *puppet*.

    Returns the first (token_path, puppet) whose state satisfies *predicate*,
    or None if the search space is exhausted.
    """
    def successors(node):
        prefix, state = node
        for token_type in state.choices():
            try:
                next_state = state.feed_token(Token(token_type, ''))
            except ParseError:
                continue  # this token is illegal in the current state
            yield prefix + (token_type,), next_state

    for prefix, state in bfs_all_unique([((), puppet)], successors):
        if predicate(state):
            return prefix, state

def find_grammar_errors(text, start='start'):
    """Parse *text* as a Lark grammar and collect every syntax error.

    Returns a list of (UnexpectedInput, str) pairs — the exception plus a
    human-readable message — keeping at most one error per source line.
    """
    errors = []
    def on_error(e):
        errors.append((e, _error_repr(e)))

        # recover to a new line: search for the shortest token sequence that
        # brings the parser to a state where a newline is acceptable, then
        # feed those synthetic (empty-valued) tokens followed by the newline.
        token_path, _ = _search_puppet(e.puppet.as_immutable(), lambda p: '_NL' in p.choices())
        for token_type in token_path:
            e.puppet.feed_token(Token(token_type, ''))
        e.puppet.feed_token(Token('_NL', '\n'))
        return True  # tell the parser to resume from the recovered state

    # Trailing newline guarantees the recovery target token can always appear.
    _tree = _get_parser().parse(text + '\n', start, on_error=on_error)

    # Recovery may report several cascading errors on one line; keep only the
    # first error seen per line. Insertion order is preserved, so the result
    # is already sorted by line.
    errors_by_line = classify(errors, lambda e: e[0].line)
    errors = [el[0] for el in errors_by_line.values()]  # already sorted

    for e in errors:
        # Drop the parser-state reference — no longer needed by callers and
        # keeps the returned exceptions lightweight.
        e[0].puppet = None
    return errors


def _get_mangle(prefix, aliases, base_mangle=None): def _get_mangle(prefix, aliases, base_mangle=None):
def mangle(s): def mangle(s):
if s in aliases: if s in aliases:


+ 4
- 6
lark/parser_frontends.py View File

@@ -101,18 +101,16 @@ class ParsingFrontend(Serialize):
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)




def parse(self, text, start=None, on_error=None):
    """Lex (unless skip_lexer) and parse *text*.

    *start* must be given when the grammar was built with multiple start
    rules. *on_error*, when provided, is forwarded to the underlying
    parser as a recovery callback.
    """
    if start is None:
        start = self.parser_conf.start
        if len(start) > 1:
            raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
        start ,= start

    stream = text if self.skip_lexer else LexerThread(self.lexer, text)
    # Only pass on_error when supplied, so parser backends that don't
    # accept that keyword keep working unchanged.
    kw = {} if on_error is None else {'on_error': on_error}
    return self.parser.parse(stream, start, **kw)




def get_frontend(parser, lexer): def get_frontend(parser, lexer):


+ 30
- 2
lark/parsers/lalr_parser.py View File

@@ -9,6 +9,7 @@ from ..utils import Serialize


from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet from .lalr_puppet import ParserPuppet
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken


###{standalone ###{standalone


@@ -32,8 +33,35 @@ class LALR_Parser(Serialize):
def serialize(self, memo): def serialize(self, memo):
return self._parse_table.serialize(memo) return self._parse_table.serialize(memo)


def parse(self, *args):
return self.parser.parse(*args)
def parse(self, lexer, start, on_error=None):
    """Parse the token stream, optionally recovering from errors.

    Without *on_error* this simply delegates and lets exceptions
    propagate. With *on_error*, each UnexpectedInput is passed to the
    callback; while the callback returns True, parsing resumes from the
    exception's puppet (parser state snapshot).
    """
    try:
        return self.parser.parse(lexer, start)
    except UnexpectedInput as e:
        if on_error is None:
            raise

        while True:
            if isinstance(e, UnexpectedCharacters):
                # Snapshot the lexer position so we can tell below
                # whether the callback advanced it.
                s = e.puppet.lexer_state.state
                p = s.line_ctr.char_pos

            if not on_error(e):
                raise e

            if isinstance(e, UnexpectedCharacters):
                # If user didn't change the character position, then we should
                # skip past the offending character ourselves, otherwise
                # resuming would hit the same error forever.
                if p == s.line_ctr.char_pos:
                    s.line_ctr.feed(s.text[p:p+1])

            try:
                return e.puppet.resume_parse()
            except UnexpectedToken as e2:
                if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                    # Prevent infinite loop: same $END failure in the same
                    # state means no progress is possible.
                    raise e2
                e = e2
            except UnexpectedCharacters as e2:
                e = e2




class ParseConf(object): class ParseConf(object):


+ 8
- 0
lark/utils.py View File

@@ -318,6 +318,14 @@ def bfs(initial, expand):
visited.add(next_node) visited.add(next_node)
open_q.append(next_node) open_q.append(next_node)


def bfs_all_unique(initial, expand):
    """Breadth-first traversal without a 'visited' set.

    Behaves like bfs(), but skips the seen-node bookkeeping entirely;
    the caller guarantees expand() can never produce a repeated node.
    """
    queue = deque(initial)
    while queue:
        current = queue.popleft()
        yield current
        queue.extend(expand(current))



def _serialize(value, memo): def _serialize(value, memo):
if isinstance(value, Serialize): if isinstance(value, Serialize):


+ 36
- 1
tests/test_grammar.py View File

@@ -4,7 +4,7 @@ import sys
from unittest import TestCase, main from unittest import TestCase, main


from lark import Lark, Token, Tree from lark import Lark, Token, Tree
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors
from lark.load_grammar import FromPackageLoader from lark.load_grammar import FromPackageLoader




@@ -160,6 +160,41 @@ class TestGrammar(TestCase):
x = p.parse('12 capybaras') x = p.parse('12 capybaras')
self.assertEqual(x.children, ['12', 'capybaras']) self.assertEqual(x.children, ['12', 'capybaras'])


def test_find_grammar_errors(self):
    """find_grammar_errors() reports one error per broken line, by line number."""
    # Missing colon on 'b rule' (line 3) and a template marker without a
    # priority on 'B.:' (line 5).
    text = """
    a: rule
    b rule
    c: rule
    B.: "hello" f
    D: "okay"
    """

    assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5]

    # An alternative continuation after the broken 'b rule' adds an error
    # on line 4 as well.
    text = """
    a: rule
    b rule
    | ok
    c: rule
    B.: "hello" f
    D: "okay"
    """

    assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6]

    # Garbage characters inside otherwise valid rule/terminal definitions.
    text = """
    a: rule @#$#@$@&&
    b: rule
    | ok
    c: rule
    B: "hello" f @
    D: "okay"
    """

    # Call once and assert on the stored result (the original computed the
    # result, discarded it, and redundantly called the function again).
    x = find_grammar_errors(text)
    assert [e.line for e, _s in x] == [2, 6]




if __name__ == '__main__': if __name__ == '__main__':
main() main()


Loading…
Cancel
Save