Przeglądaj źródła

Merge branch 'find_grammar_errors'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.3
Erez Sh 3 lata temu
rodzic
commit
516d66cc30
7 zmienionych plików z 134 dodań i 41 usunięć
  1. +4
    -0
      lark-stubs/load_grammar.pyi
  2. +2
    -30
      lark/lark.py
  3. +50
    -2
      lark/load_grammar.py
  4. +4
    -6
      lark/parser_frontends.py
  5. +30
    -2
      lark/parsers/lalr_parser.py
  6. +8
    -0
      lark/utils.py
  7. +36
    -1
      tests/test_grammar.py

+ 4
- 0
lark-stubs/load_grammar.pyi Wyświetl plik

@@ -2,6 +2,7 @@ from typing import List, Tuple, Union, Callable, Dict, Optional

from lark import Tree
from lark.grammar import RuleOptions
from lark.exceptions import UnexpectedInput


class Grammar:
@@ -24,3 +25,6 @@ class GrammarBuilder:
def validate(self) -> None: ...

def build(self) -> Grammar: ...


# Parses `text` as a lark grammar, collecting all syntax errors instead of
# raising on the first one; each error is paired with a readable message.
def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: ...

+ 2
- 30
lark/lark.py Wyświetl plik

@@ -1,5 +1,5 @@
from __future__ import absolute_import
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config
from lark.exceptions import ConfigurationError, assert_config

import sys, os, pickle, hashlib
from io import open
@@ -518,35 +518,7 @@ class Lark(Serialize):
result of the transformation. Otherwise, returns a Tree instance.

"""

try:
return self.parser.parse(text, start=start)
except UnexpectedInput as e:
if on_error is None:
raise

while True:
if isinstance(e, UnexpectedCharacters):
s = e.puppet.lexer_state.state
p = s.line_ctr.char_pos

if not on_error(e):
raise e

if isinstance(e, UnexpectedCharacters):
# If user didn't change the character position, then we should
if p == s.line_ctr.char_pos:
s.line_ctr.feed(s.text[p:p+1])

try:
return e.puppet.resume_parse()
except UnexpectedToken as e2:
if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
# Prevent infinite loop
raise e2
e = e2
except UnexpectedCharacters as e2:
e = e2
return self.parser.parse(text, start=start, on_error=on_error)

@property
def source(self):


+ 50
- 2
lark/load_grammar.py Wyświetl plik

@@ -8,7 +8,7 @@ import pkgutil
from ast import literal_eval
from numbers import Integral

from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -16,7 +16,7 @@ from .parser_frontends import ParsingFrontend
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError

from .tree import Tree, SlottedTree as ST
from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive
@@ -853,6 +853,54 @@ def _parse_grammar(text, name, start='start'):
return PrepareGrammar().transform(tree)


def _error_repr(error):
    """Return a short, human-readable description of a grammar parse error."""
    if not isinstance(error, UnexpectedToken):
        return str(error)
    # Prefer the friendlier example-matched message when one is available.
    translated = _translate_parser_exception(_get_parser().parse, error)
    if translated:
        return translated
    candidates = ', '.join(error.accepts or error.expected)
    return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), candidates)

def _search_puppet(puppet, predicate):
    """Breadth-first search over hypothetical token feeds.

    Returns ``(token_path, puppet)`` for the first parser state that
    satisfies *predicate*, or ``None`` when no reachable state does.
    """
    def successors(node):
        path, state = node
        for token_type in state.choices():
            try:
                next_state = state.feed_token(Token(token_type, ''))
            except ParseError:
                continue  # this token is illegal in the current state
            yield path + (token_type,), next_state

    for path, state in bfs_all_unique([((), puppet)], successors):
        if predicate(state):
            return path, state

def find_grammar_errors(text, start='start'):
    """Parse *text* as a grammar and collect every syntax error in it.

    Returns a list of ``(exception, message)`` pairs — at most one per
    source line — instead of raising on the first problem.
    """
    collected = []

    def handle(err):
        collected.append((err, _error_repr(err)))

        # Recover by feeding tokens until a newline becomes acceptable,
        # so parsing can resume on the next grammar line.
        path, _ = _search_puppet(err.puppet.as_immutable(), lambda p: '_NL' in p.choices())
        for token_type in path:
            err.puppet.feed_token(Token(token_type, ''))
        err.puppet.feed_token(Token('_NL', '\n'))
        return True

    _get_parser().parse(text + '\n', start, on_error=handle)

    by_line = classify(collected, lambda item: item[0].line)
    result = [pairs[0] for pairs in by_line.values()]  # already sorted by line

    for exc, _msg in result:
        exc.puppet = None  # drop parser state; it is not part of the public result
    return result


def _get_mangle(prefix, aliases, base_mangle=None):
def mangle(s):
if s in aliases:


+ 4
- 6
lark/parser_frontends.py Wyświetl plik

@@ -101,18 +101,16 @@ class ParsingFrontend(Serialize):
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)


def parse(self, text, start=None):
def parse(self, text, start=None, on_error=None):
if start is None:
start = self.parser_conf.start
if len(start) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start

if self.skip_lexer:
return self.parser.parse(text, start)

lexer_thread = LexerThread(self.lexer, text)
return self.parser.parse(lexer_thread, start)
stream = text if self.skip_lexer else LexerThread(self.lexer, text)
kw = {} if on_error is None else {'on_error': on_error}
return self.parser.parse(stream, start, **kw)


def get_frontend(parser, lexer):


+ 30
- 2
lark/parsers/lalr_parser.py Wyświetl plik

@@ -9,6 +9,7 @@ from ..utils import Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken

###{standalone

@@ -32,8 +33,35 @@ class LALR_Parser(Serialize):
def serialize(self, memo):
return self._parse_table.serialize(memo)

def parse(self, *args):
return self.parser.parse(*args)
def parse(self, lexer, start, on_error=None):
    """Run the LALR parser, optionally recovering from errors.

    When *on_error* is given, each ``UnexpectedInput`` is passed to it;
    a truthy return means "recovered — resume parsing", otherwise the
    error is re-raised to the caller.
    """
    try:
        return self.parser.parse(lexer, start)
    except UnexpectedInput as error:
        if on_error is None:
            raise

        while True:
            if isinstance(error, UnexpectedCharacters):
                lexer_state = error.puppet.lexer_state.state
                pos_before = lexer_state.line_ctr.char_pos

            if not on_error(error):
                raise error

            if isinstance(error, UnexpectedCharacters):
                # If the callback didn't move past the offending character,
                # skip over it ourselves so we don't loop on it forever.
                if pos_before == lexer_state.line_ctr.char_pos:
                    lexer_state.line_ctr.feed(lexer_state.text[pos_before:pos_before + 1])

            try:
                return error.puppet.resume_parse()
            except UnexpectedToken as followup:
                stuck_at_end = (isinstance(error, UnexpectedToken)
                                and error.token.type == followup.token.type == '$END'
                                and error.puppet == followup.puppet)
                if stuck_at_end:
                    # Same failure at end-of-input twice: prevent infinite loop
                    raise followup
                error = followup
            except UnexpectedCharacters as followup:
                error = followup


class ParseConf(object):


+ 8
- 0
lark/utils.py Wyświetl plik

@@ -318,6 +318,14 @@ def bfs(initial, expand):
visited.add(next_node)
open_q.append(next_node)

def bfs_all_unique(initial, expand):
    """Breadth-first traversal that keeps no *visited* set.

    Unlike ``bfs``, repetition tracking is skipped entirely, because the
    caller guarantees every reachable node is produced at most once.
    """
    queue = deque(initial)
    while queue:
        current = queue.popleft()
        yield current
        queue.extend(expand(current))


def _serialize(value, memo):
if isinstance(value, Serialize):


+ 36
- 1
tests/test_grammar.py Wyświetl plik

@@ -4,7 +4,7 @@ import sys
from unittest import TestCase, main

from lark import Lark, Token, Tree
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors
from lark.load_grammar import FromPackageLoader


@@ -160,6 +160,41 @@ class TestGrammar(TestCase):
x = p.parse('12 capybaras')
self.assertEqual(x.children, ['12', 'capybaras'])

def test_find_grammar_errors(self):
    """find_grammar_errors() reports each broken grammar line exactly once."""
    # Bad rule definition on line 3 and a bad terminal on line 5.
    text = """
        a: rule
        b rule
        c: rule
        B.: "hello" f
        D: "okay"
        """

    assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5]

    # The stray alternative on line 4 is reported separately from line 3.
    text = """
        a: rule
        b rule
        | ok
        c: rule
        B.: "hello" f
        D: "okay"
        """

    assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6]

    # Garbage characters inside otherwise-valid definitions (lines 2 and 6).
    text = """
        a: rule @#$#@$@&&
        b: rule
        | ok
        c: rule
        B: "hello" f @
        D: "okay"
        """

    # Fixed: the original called find_grammar_errors(text) twice here and
    # discarded the first result in an unused local.
    assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6]



if __name__ == '__main__':
main()


Ładowanie…
Anuluj
Zapisz