Browse Source

Merge branch 'master' of https://github.com/lark-parser/lark into earley_custom

 Conflicts:
	tests/test_parser.py
MegaIng1 3 years ago
parent
commit
26e03b9ff8
16 changed files with 179 additions and 42 deletions
  1. +2 -2    README.md
  2. +79 -0   examples/advanced/error_reporting_earley.py
  3. +1 -1    examples/advanced/error_reporting_lalr.py
  4. +1 -1    lark-stubs/lark.pyi
  5. +4 -1    lark-stubs/lexer.pyi
  6. +1 -1    lark/__init__.py
  7. +23 -11  lark/exceptions.py
  8. +8 -7    lark/lexer.py
  9. +0 -3    lark/parser_frontends.py
  10. +10 -6  lark/parsers/earley.py
  11. +8 -2   lark/parsers/lalr_parser.py
  12. +2 -1   lark/parsers/xearley.py
  13. +9 -1   lark/tree_matcher.py
  14. +2 -2   setup.py
  15. +1 -0   tests/__main__.py
  16. +28 -3  tests/test_parser.py

+2 -2   README.md   View File

@@ -106,7 +106,7 @@ Lark is great at handling ambiguity. Here is the result of parsing the phrase "f
- MyPy support using type stubs
- And much more!

-See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)
+See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features.html)


### Comparison to other libraries
@@ -132,7 +132,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail
|:--------|:----------|:----|:--------|:------------|:------------|:----------|:----------
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) |
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No |
-| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No |
+| [PyParsing](https://github.com/pyparsing/pyparsing) | PEG | Combinators | No | No | No\* | No | No |
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No |
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No |
| [ANTLR](https://github.com/antlr/antlr4) | LL(*) | EBNF | Yes | No | Yes? | Yes | No |


+79 -0   examples/advanced/error_reporting_earley.py   View File

@@ -0,0 +1,79 @@
"""
Example-Driven Error Reporting
==============================

A demonstration of example-driven error reporting with the Earley parser
(See also: error_reporting_lalr.py)
"""
from lark import Lark, UnexpectedInput

from _json_parser import json_grammar # Using the grammar from the json_parser example

json_parser = Lark(json_grammar)

class JsonSyntaxError(SyntaxError):
def __str__(self):
context, line, column = self.args
return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)

class JsonMissingValue(JsonSyntaxError):
label = 'Missing Value'

class JsonMissingOpening(JsonSyntaxError):
label = 'Missing Opening'

class JsonMissingClosing(JsonSyntaxError):
label = 'Missing Closing'

class JsonMissingComma(JsonSyntaxError):
label = 'Missing Comma'

class JsonTrailingComma(JsonSyntaxError):
label = 'Trailing Comma'


def parse(json_text):
try:
j = json_parser.parse(json_text)
except UnexpectedInput as u:
exc_class = u.match_examples(json_parser.parse, {
JsonMissingOpening: ['{"foo": ]}',
'{"foor": }}',
'{"foo": }'],
JsonMissingClosing: ['{"foo": [}',
'{',
'{"a": 1',
'[1'],
JsonMissingComma: ['[1 2]',
'[false 1]',
'["b" 1]',
'{"a":true 1:4}',
'{"a":1 1:4}',
'{"a":"b" 1:4}'],
JsonTrailingComma: ['[,]',
'[1,]',
'[1,2,]',
'{"foo":1,}',
'{"foo":false,"bar":true,}']
}, use_accepts=True)
if not exc_class:
raise
raise exc_class(u.get_context(json_text), u.line, u.column)


def test():
try:
parse('{"example1": "value"')
except JsonMissingClosing as e:
print(e)

try:
parse('{"example2": ] ')
except JsonMissingOpening as e:
print(e)


if __name__ == '__main__':
test()
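As a quick illustration of how match_examples() decides (a sketch reusing json_parser from the file above): an input always matches an example identical to itself, because both parses fail in the same parser state.

try:
    json_parser.parse('{"foo": }')
except UnexpectedInput as u:
    label = u.match_examples(json_parser.parse, {'missing-value': ['{"foo": }']},
                             use_accepts=True)
    print(label)  # 'missing-value' -- identical input, identical failure state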



+1 -1   examples/advanced/error_reporting_lalr.py   View File

@@ -3,7 +3,7 @@ Example-Driven Error Reporting
==============================

A demonstration of example-driven error reporting with the LALR parser
(See also: error_reporting_earley.py)
"""
from lark import Lark, UnexpectedInput



+1 -1   lark-stubs/lark.pyi   View File

@@ -63,7 +63,7 @@ class Lark:
        *,
        start: Union[None, str, List[str]] = "start",
        parser: Literal["earley", "lalr", "cyk"] = "auto",
-        lexer: Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Lexer] = "auto",
+        lexer: Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]] = "auto",
        transformer: Optional[Transformer] = None,
        postlex: Optional[PostLex] = None,
        ambiguity: Literal["explicit", "resolve"] = "resolve",
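In other words, lexer= accepts a Lexer subclass rather than an instance; Lark instantiates it with the lexer configuration itself. A minimal sketch of what the corrected stub permits (the UpperLexer class is made up for illustration):

from lark import Lark, Token
from lark.lexer import Lexer

class UpperLexer(Lexer):  # hypothetical custom lexer, passed as a class
    def __init__(self, lexer_conf):
        pass

    def lex(self, data):
        for ch in data:
            yield Token('CHAR', ch.upper())

parser = Lark('start: CHAR+\n%declare CHAR', parser='lalr', lexer=UpperLexer)
print(parser.parse('ab'))  # Tree('start', [Token('CHAR', 'A'), Token('CHAR', 'B')])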


+4 -1   lark-stubs/lexer.pyi   View File

@@ -85,6 +85,9 @@ class Token(str):
    end_column: int
    end_pos: int

+    def __init__(self, type_: str, value: Any, pos_in_stream: int = None, line: int = None, column: int = None, end_line: int = None, end_column: int = None, end_pos: int = None):
+        ...
+
    def update(self, type_: Optional[str] = None, value: Optional[str] = None) -> Token:
        ...

@@ -136,7 +139,7 @@ class TraditionalLexer(Lexer):
    def lex(self, stream: str) -> Iterator[Token]:
        ...

-    def next_token(self, lex_state: Any) -> Token:
+    def next_token(self, lex_state: Any, parser_state: Any = None) -> Token:
        ...

class ContextualLexer(Lexer):
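The added __init__ stub documents the constructor Token (a str subclass) already accepts at runtime; for instance:

from lark import Token

t = Token('NUMBER', '42', pos_in_stream=0, line=1, column=1)
assert t == '42'           # Token compares like the underlying string
assert t.type == 'NUMBER'  # while carrying lexer metadata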


+1 -1   lark/__init__.py   View File

@@ -3,7 +3,7 @@ from .tree import Tree
from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive
from .visitors import InlineTransformer, inline_args  # XXX Deprecated
from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
-                         UnexpectedInput, UnexpectedCharacters, LarkError)
+                         UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError)
from .lexer import Token
from .lark import Lark



+23 -11   lark/exceptions.py   View File

@@ -19,14 +19,6 @@ class LexError(LarkError):
    pass


-class UnexpectedEOF(ParseError):
-    def __init__(self, expected):
-        self.expected = expected
-
-        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
-        super(UnexpectedEOF, self).__init__(message)
-
-
class UnexpectedInput(LarkError):
    """UnexpectedInput Error.

@@ -47,6 +39,7 @@ class UnexpectedInput(LarkError):
        The parser doesn't hold a copy of the text it has to parse,
        so you have to provide it again
        """
+        assert self.pos_in_stream is not None, self
        pos = self.pos_in_stream
        start = max(pos - span, 0)
        end = pos + span
@@ -91,7 +84,7 @@ class UnexpectedInput(LarkError):
                    parse_fn(malformed)
                except UnexpectedInput as ut:
                    if ut.state == self.state:
-                        if use_accepts and ut.accepts != self.accepts:
+                        if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
                            logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
                                         (self.state, self.accepts, ut.accepts, i, j))
                            continue
@@ -108,15 +101,29 @@ class UnexpectedInput(LarkError):

                        except AttributeError:
                            pass
-                        if not candidate[0]:
+                        if candidate[0] is None:
                            logger.debug("Same State match at example [%s][%s]" % (i, j))
                            candidate = label, False

        return candidate[0]

+class UnexpectedEOF(ParseError, UnexpectedInput):
+    def __init__(self, expected, state=None):
+        self.expected = expected
+        self.state = state
+        from .lexer import Token
+        self.token = Token("<EOF>", "")  # , line=-1, column=-1, pos_in_stream=-1)
+        self.pos_in_stream = -1
+        self.line = -1
+        self.column = -1
+
+        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
+        super(UnexpectedEOF, self).__init__(message)
+

class UnexpectedCharacters(LexError, UnexpectedInput):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+        # TODO considered_tokens and allowed can be figured out using state
        self.line = line
        self.column = column
        self.pos_in_stream = lex_pos
@@ -147,7 +154,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):

    see: :ref:`ParserPuppet`.
    """
-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
+        # TODO considered_rules and expected can be figured out using state
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -157,6 +165,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
        self.expected = expected  # XXX deprecate? `accepts` is better
        self.considered_rules = considered_rules
        self.puppet = puppet
+        self.token_history = token_history

        # TODO Only calculate `accepts()` when we need to display it to the user
        # This will improve performance when doing automatic error handling
@@ -166,6 +175,9 @@ class UnexpectedToken(ParseError, UnexpectedInput):
                   "Expected one of: \n\t* %s\n"
                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

+        if self.token_history:
+            message += "Previous tokens: %r\n" % token_history
+
        super(UnexpectedToken, self).__init__(message)
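The practical effect of reparenting UnexpectedEOF under UnexpectedInput: it now carries a synthetic <EOF> token and placeholder positions, so generic UnexpectedInput handlers (and match_examples()) see it too. A small sketch, assuming the default Earley parser:

from lark import Lark, UnexpectedInput

parser = Lark('start: "a" "b"')      # Earley by default
try:
    parser.parse('a')                # input ends before "b" arrives
except UnexpectedInput as u:         # UnexpectedEOF is caught here as well
    print(type(u).__name__, u.line, u.column)   # e.g. UnexpectedEOF -1 -1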




+8 -7   lark/lexer.py   View File

@@ -338,12 +338,12 @@ class TraditionalLexer(Lexer):
        if m:
            return m.group(0), type_from_index[m.lastindex]

-    def lex(self, state, _parser_state):
+    def lex(self, state, parser_state):
        with suppress(EOFError):
            while True:
-                yield self.next_token(state)
+                yield self.next_token(state, parser_state)

-    def next_token(self, lex_state):
+    def next_token(self, lex_state, parser_state=None):
        line_ctr = lex_state.line_ctr
        while line_ctr.char_pos < len(lex_state.text):
            res = self.match(lex_state.text, line_ctr.char_pos)
@@ -352,7 +352,8 @@ class TraditionalLexer(Lexer):
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
-                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
+                                           state=parser_state)

            value, type_ = res
@@ -428,14 +429,14 @@ class ContextualLexer(Lexer):
        try:
            while True:
                lexer = self.lexers[parser_state.position]
-                yield lexer.next_token(lexer_state)
+                yield lexer.next_token(lexer_state, parser_state)
        except EOFError:
            pass
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
-            token = self.root_lexer.next_token(lexer_state)
-            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
+            token = self.root_lexer.next_token(lexer_state, parser_state)
+            raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])


class LexerThread:
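Threading parser_state into the lexer means a lexing failure now records where the parse stood and which token came before it. A rough sketch of what this surfaces (grammar and terminal names here are illustrative):

from lark import Lark, UnexpectedInput

parser = Lark('start: "a"+', parser='lalr', lexer='contextual')
try:
    parser.parse('a!')               # '!' matches no terminal
except UnexpectedInput as u:
    print(getattr(u, 'token_history', None))  # e.g. [Token('A', 'a')]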


+0 -3   lark/parser_frontends.py   View File

@@ -179,9 +179,6 @@ class Earley_WithLexer(WithLexer):
        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

-    def make_lexer(self, text):
-        return WithLexer.make_lexer(self, text).lex(None)
-
    def match(self, term, token):
        return term.name == token.type



+10 -6   lark/parsers/earley.py   View File

@@ -146,7 +146,7 @@ class Parser:
                        column.add(new_item)
                        items.append(new_item)

-    def _parse(self, stream, columns, to_scan, start_symbol=None):
+    def _parse(self, lexer, columns, to_scan, start_symbol=None):
        def is_quasi_complete(item):
            if item.is_complete:
                return True
@@ -245,7 +245,7 @@
            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
-                raise UnexpectedToken(token, expect, considered_rules = set(to_scan))
+                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan

@@ -261,20 +261,24 @@
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
+        expects = {i.expect for i in to_scan}
        i = 0
-        for token in stream:
+        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

+            expects.clear()
+            expects |= {i.expect for i in to_scan}
+
        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

-    def parse(self, stream, start):
+    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

@@ -291,7 +295,7 @@
            else:
                columns[0].add(item)

-        to_scan = self._parse(stream, columns, to_scan, start_symbol)
+        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
@@ -299,7 +303,7 @@
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect for t in to_scan]
-            raise UnexpectedEOF(expected_terminals)
+            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
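Note the new contract: the Earley parser now pulls tokens from a lexer object rather than a plain token stream, and it hands lex() the set of terminals it can currently scan, mutating that set in place after every token. A minimal conforming lexer might look like this (a sketch; the class name is not part of lark's API):

class PrecomputedLexer:
    def __init__(self, tokens):
        self.tokens = tokens

    def lex(self, expects):
        for tok in self.tokens:
            # `expects` is refreshed by the parser between yields, so a
            # smarter lexer could choose which terminals to match next.
            yield tok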


+8 -2   lark/parsers/lalr_parser.py   View File

@@ -3,7 +3,7 @@
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from copy import deepcopy, copy
-from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..exceptions import UnexpectedInput, UnexpectedToken
from ..lexer import Token

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
@@ -62,6 +62,12 @@
    def position(self):
        return self.state_stack[-1]

+    # Necessary for match_examples() to work
+    def __eq__(self, other):
+        if not isinstance(other, ParserState):
+            return False
+        return self.position == other.position
+
    def __copy__(self):
        return type(self)(
            self.parse_conf,
@@ -86,7 +92,7 @@
            action, arg = states[state][token.type]
        except KeyError:
            expected = {s for s in states[state].keys() if s.isupper()}
-            raise UnexpectedToken(token, expected, state=state, puppet=None)
+            raise UnexpectedToken(token, expected, state=self, puppet=None)

        assert arg != end_state
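Since errors now carry the whole ParserState (the state=self change above), match_examples() needs states from independent parse runs to compare equal; __eq__ provides that by comparing the tops of the state stacks. A hedged sketch:

from lark import Lark, UnexpectedInput

p = Lark('start: "a" "b" "c"', parser='lalr')

def failing_state(text):
    try:
        p.parse(text)
    except UnexpectedInput as u:
        return u.state               # a ParserState object in this version

# Two runs fail at the same spot: distinct objects, but equal states.
assert failing_state('ab') == failing_state('ab')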



+2 -1   lark/parsers/xearley.py   View File

@@ -113,7 +113,8 @@ class Parser(BaseParser):
                del delayed_matches[i+1]  # No longer needed, so unburden memory

            if not next_set and not delayed_matches and not next_to_scan:
-                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan))
+                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
+                                           set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan



+9 -1   lark/tree_matcher.py   View File

@@ -69,6 +69,14 @@ def parse_rulename(s):
    return name, args


+class ChildrenLexer:
+    def __init__(self, children):
+        self.children = children
+
+    def lex(self, parser_state):
+        return self.children
+
+
class TreeMatcher:
    """Match the elements of a tree node, based on an ontology
    provided by a Lark grammar.
@@ -173,6 +181,6 @@
            self._parser_cache[rulename] = parser

        # find a full derivation
-        unreduced_tree = parser.parse(tree.children, rulename)
+        unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
        assert unreduced_tree.data == rulename
        return unreduced_tree

+2 -2   setup.py   View File

@@ -29,8 +29,8 @@ setup(
    description = "a modern parsing library",
    license = "MIT",
    keywords = "Earley LALR parser parsing ast",
-    url = "https://github.com/erezsh/lark",
-    download_url = "https://github.com/erezsh/lark/tarball/master",
+    url = "https://github.com/lark-parser/lark",
+    download_url = "https://github.com/lark-parser/lark/tarball/master",
    long_description='''
Lark is a modern general-purpose parsing library for Python.



+1 -0   tests/__main__.py   View File

@@ -9,6 +9,7 @@ from .test_tools import TestStandalone
from .test_cache import TestCache
from .test_grammar import TestGrammar
from .test_reconstructor import TestReconstructor
+from .test_tree_forest_transformer import TestTreeForestTransformer

try:
    from .test_nearley.test_nearley import TestNearley


+28 -3   tests/test_parser.py   View File

@@ -322,7 +322,7 @@ class TestParsers(unittest.TestCase):

    def test_alias(self):
        Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)

    def test_backwards_custom_lexer(self):
        class OldCustomLexer(Lexer):
            def __init__(self, lexer_conf):
@@ -330,12 +330,12 @@ class TestParsers(unittest.TestCase):

            def lex(self, text):
                yield Token('A', 'A')

        p = Lark("""
        start: A
        %declare A
        """, parser='lalr', lexer=OldCustomLexer)

        r = p.parse('')
        self.assertEqual(r, Tree('start', [Token('A', 'A')]))

@@ -2361,6 +2361,31 @@ def _make_parser_test(LEXER, PARSER):
            self.assertEqual(a.line, 1)
            self.assertEqual(b.line, 2)

+        @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK")
+        def test_match_examples(self):
+            p = _Lark(r"""
+                start: "a" "b" "c"
+            """)
+
+            def match_error(s):
+                try:
+                    _ = p.parse(s)
+                except UnexpectedInput as u:
+                    return u.match_examples(p.parse, {
+                        0: ['abe'],
+                        1: ['ab'],
+                        2: ['cbc', 'dbc'],
+                    })
+                assert False
+
+            assert match_error("abe") == 0
+            assert match_error("ab") == 1
+            assert match_error("bbc") == 2
+            assert match_error("cbc") == 2
+            self.assertEqual( match_error("dbc"), 2 )
+            self.assertEqual( match_error("ebc"), 2 )


        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
        def test_unicode_class(self):
            "Tests that character classes from the `regex` module work correctly."

