
Merge remote-tracking branch 'origin/v1.0' into 1.0b

Erez Sh, 3 years ago · commit 554835f19e
16 changed files with 333 additions and 104 deletions:

  1. README.md (+1 -1)
  2. docs/classes.rst (+2 -0)
  3. docs/index.rst (+1 -1)
  4. docs/visitors.rst (+5 -0)
  5. examples/advanced/python3.lark (+88 -53)
  6. examples/standalone/json_parser_main.py (+3 -1)
  7. lark/ast_utils.py (+2 -2)
  8. lark/exceptions.py (+23 -7)
  9. lark/lark.py (+7 -1)
 10. lark/lexer.py (+0 -1)
 11. lark/load_grammar.py (+123 -12)
 12. lark/parser_frontends.py (+1 -1)
 13. lark/parsers/lalr_interactive_parser.py (+1 -1)
 14. lark/utils.py (+28 -2)
 15. tests/test_grammar.py (+48 -1)
 16. tests/test_parser.py (+0 -20)

README.md (+1 -1)

@@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h


 - [Documentation @readthedocs](https://lark-parser.readthedocs.io/)
 - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf)
-- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html)
+- [Online IDE](https://lark-parser.github.io/ide)
 - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser.
 - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/)
 - [Gitter chat](https://gitter.im/lark-parser/Lobby)


docs/classes.rst (+2 -0)

@@ -66,6 +66,8 @@ UnexpectedInput


 .. autoclass:: lark.exceptions.UnexpectedCharacters

+.. autoclass:: lark.exceptions.UnexpectedEOF
+
 InteractiveParser
 -----------------




docs/index.rst (+1 -1)

@@ -113,7 +113,7 @@ Resources


 .. _Examples: https://github.com/lark-parser/lark/tree/master/examples
 .. _Third-party examples: https://github.com/ligurio/lark-grammars
-.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html
+.. _Online IDE: https://lark-parser.github.io/ide
 .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/
 .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html
 .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf


docs/visitors.rst (+5 -0)

@@ -107,3 +107,8 @@ Discard
 -------

 .. autoclass:: lark.visitors.Discard
+
+VisitError
+----------
+
+.. autoclass:: lark.exceptions.VisitError

examples/advanced/python3.lark (+88 -53)

@@ -21,7 +21,7 @@ decorators: decorator+
 decorated: decorators (classdef | funcdef | async_funcdef)

 async_funcdef: "async" funcdef
-funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite
+funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite

 parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]]
           | starparams
@@ -29,25 +29,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams

 SLASH: "/" // Otherwise it will completely disappear, and be indistinguishable in the result
 starparams: "*" typedparam? ("," paramvalue)* ["," kwparams]
-kwparams: "**" typedparam
+kwparams: "**" typedparam ","?

-?paramvalue: typedparam ["=" test]
-?typedparam: NAME [":" test]
+?paramvalue: typedparam ("=" test)?
+?typedparam: NAME (":" test)?

-varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]]
-  | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]]
-  | "**" vfpdef [","])
-
-vfpdef: NAME
+lambdef: "lambda" [lambda_params] ":" test
+lambdef_nocond: "lambda" [lambda_params] ":" test_nocond
+lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]]
+             | lambda_starparams
+             | lambda_kwparams
+?lambda_paramvalue: NAME ("=" test)?
+lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]]
+lambda_kwparams: "**" NAME ","?

 ?stmt: simple_stmt | compound_stmt
 ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE
-?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
-?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist)
-          | ("=" (yield_expr|testlist_star_expr))*)
-annassign: ":" test ["=" test]
-?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","]
-!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=")
+?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
+expr_stmt: testlist_star_expr
+assign_stmt: annassign | augassign | assign
+
+annassign: testlist_star_expr ":" test ["=" test]
+assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+
+augassign: testlist_star_expr augassign_op (yield_expr|testlist)
+!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//="
+?testlist_star_expr: test_or_star_expr
+                   | test_or_star_expr ("," test_or_star_expr)+ ","? -> tuple
+                   | test_or_star_expr "," -> tuple

 // For normal and annotated assignments, additional restrictions enforced by the interpreter
 del_stmt: "del" exprlist
 pass_stmt: "pass"
@@ -71,43 +82,52 @@ global_stmt: "global" NAME ("," NAME)*
 nonlocal_stmt: "nonlocal" NAME ("," NAME)*
 assert_stmt: "assert" test ["," test]

-compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
+?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
 async_stmt: "async" (funcdef | with_stmt | for_stmt)
-if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite]
+if_stmt: "if" test ":" suite elifs ["else" ":" suite]
+elifs: elif_*
+elif_: "elif" test ":" suite
 while_stmt: "while" test ":" suite ["else" ":" suite]
 for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
-try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite))
-with_stmt: "with" with_item ("," with_item)* ":" suite
+try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally]
+        | "try" ":" suite finally -> try_finally
+finally: "finally" ":" suite
+except_clauses: except_clause+
+except_clause: "except" [test ["as" NAME]] ":" suite
+
+with_stmt: "with" with_items ":" suite
+with_items: with_item ("," with_item)*
 with_item: test ["as" expr]
 // NB compile.c makes sure that the default except clause is last
-except_clause: "except" [test ["as" NAME]]
 suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT

-?test: or_test ("if" or_test "else" test)? | lambdef
+?test: or_test ("if" or_test "else" test)?
+     | lambdef
 ?test_nocond: or_test | lambdef_nocond
-lambdef: "lambda" [varargslist] ":" test
-lambdef_nocond: "lambda" [varargslist] ":" test_nocond

 ?or_test: and_test ("or" and_test)*
 ?and_test: not_test ("and" not_test)*
-?not_test: "not" not_test -> not
+?not_test: "not" not_test -> not_test
          | comparison
-?comparison: expr (_comp_op expr)*
+?comparison: expr (comp_op expr)*
 star_expr: "*" expr
-?expr: xor_expr ("|" xor_expr)*
+
+?expr: or_expr
+?or_expr: xor_expr ("|" xor_expr)*
 ?xor_expr: and_expr ("^" and_expr)*
 ?and_expr: shift_expr ("&" shift_expr)*
 ?shift_expr: arith_expr (_shift_op arith_expr)*
 ?arith_expr: term (_add_op term)*
 ?term: factor (_mul_op factor)*
-?factor: _factor_op factor | power
+?factor: _unary_op factor | power

-!_factor_op: "+"|"-"|"~"
+!_unary_op: "+"|"-"|"~"
 !_add_op: "+"|"-"
 !_shift_op: "<<"|">>"
 !_mul_op: "*"|"@"|"/"|"%"|"//"
 // <> isn't actually a valid comparison operator in Python. It's here for the
 // sake of a __future__ import described in PEP 401 (which really works :-)
-!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"
+!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"

 ?power: await_expr ("**" factor)?
 ?await_expr: AWAIT? atom_expr
@@ -118,61 +138,75 @@ AWAIT: "await"
      | atom_expr "." NAME -> getattr
      | atom

-?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple
-     | "[" [testlist_comp] "]" -> list
-     | "{" [dict_comp] "}" -> dict
-     | "{" set_comp "}" -> set
+?atom: "(" yield_expr ")"
+     | "(" _tuple_inner? ")" -> tuple
+     | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension
+     | "[" _testlist_comp? "]" -> list
+     | "[" comprehension{test_or_star_expr} "]" -> list_comprehension
+     | "{" _dict_exprlist? "}" -> dict
+     | "{" comprehension{key_value} "}" -> dict_comprehension
+     | "{" _set_exprlist "}" -> set
+     | "{" comprehension{test} "}" -> set_comprehension
      | NAME -> var
-     | number | string+
+     | number
+     | string_concat
      | "(" test ")"
     | "..." -> ellipsis
      | "None" -> const_none
      | "True" -> const_true
      | "False" -> const_false

-?testlist_comp: test | tuplelist_comp
-tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",")
+?string_concat: string+
+
+_testlist_comp: test | _tuple_inner
+_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",")
+
+?test_or_star_expr: test
+                  | star_expr

 ?subscriptlist: subscript
               | subscript (("," subscript)+ [","] | ",") -> subscript_tuple
-subscript: test | ([test] ":" [test] [sliceop]) -> slice
+?subscript: test | ([test] ":" [test] [sliceop]) -> slice
 sliceop: ":" [test]
-exprlist: (expr|star_expr)
-        | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple
-testlist: test | testlist_tuple
+?exprlist: (expr|star_expr)
+         | (expr|star_expr) (("," (expr|star_expr))+ [","]|",")
+?testlist: test | testlist_tuple
 testlist_tuple: test (("," test)+ [","] | ",")
-dict_comp: key_value comp_for
-         | (key_value | "**" expr) ("," (key_value | "**" expr))* [","]
+_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","]

 key_value: test ":" test

-set_comp: test comp_for
-        | (test|star_expr) ("," (test | star_expr))* [","]
+_set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","]

 classdef: "class" NAME ["(" [arguments] ")"] ":" suite


 arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])?
          | starargs
          | kwargs
-         | test comp_for
+         | comprehension{test}

-starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs]
+starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs]
+stararg: "*" test
 kwargs: "**" test

 ?argvalue: test ("=" test)?


-comp_iter: comp_for | comp_if | async_for
-async_for: "async" "for" exprlist "in" or_test [comp_iter]
-comp_for: "for" exprlist "in" or_test [comp_iter]
-comp_if: "if" test_nocond [comp_iter]
+comprehension{comp_result}: comp_result comp_fors [comp_if]
+comp_fors: comp_for+
+comp_for: [ASYNC] "for" exprlist "in" or_test
+ASYNC: "async"
+?comp_if: "if" test_nocond

 // not used in grammar, but may appear in "node" passed from Parser to Compiler
 encoding_decl: NAME

-yield_expr: "yield" [yield_arg]
-yield_arg: "from" test | testlist
+yield_expr: "yield" [testlist]
+          | "yield" "from" test -> yield_from

 number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
 string: STRING | LONG_STRING
@@ -181,6 +215,7 @@ string: STRING | LONG_STRING
 %import python (NAME, COMMENT, STRING, LONG_STRING)
 %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER)

+
 // Other terminals

 _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+

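Worth noting: the rewritten grammar leans on Lark's template rules, e.g. ``comprehension{comp_result}`` above, which is instantiated separately for each argument it is applied to. A minimal sketch of the feature outside this commit (the grammar below is illustrative, not part of the diff):

    from lark import Lark

    grammar = '''
    start: pair{NUMBER} "," pair{NAME}

    // A template rule: "item" is substituted at each use site
    pair{item}: "(" item "," item ")"

    %import common.NUMBER
    %import common.CNAME -> NAME
    %ignore " "
    '''

    # Each pair{...} use instantiates its own copy of the template rule
    print(Lark(grammar, parser='lalr').parse("(1 , 2) , (a , b)"))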

examples/standalone/json_parser_main.py (+3 -1)

@@ -10,7 +10,9 @@ Standalone Parser


 import sys

-from json_parser import Lark_StandAlone, Transformer, inline_args
+from json_parser import Lark_StandAlone, Transformer, v_args
+
+inline_args = v_args(inline=True)


 class TreeToJson(Transformer):
     @inline_args

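The generated standalone module no longer exports ``inline_args``, so the example now builds it from ``v_args``. For context, ``v_args(inline=True)`` makes transformer callbacks receive a rule's children as positional arguments; a minimal sketch of the pattern (the grammar and rule names are illustrative, not from the standalone example):

    from lark import Lark, Transformer, v_args

    inline_args = v_args(inline=True)  # callbacks get children as positional args

    grammar = """
    start: pair
    pair: WORD "=" INT
    %import common.WORD
    %import common.INT
    %ignore " "
    """

    class TreeToPair(Transformer):
        @inline_args
        def pair(self, key, value):        # instead of pair(self, children_list)
            return (str(key), int(value))

    parser = Lark(grammar, parser='lalr', transformer=TreeToPair())
    print(parser.parse("answer = 42"))     # Tree('start', [('answer', 42)])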

lark/ast_utils.py (+2 -2)

@@ -38,8 +38,8 @@ def create_transformer(ast_module: types.ModuleType, transformer: Optional[Trans
     Classes starting with an underscore (`_`) will be skipped.

     Parameters:
-        ast_module - A Python module containing all the subclasses of `ast_utils.Ast`
-        transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten.
+        ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
+        transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
     """
     t = transformer or Transformer()




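A hedged sketch of how ``create_transformer`` is meant to be used, following the corrected parameter docs (the module layout, grammar, and class names here are illustrative):

    import sys
    from dataclasses import dataclass
    from lark import Lark, ast_utils, Transformer

    this_module = sys.modules[__name__]

    class _Ast(ast_utils.Ast):        # underscore prefix: skipped by create_transformer
        pass

    @dataclass
    class Assign(_Ast):               # CamelCase -> handles the 'assign' rule
        name: str
        value: int

    class ToAst(Transformer):         # the "initial transformer" from the docstring
        def NAME(self, tok):
            return str(tok)
        def INT(self, tok):
            return int(tok)

    parser = Lark("""
    start: assign
    assign: NAME "=" INT
    %import common.CNAME -> NAME
    %import common.INT
    %ignore " "
    """, parser='lalr')

    transformer = ast_utils.create_transformer(this_module, ToAst())
    print(transformer.transform(parser.parse("x = 1")))  # Tree('start', [Assign(name='x', value=1)])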
lark/exceptions.py (+23 -7)

@@ -40,8 +40,9 @@ class UnexpectedInput(LarkError):

     Used as a base class for the following exceptions:

-    - ``UnexpectedToken``: The parser received an unexpected token
     - ``UnexpectedCharacters``: The lexer encountered an unexpected string
+    - ``UnexpectedToken``: The parser received an unexpected token
+    - ``UnexpectedEOF``: The parser expected a token, but the input ended

     After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
     """
@@ -135,7 +136,8 @@ class UnexpectedInput(LarkError):


 class UnexpectedEOF(ParseError, UnexpectedInput):
+    """An exception that is raised by the parser, when the input ends while it still expects a token.
+    """
     expected: 'List[Token]'

     def __init__(self, expected, state=None, terminals_by_name=None):
@@ -158,6 +160,9 @@ class UnexpectedEOF(ParseError, UnexpectedInput):


 class UnexpectedCharacters(LexError, UnexpectedInput):
+    """An exception that is raised by the lexer, when it cannot match the next
+    string of characters to any of its terminals.
+    """

     allowed: Set[str]
     considered_tokens: Set[Any]
@@ -199,10 +204,15 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     """An exception that is raised by the parser, when the token it received
     doesn't match any valid step forward.

-    The parser provides an interactive instance through `interactive_parser`,
-    which is initialized to the point of failure, and can be used for debugging and error handling.
-
-    see: ``InteractiveParser``.
+    Parameters:
+        token: The mismatched token
+        expected: The set of expected tokens
+        considered_rules: Which rules were considered, to deduce the expected tokens
+        state: A value representing the parser state. Do not rely on its value or type.
+        interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure,
+                            and can be used for debugging and error handling.
+
+    Note: These parameters are available as attributes of the instance.
     """

     expected: Set[str]
@@ -247,8 +257,13 @@ class VisitError(LarkError):
     """VisitError is raised when visitors are interrupted by an exception

     It provides the following attributes for inspection:
-    - obj: the tree node or token it was processing when the exception was raised
-    - orig_exc: the exception that caused it to fail
+
+    Parameters:
+        rule: the name of the visit rule that failed
+        obj: the tree-node or token that was being processed
+        orig_exc: the exception that caused it to fail
+
+    Note: These parameters are available as attributes
     """

     obj: 'Union[Tree, Token]'
@@ -258,6 +273,7 @@ class VisitError(LarkError):
         message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
         super(VisitError, self).__init__(message)

+        self.rule = rule
         self.obj = obj
         self.orig_exc = orig_exc




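Taken together: ``UnexpectedInput`` is the common base, and the three subclasses distinguish lexer-side from parser-side failures. A usage sketch (the grammar is illustrative; note that whether an early end of input surfaces as ``UnexpectedEOF`` or as an ``UnexpectedToken`` for the ``$END`` token depends on the parser configuration):

    from lark import Lark
    from lark.exceptions import (UnexpectedCharacters, UnexpectedEOF,
                                 UnexpectedToken, UnexpectedInput)

    parser = Lark('start: "a" "b"', parser='lalr')

    for text in ("xb", "aa", "a"):
        try:
            parser.parse(text)
        except UnexpectedCharacters as e:  # lexer: no terminal matches
            print("bad character at column", e.column)
        except UnexpectedEOF:              # parser: input ended too early
            print("unexpected end of input")
        except UnexpectedToken as e:       # parser: token out of place
            print("unexpected token, expected one of", e.expected)
        except UnexpectedInput:            # catch-all base, for completeness
            print("parse failed")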
lark/lark.py (+7 -1)

@@ -137,7 +137,7 @@ class LarkOptions(Serialize):
     A List of either paths or loader functions to specify from where grammars are imported
 source_path
     Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading
-    **=== End Options ===**
+    **=== End of Options ===**
     """
     if __doc__:
         __doc__ += OPTIONS_DOC
@@ -560,6 +560,8 @@ class Lark(Serialize):
         """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'

         When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
+
+        :raises UnexpectedCharacters: In case the lexer cannot find a suitable match.
         """
         if not hasattr(self, 'lexer') or dont_ignore:
             lexer = self._build_lexer(dont_ignore)
@@ -602,6 +604,10 @@ class Lark(Serialize):
         If a transformer is supplied to ``__init__``, returns whatever is the
         result of the transformation. Otherwise, returns a Tree instance.

+        :raises UnexpectedInput: On a parse error, one of these sub-exceptions will arise:
+                ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``.
+                For convenience, these sub-exceptions also inherit from ``ParseError`` and ``LexError``.
+
         """
         return self.parser.parse(text, start=start, on_error=on_error)




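The new ``:raises`` notes can be exercised directly. A small sketch (the grammar is illustrative; ``lexer='standard'`` matches the docstring's caveat on ``lex``):

    from lark import Lark
    from lark.exceptions import UnexpectedCharacters

    grammar = """
    start: (WORD | INT)+
    %import common.WORD
    %import common.INT
    %ignore " "
    """

    l = Lark(grammar, parser='lalr', lexer='standard')

    print([tok.type for tok in l.lex("hello 42")])  # ['WORD', 'INT']

    try:
        list(l.lex("hello !"))  # '!' matches no terminal; lex() is lazy, so force it
    except UnexpectedCharacters as e:
        print("lexer failed at line %s, column %s" % (e.line, e.column))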
lark/lexer.py (+0 -1)

@@ -281,7 +281,6 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     return new_terminals, callback


-
 class Scanner:
     def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
         self.terminals = terminals


lark/load_grammar.py (+123 -12)

@@ -10,7 +10,7 @@ from numbers import Integral
 from contextlib import suppress
 from typing import List, Tuple, Union, Callable, Dict, Optional

-from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique
+from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors
 from .lexer import Token, TerminalDef, PatternStr, PatternRE

 from .parse_tree_builder import ParseTreeBuilder
@@ -176,27 +176,136 @@ RULES = {
 }


+# Value 5 keeps the number of states in the lalr parser somewhat minimal
+# It isn't optimal, but close to it. See PR #949
+SMALL_FACTOR_THRESHOLD = 5
+# The threshold above which repeats via ~ are split up into different rules
+# 50 is chosen since it keeps the number of states low, and therefore lalr analysis time low,
+# while not being too overaggressive and unnecessarily creating rules that might create shift/reduce conflicts.
+# (See PR #949)
+REPEAT_BREAK_THRESHOLD = 50
+
+
 @inline_args
 class EBNF_to_BNF(Transformer_InPlace):
     def __init__(self):
         self.new_rules = []
-        self.rules_by_expr = {}
+        self.rules_cache = {}
         self.prefix = 'anon'
         self.i = 0
         self.rule_options = None

-    def _add_recurse_rule(self, type_, expr):
-        if expr in self.rules_by_expr:
-            return self.rules_by_expr[expr]
-
-        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
+    def _name_rule(self, inner):
+        new_name = '__%s_%s_%d' % (self.prefix, inner, self.i)
         self.i += 1
-        t = NonTerminal(new_name)
-        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
-        self.new_rules.append((new_name, tree, self.rule_options))
-        self.rules_by_expr[expr] = t
-        return t
+        return new_name
+
+    def _add_rule(self, key, name, expansions):
+        t = NonTerminal(name)
+        self.new_rules.append((name, expansions, self.rule_options))
+        self.rules_cache[key] = t
+        return t
+
+    def _add_recurse_rule(self, type_, expr):
+        try:
+            return self.rules_cache[expr]
+        except KeyError:
+            new_name = self._name_rule(type_)
+            t = NonTerminal(new_name)
+            tree = ST('expansions', [
+                ST('expansion', [expr]),
+                ST('expansion', [t, expr])
+            ])
+            return self._add_rule(expr, new_name, tree)
+
+    def _add_repeat_rule(self, a, b, target, atom):
+        """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times.
+
+        When called recursively (into target), it repeats atom for x(n) times, where:
+            x(0) = 1
+            x(n) = a(n) * x(n-1) + b
+
+        Example rule when a=3, b=4:
+
+            new_rule: target target target atom atom atom atom
+        """
+        key = (a, b, target, atom)
+        try:
+            return self.rules_cache[key]
+        except KeyError:
+            new_name = self._name_rule('repeat_a%d_b%d' % (a, b))
+            tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)])
+            return self._add_rule(key, new_name, tree)
+
+    def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
+        """Creates a rule that matches atom 0 to (a*n+b)-1 times,
+        given that target matches atom exactly n times, and target_opt matches atom 0 to n-1 times.
+
+        First we generate target * i followed by target_opt, for i from 0 to a-1.
+        These match 0 to n*a - 1 times atom.
+
+        Then we generate target * a followed by atom * i, for i from 0 to b-1.
+        These match n*a to n*a + b-1 times atom.
+
+        The created rule will not have any shift/reduce conflicts, so that it can be used with lalr.
+
+        Example rule when a=3, b=4:
+
+            new_rule: target_opt
+                    | target target_opt
+                    | target target target_opt
+                    | target target target
+                    | target target target atom
+                    | target target target atom atom
+                    | target target target atom atom atom
+        """
+        key = (a, b, target, atom, "opt")
+        try:
+            return self.rules_cache[key]
+        except KeyError:
+            new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
+            tree = ST('expansions', [
+                ST('expansion', [target]*i + [target_opt]) for i in range(a)
+            ] + [
+                ST('expansion', [target]*a + [atom]*i) for i in range(b)
+            ])
+            return self._add_rule(key, new_name, tree)
+
+    def _generate_repeats(self, rule, mn, mx):
+        """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times.
+        """
+        # For a small number of repeats, we can take the naive approach
+        if mx < REPEAT_BREAK_THRESHOLD:
+            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
+
+        # For large repeat values, we break the repetition into sub-rules.
+        # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``.
+        # We then use small_factors to split mn and diff up into values [(a, b), ...]
+        # These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
+        # to generate a complete rule/expression that matches the corresponding number of repeats.
+        mn_target = rule
+        for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD):
+            mn_target = self._add_repeat_rule(a, b, mn_target, rule)
+        if mx == mn:
+            return mn_target
+
+        diff = mx - mn + 1  # We add one because _add_repeat_opt_rule generates rules that match one less
+        diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD)
+        diff_target = rule  # Match rule 1 time
+        diff_opt_target = ST('expansion', [])  # Match rule 0 times (i.e. up to 1 - 1 times)
+        for a, b in diff_factors[:-1]:
+            diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
+            diff_target = self._add_repeat_rule(a, b, diff_target, rule)
+
+        a, b = diff_factors[-1]
+        diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
+
+        return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])
+
     def expr(self, rule, op, *args):
         if op.value == '?':
             empty = ST('expansion', [])
@@ -221,7 +330,9 @@ class EBNF_to_BNF(Transformer_InPlace):
             mn, mx = map(int, args)
             if mx < mn or mn < 0:
                 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
-            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
+
+            return self._generate_repeats(rule, mn, mx)
+
         assert False, op

     def maybe(self, rule):


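To see the two thresholds in action: below ``REPEAT_BREAK_THRESHOLD`` a repeat expands naively into one alternative per count, while above it the ``__..._repeat_a*_b*`` helper rules are generated instead. A sketch (rule counts are indicative, not exact):

    from lark import Lark

    # Under REPEAT_BREAK_THRESHOLD (50), the repeat expands naively:
    small = Lark('start: "A"~3..5', parser='lalr')
    print(len(small.rules))  # 3: one alternative of 'start' per repeat count

    # Over the threshold, helper rules are generated via small_factors():
    large = Lark('start: "A"~200', parser='lalr')
    helpers = {str(r.origin.name) for r in large.rules if 'repeat' in str(r.origin.name)}
    print(sorted(helpers))   # a few __start_repeat_a*_b*_* rules, not a 200-symbol expansion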
lark/parser_frontends.py (+1 -1)

@@ -238,4 +238,4 @@ class MakeParsingFrontend(MakeParsingFrontend):
         assert isinstance(parser_conf, ParserConf)
         parser_conf.parser_type = self.parser_type
         lexer_conf.lexer_type = self.lexer_type
-        return ParsingFrontend(lexer_conf, parser_conf, options)
\ No newline at end of file
+        return ParsingFrontend(lexer_conf, parser_conf, options)

lark/parsers/lalr_interactive_parser.py (+1 -1)

@@ -65,7 +65,7 @@ class InteractiveParser(object):
"""Print the output of ``choices()`` in a way that's easier to read.""" """Print the output of ``choices()`` in a way that's easier to read."""
out = ["Parser choices:"] out = ["Parser choices:"]
for k, v in self.choices().items(): for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('\t- %s -> %r' % (k, v))
out.append('stack size: %s' % len(self.parser_state.state_stack)) out.append('stack size: %s' % len(self.parser_state.state_stack))
return '\n'.join(out) return '\n'.join(out)



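The ``%r`` tweak quotes token values in the printout. For context, a sketch of how an ``InteractiveParser`` instance is usually obtained (the grammar is illustrative):

    from lark import Lark
    from lark.exceptions import UnexpectedToken

    parser = Lark('start: "a" "b"', parser='lalr')
    try:
        parser.parse("aa")
    except UnexpectedToken as e:
        ip = e.interactive_parser.copy()  # snapshot at the failure point
        print(ip.pretty())                # uses the %r formatting fixed above
        print(ip.accepts())               # token types that could proceed from here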

lark/utils.py (+28 -2)

@@ -163,7 +163,7 @@ def get_regexp_width(expr):
             return 1, sre_constants.MAXREPEAT
         else:
             return 0, sre_constants.MAXREPEAT
-###}
+###}




@@ -245,7 +245,7 @@ except ImportError:


 class FS:
-    exists = os.path.exists
+    exists = staticmethod(os.path.exists)
     @staticmethod
     def open(name, mode="r", **kwargs):
         if atomicwrites and "w" in mode:
@@ -316,3 +316,29 @@ def _serialize(value, memo):
         return {key:_serialize(elem, memo) for key, elem in value.items()}
     # assert value is None or isinstance(value, (int, float, str, tuple)), value
     return value
+
+
+def small_factors(n, max_factor):
+    """
+    Splits n up into smaller factors and summands <= max_factor.
+    Returns a list of [(a, b), ...]
+    so that the following code returns n:
+
+        n = 1
+        for a, b in values:
+            n = n * a + b
+
+    Currently, we also keep a + b <= max_factor, but that might change
+    """
+    assert n >= 0
+    assert max_factor > 2
+    if n <= max_factor:
+        return [(n, 0)]
+
+    for a in range(max_factor, 1, -1):
+        r, b = divmod(n, a)
+        if a + b <= max_factor:
+            return small_factors(r, max_factor) + [(a, b)]
+    assert False, "Failed to factorize %s" % n

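A quick check of the invariant promised in the docstring above; the factor list shown was worked out by hand for this threshold and should be treated as indicative:

    from lark.utils import small_factors

    factors = small_factors(8191, 5)  # 8191: the Mersenne prime used in the tests below
    print(factors)  # [(3, 0), (4, 0), (5, 0), (3, 2), (3, 0), (5, 0), (3, 1)]

    n = 1
    for a, b in factors:
        n = n * a + b
    assert n == 8191                            # the reconstruction invariant holds
    assert all(a + b <= 5 for a, b in factors)  # every step stays small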
tests/test_grammar.py (+48 -1)

@@ -3,7 +3,7 @@ from __future__ import absolute_import
 import sys
 from unittest import TestCase, main

-from lark import Lark, Token, Tree
+from lark import Lark, Token, Tree, ParseError, UnexpectedInput
 from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors
 from lark.load_grammar import FromPackageLoader


@@ -198,6 +198,53 @@ class TestGrammar(TestCase):
         x = find_grammar_errors(text)
         assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6]

+    def test_ranged_repeat_terms(self):
+        g = u"""!start: AAA
+                    AAA: "A"~3
+                """
+        l = Lark(g, parser='lalr')
+        self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
+
+        g = u"""!start: AABB CC
+                    AABB: "A"~0..2 "B"~2
+                    CC: "C"~1..2
+                """
+        l = Lark(g, parser='lalr')
+        self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
+        self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
+        self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
+
+    def test_ranged_repeat_large(self):
+        g = u"""!start: "A"~60
+                """
+        l = Lark(g, parser='lalr')
+        self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated")
+        self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60))
+        self.assertRaises(ParseError, l.parse, u'A' * 59)
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61)
+
+        g = u"""!start: "A"~15..100
+                """
+        l = Lark(g, parser='lalr')
+        for i in range(0, 110):
+            if 15 <= i <= 100:
+                self.assertEqual(l.parse(u'A' * i), Tree('start', ['A'] * i))
+            else:
+                self.assertRaises(UnexpectedInput, l.parse, u'A' * i)
+
+        # 8191 is a Mersenne prime
+        g = u"""start: "A"~8191
+                """
+        l = Lark(g, parser='lalr')
+        self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
+        self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190)
+        self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192)




if __name__ == '__main__': if __name__ == '__main__':


tests/test_parser.py (+0 -20)

@@ -2203,27 +2203,7 @@ def _make_parser_test(LEXER, PARSER):
         self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')


-    def test_ranged_repeat_terms(self):
-        g = u"""!start: AAA
-                    AAA: "A"~3
-                """
-        l = _Lark(g)
-        self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
-
-        g = u"""!start: AABB CC
-                    AABB: "A"~0..2 "B"~2
-                    CC: "C"~1..2
-                """
-        l = _Lark(g)
-        self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
-        self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
-        self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
-
     @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
     def test_priority_vs_embedded(self):

