@@ -66,6 +66,8 @@ UnexpectedInput | |||
.. autoclass:: lark.exceptions.UnexpectedCharacters | |||
.. autoclass:: lark.exceptions.UnexpectedEOF | |||
InteractiveParser | |||
----------------- | |||
@@ -107,3 +107,8 @@ Discard | |||
------- | |||
.. autoclass:: lark.visitors.Discard | |||
VisitError | |||
---------- | |||
.. autoclass:: lark.exceptions.VisitError |
@@ -36,8 +36,8 @@ def create_transformer(ast_module, transformer=None): | |||
Classes starting with an underscore (`_`) will be skipped. | |||
Parameters: | |||
ast_module - A Python module containing all the subclasses of `ast_utils.Ast` | |||
transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten. | |||
ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` | |||
transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. | |||
""" | |||
t = transformer or Transformer() | |||
@@ -36,8 +36,9 @@ class UnexpectedInput(LarkError): | |||
Used as a base class for the following exceptions: | |||
- ``UnexpectedToken``: The parser received an unexpected token | |||
- ``UnexpectedCharacters``: The lexer encountered an unexpected string | |||
- ``UnexpectedToken``: The parser received an unexpected token | |||
- ``UnexpectedEOF``: The parser expected a token, but the input ended | |||
After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | |||
""" | |||
@@ -128,6 +129,9 @@ class UnexpectedInput(LarkError): | |||
class UnexpectedEOF(ParseError, UnexpectedInput): | |||
"""An exception that is raised by the parser, when the input ends while it still expects a token. | |||
""" | |||
def __init__(self, expected, state=None, terminals_by_name=None): | |||
super(UnexpectedEOF, self).__init__() | |||
@@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput): | |||
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
"""An exception that is raised by the lexer, when it cannot match the next | |||
string of characters to any of its terminals. | |||
""" | |||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, | |||
terminals_by_name=None, considered_rules=None): | |||
super(UnexpectedCharacters, self).__init__() | |||
@@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||
"""An exception that is raised by the parser, when the token it received | |||
doesn't match any valid step forward. | |||
The parser provides an interactive instance through `interactive_parser`, | |||
which is initialized to the point of failure, and can be used for debugging and error handling. | |||
Parameters: | |||
token: The mismatched token | |||
expected: The set of expected tokens | |||
considered_rules: Which rules were considered, to deduce the expected tokens | |||
state: A value representing the parser state. Do not rely on its value or type. | |||
interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure, | |||
and can be used for debugging and error handling. | |||
see: ``InteractiveParser``. | |||
Note: These parameters are available as attributes of the instance. | |||
""" | |||
def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): | |||
@@ -234,14 +247,20 @@ class VisitError(LarkError): | |||
"""VisitError is raised when visitors are interrupted by an exception | |||
It provides the following attributes for inspection: | |||
- obj: the tree node or token it was processing when the exception was raised | |||
- orig_exc: the exception that caused it to fail | |||
Parameters: | |||
rule: the name of the visit rule that failed | |||
obj: the tree-node or token that was being processed | |||
orig_exc: the exception that caused it to fail | |||
Note: These parameters are available as attributes | |||
""" | |||
def __init__(self, rule, obj, orig_exc): | |||
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | |||
super(VisitError, self).__init__(message) | |||
self.rule = rule | |||
self.obj = obj | |||
self.orig_exc = orig_exc | |||
@@ -102,7 +102,7 @@ class LarkOptions(Serialize): | |||
A List of either paths or loader functions to specify from where grammars are imported | |||
source_path | |||
Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading | |||
**=== End Options ===** | |||
**=== End of Options ===** | |||
""" | |||
if __doc__: | |||
__doc__ += OPTIONS_DOC | |||
@@ -527,6 +527,8 @@ class Lark(Serialize): | |||
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' | |||
When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. | |||
:raises UnexpectedCharacters: In case the lexer cannot find a suitable match. | |||
""" | |||
if not hasattr(self, 'lexer') or dont_ignore: | |||
lexer = self._build_lexer(dont_ignore) | |||
@@ -569,6 +571,10 @@ class Lark(Serialize): | |||
If a transformer is supplied to ``__init__``, returns whatever is the | |||
result of the transformation. Otherwise, returns a Tree instance. | |||
:raises UnexpectedInput: On a parse error, one of these sub-exceptions will be raised: | |||
``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. | |||
For convenience, these sub-exceptions also inherit from ``ParseError`` and ``LexError``. | |||
""" | |||
return self.parser.parse(text, start=start, on_error=on_error) | |||
@@ -9,7 +9,7 @@ import pkgutil | |||
from ast import literal_eval | |||
from numbers import Integral | |||
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique | |||
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors | |||
from .lexer import Token, TerminalDef, PatternStr, PatternRE | |||
from .parse_tree_builder import ParseTreeBuilder | |||
@@ -174,28 +174,141 @@ RULES = { | |||
'literal': ['REGEXP', 'STRING'], | |||
} | |||
REPEAT_BREAK_THRESHOLD = 50 | |||
# The threshold above which repeats via ~ are split up into different rules | |||
# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, | |||
# while not being too overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. | |||
# For a grammar of the form start: "A"~0..N, these are the timing stats: | |||
# N t | |||
# 10 0.000 | |||
# 20 0.004 | |||
# 30 0.016 | |||
# 40 0.049 | |||
# 50 0.109 | |||
# 60 0.215 | |||
# 70 0.383 | |||
# 80 0.631 | |||
# (See PR #949) | |||
@inline_args | |||
class EBNF_to_BNF(Transformer_InPlace): | |||
def __init__(self): | |||
self.new_rules = [] | |||
self.rules_by_expr = {} | |||
self.rules_cache = {} | |||
self.prefix = 'anon' | |||
self.i = 0 | |||
self.rule_options = None | |||
def _add_recurse_rule(self, type_, expr): | |||
if expr in self.rules_by_expr: | |||
return self.rules_by_expr[expr] | |||
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||
def _name_rule(self, inner): | |||
new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) | |||
self.i += 1 | |||
t = NonTerminal(new_name) | |||
tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) | |||
self.new_rules.append((new_name, tree, self.rule_options)) | |||
self.rules_by_expr[expr] = t | |||
return new_name | |||
def _add_rule(self, key, name, expansions): | |||
t = NonTerminal(name) | |||
self.new_rules.append((name, expansions, self.rule_options)) | |||
self.rules_cache[key] = t | |||
return t | |||
def _add_recurse_rule(self, type_, expr): | |||
try: | |||
return self.rules_cache[expr] | |||
except KeyError: | |||
new_name = self._name_rule(type_) | |||
t = NonTerminal(new_name) | |||
tree = ST('expansions', [ | |||
ST('expansion', [expr]), | |||
ST('expansion', [t, expr]) | |||
]) | |||
return self._add_rule(expr, new_name, tree) | |||
def _add_repeat_rule(self, a, b, target, atom): | |||
""" | |||
When target matches n times atom | |||
This builds a rule that matches atom (a*n + b) times | |||
The rule is of the form: | |||
The rules are of the form: (Example a = 3, b = 4) | |||
new_rule: target target target atom atom atom atom | |||
i.e. we use target a times and atom b times | |||
""" | |||
key = (a, b, target, atom) | |||
try: | |||
return self.rules_cache[key] | |||
except KeyError: | |||
new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) | |||
tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) | |||
return self._add_rule(key, new_name, tree) | |||
def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): | |||
""" | |||
When target matches atom n times, and target_opt matches atom 0 to n-1 times, | |||
This builds a rule that matches atom 0 to (a*n+b)-1 times. | |||
The created rule will not have any shift/reduce conflicts so that it can be used with lalr | |||
The rules are of the form: (Example a = 3, b = 4) | |||
new_rule: target_opt | |||
| target target_opt | |||
| target target target_opt | |||
| target target target | |||
| target target target atom | |||
| target target target atom atom | |||
| target target target atom atom atom | |||
First we generate target * i followed by target_opt for i from 0 to a-1 | |||
These match 0 to n*a - 1 times atom | |||
Then we generate target * a followed by atom * i for i from 0 to b-1 | |||
These match n*a to n*a + b-1 times atom | |||
""" | |||
key = (a, b, target, atom, "opt") | |||
try: | |||
return self.rules_cache[key] | |||
except KeyError: | |||
new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) | |||
tree = ST('expansions', [ | |||
ST('expansion', [target] * i + [target_opt]) | |||
for i in range(a) | |||
] + [ | |||
ST('expansion', [target] * a + [atom] * i) | |||
for i in range(b) | |||
]) | |||
return self._add_rule(key, new_name, tree) | |||
def _generate_repeats(self, rule, mn, mx): | |||
""" | |||
We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). | |||
We then use small_factors to split mn and diff up into values [(a, b), ...] | |||
These values are used with the help of _add_repeat_rule and _add_repeat_rule_opt | |||
to generate a complete rule/expression that matches the corresponding number of repeats | |||
""" | |||
mn_factors = small_factors(mn) | |||
mn_target = rule | |||
for a, b in mn_factors: | |||
mn_target = self._add_repeat_rule(a, b, mn_target, rule) | |||
if mx == mn: | |||
return mn_target | |||
diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less | |||
diff_factors = small_factors(diff) | |||
diff_target = rule # Match rule 1 times | |||
diff_opt_target = ST('expansion', []) # match rule 0 times (i.e. up to 1 - 1 times) | |||
for a, b in diff_factors[:-1]: | |||
new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) | |||
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) | |||
diff_target = new_diff_target | |||
a, b = diff_factors[-1] # We do the last one separately since we don't need to call self._add_repeat_rule | |||
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) | |||
return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) | |||
def expr(self, rule, op, *args): | |||
if op.value == '?': | |||
empty = ST('expansion', []) | |||
@@ -220,7 +333,11 @@ class EBNF_to_BNF(Transformer_InPlace): | |||
mn, mx = map(int, args) | |||
if mx < mn or mn < 0: | |||
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) | |||
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) | |||
# For small number of repeats, we don't need to build new rules. | |||
if mx > REPEAT_BREAK_THRESHOLD: | |||
return self._generate_repeats(rule, mn, mx) | |||
else: | |||
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) | |||
assert False, op | |||
def maybe(self, rule): | |||
@@ -187,7 +187,7 @@ def get_regexp_width(expr): | |||
return 1, sre_constants.MAXREPEAT | |||
else: | |||
return 0, sre_constants.MAXREPEAT | |||
###} | |||
@@ -288,7 +288,7 @@ except ImportError: | |||
class FS: | |||
exists = os.path.exists | |||
@staticmethod | |||
def open(name, mode="r", **kwargs): | |||
if atomicwrites and "w" in mode: | |||
@@ -359,3 +359,32 @@ def _serialize(value, memo): | |||
return {key:_serialize(elem, memo) for key, elem in value.items()} | |||
# assert value is None or isinstance(value, (int, float, str, tuple)), value | |||
return value | |||
# Value 5 keeps the number of states in the lalr parser somewhat minimal | |||
# It isn't optimal, but close to it. See PR #949 | |||
SMALL_FACTOR_THRESHOLD = 5 | |||
def small_factors(n): | |||
""" | |||
Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. | |||
Returns a list of [(a, b), ...] | |||
so that the following code returns n: | |||
n = 1 | |||
for a, b in values: | |||
n = n * a + b | |||
Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change | |||
""" | |||
assert n >= 0 | |||
if n <= SMALL_FACTOR_THRESHOLD: | |||
return [(n, 0)] | |||
# While this does not provide an optimal solution, it produces a pretty good one. | |||
# See above comment and PR #949 | |||
for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): | |||
r, b = divmod(n, a) | |||
if a + b <= SMALL_FACTOR_THRESHOLD: | |||
return small_factors(r) + [(a, b)] | |||
assert False, "Failed to factorize %s" % n |
@@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER): | |||
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') | |||
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') | |||
@unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") | |||
def test_ranged_repeat_large(self): | |||
# Large is currently arbitrarily chosen to be larger than 20 | |||
g = u"""!start: "A"~60 | |||
""" | |||
l = _Lark(g) | |||
self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") | |||
self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) | |||
self.assertRaises(ParseError, l.parse, u'A' * 59) | |||
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) | |||
g = u"""!start: "A"~15..100 | |||
""" | |||
l = _Lark(g) | |||
for i in range(0, 110): | |||
if 15 <= i <= 100: | |||
self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) | |||
else: | |||
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) | |||
# 8191 is a Mersenne prime | |||
g = u"""start: "A"~8191 | |||
""" | |||
l = _Lark(g) | |||
self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) | |||
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) | |||
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) | |||
@unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX | |||
def test_priority_vs_embedded(self): | |||
g = """ | |||