@@ -45,6 +45,12 @@ Literals can be one of:
 * `/re with flags/imulx`
 * Literal range: `"a".."z"`, `"1".."9"`, etc.
+### Priority
+Terminals can be assigned a priority only when using a lexer (future versions may support Earley's dynamic lexing).
+Priority can be either positive or negative. If not specified for a terminal, it's assumed to be 1 (i.e. the default).
+#### Notes for when using a lexer:
+When using a lexer (standard or contextual), it is the grammar author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in order according to the following criteria:
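As a usage sketch of the terminal-priority syntax documented above (the `TERM.priority` notation also appears in the tests at the end of this diff), with an illustrative grammar of my own; with a standard lexer, the higher-priority literal wins when literals collide:

```python
from lark import Lark

# Illustrative grammar, not from the diff: "ab" could tokenize as AB,
# or as A followed by B. Raising AB's priority makes the lexer prefer it.
parser = Lark(r"""
    start: (A | B | AB)+
    A: "a"
    B: "b"
    AB.2: "ab"
""", parser='earley', lexer='standard')

print(parser.parse("ab").children)  # expect a single 'ab' token
```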
@@ -90,7 +96,7 @@ Each item is one of:
 * `item*` - Zero or more instances of item
 * `item+` - One or more instances of item
 * `item ~ n` - Exactly *n* instances of item
-* `item ~ n..m` - Between *n* to *m* instances of item
+* `item ~ n..m` - Between *n* and *m* instances of item (not recommended for wide ranges, due to performance issues)
 **Examples:**
 ```perl
@@ -102,6 +108,11 @@ expr: expr operator expr
 four_words: word ~ 4
 ```
+### Priority
+Rules can be assigned a priority only when using Earley (future versions may support LALR as well).
+Priority can be either positive or negative. If not specified for a rule, it's assumed to be 1 (i.e. the default).
 ## Directives
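A hedged sketch of rule priority in action, assuming Earley's default ambiguity handling prefers the higher-priority derivation (grammar and expected output are illustrative):

```python
from lark import Lark

# "ab" is ambiguous: a single "ab", or "a" followed by "b".
parser = Lark(r"""
    start: pair | whole
    pair: "a" "b"
    whole.2: "ab"
""", parser='earley')

tree = parser.parse("ab")
print(tree.children[0].data)  # expect 'whole', the higher-priority rule
```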
@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser
 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitely using `lexer='dynamic'`.
-It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, which tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
 **SPPF & Ambiguity resolution**
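A minimal sketch of the two configurations described above (grammar is illustrative; the `parser`/`lexer` options are Lark's documented constructor arguments):

```python
from lark import Lark

grammar = r"""
    start: WORD+
    WORD: /\w+/
    %ignore " "
"""

# Default for Earley is the dynamic lexer; it can also be requested
# explicitly, letting Earley's parse state drive how regexps match.
dynamic_parser = Lark(grammar, parser='earley', lexer='dynamic')

# Bypass dynamic lexing: tokenize as an independent first step.
# Faster, but tokenization happens without Earley's ambiguity resolution.
standard_parser = Lark(grammar, parser='earley', lexer='standard')

print(dynamic_parser.parse("hello world").children)
```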
@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
 from .lexer import Token
 from .lark import Lark
-__version__ = "0.7.1"
+__version__ = "0.7.2"
@@ -205,6 +205,8 @@ class Lark(Serialize):
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
         self._terminals_dict = {t.name:t for t in self.terminals}
+        # If the user asked to invert the priorities, negate them all here.
+        # This replaces the old 'resolve__antiscore_sum' option.
         if self.options.priority == 'invert':
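A hedged usage sketch of the option handled above, assuming `priority='invert'` is accepted by the Lark constructor as the hunk's comment suggests (grammar is illustrative):

```python
from lark import Lark

grammar = r"""
    start: pair | whole
    pair: "a" "b"
    whole.2: "ab"
"""

# Inverting negates every priority, so the normally-losing derivation
# should win ambiguity resolution instead.
parser = Lark(grammar, parser='earley', priority='invert')
print(parser.parse("ab").children[0].data)  # expect 'pair'
```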
@@ -290,6 +292,10 @@ class Lark(Serialize):
             return self.options.postlex.process(stream)
         return stream
 
+    def get_terminal(self, name):
+        "Get information about a terminal"
+        return self._terminals_dict[name]
+
     def parse(self, text, start=None):
         """Parse the given text, according to the options provided.
@@ -41,6 +41,8 @@ class Pattern(Serialize):
 class PatternStr(Pattern):
+    type = "str"
+
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
@@ -50,15 +52,23 @@ class PatternStr(Pattern):
         max_width = min_width
 
 class PatternRE(Pattern):
+    type = "re"
+
     def to_regexp(self):
         return self._get_flags(self.value)
 
+    _width = None
+    def _get_width(self):
+        if self._width is None:
+            self._width = get_regexp_width(self.to_regexp())
+        return self._width
+
     @property
     def min_width(self):
-        return get_regexp_width(self.to_regexp())[0]
+        return self._get_width()[0]
     @property
     def max_width(self):
-        return get_regexp_width(self.to_regexp())[1]
+        return self._get_width()[1]
 
 class TerminalDef(Serialize):
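The change above caches the regexp-width computation in `_width`, since `min_width`/`max_width` previously re-derived it on every access. A small sketch of the utility being memoized, assuming `get_regexp_width` is importable from `lark.utils` (where it lives in this tree):

```python
from lark.utils import get_regexp_width

# Returns the (min, max) possible match lengths of a regexp; re-parsing
# the pattern on every access is what the new _width cache avoids.
print(get_regexp_width(r"ab?"))  # expect (1, 2)
```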
@@ -88,7 +98,7 @@ class Token(Str):
         self.type = type_
         self.pos_in_stream = pos_in_stream
-        self.value = value
+        self.value = Str(value)
         self.line = line
         self.column = column
         self.end_line = end_line
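A hedged illustration of the apparent intent of the `Str(value)` coercion: a Token passed as `value` should no longer leave a nested Token in `.value` (this assumes `Str` is lark's plain text type, as defined in `lark.utils`):

```python
from lark import Token

inner = Token('WORD', 'hello')
outer = Token('WORD', inner)  # value happens to be another Token

# With the coercion, .value is a plain string, not a nested Token.
assert outer.value == 'hello'
assert not isinstance(outer.value, Token)
```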
@@ -90,7 +90,7 @@ TERMINALS = {
     '_IGNORE': r'%ignore',
     '_DECLARE': r'%declare',
     '_IMPORT': r'%import',
-    'NUMBER': r'\d+',
+    'NUMBER': r'[+-]?\d+',
 }
 
 RULES = {
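The sign prefix is what lets grammar authors write negative priorities (see the `B.-20` test below). A quick self-contained check of the new pattern:

```python
import re

NUMBER = r'[+-]?\d+'

assert re.fullmatch(NUMBER, '-20')   # negative priorities now lex
assert re.fullmatch(NUMBER, '+7')    # explicit plus sign too
assert not re.fullmatch(NUMBER, '--1')
```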
@@ -196,7 +196,7 @@ class EBNF_to_BNF(Transformer_InPlace):
                 mn = mx = int(args[0])
             else:
                 mn, mx = map(int, args)
-                if mx < mn:
+                if mx < mn or mn < 0:
                     raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
             return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
         assert False, op
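A quick check of the new validation; `GrammarError` is importable from the package root per the `__init__.py` hunk above (grammars are illustrative):

```python
from lark import Lark, GrammarError

for bad in ('start: "a" ~ 3..1',    # reversed range
            'start: "a" ~ -1..2'):  # negative count, lexable since the NUMBER change
    try:
        Lark(bad)
        print("unexpectedly accepted:", bad)
    except GrammarError as e:
        print("rejected:", e)
```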
@@ -55,7 +55,7 @@ class LR0ItemSet(object):
 
 def update_set(set1, set2):
-    if not set2:
+    if not set2 or set1 > set2:
         return False
 
     copy = set(set1)
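For readability, the whole helper with the new early exit, reconstructed from the hunk; the tail beyond the shown lines is assumed to follow the usual copy-union-compare pattern:

```python
def update_set(set1, set2):
    # Nothing can be added if set2 is empty, or if set1 is already a
    # proper superset of set2 -- the new test skips the copy in that case.
    if not set2 or set1 > set2:
        return False

    copy = set(set1)
    set1 |= set2
    return set1 != copy  # True iff set1 actually grew
```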
@@ -102,6 +102,8 @@ def calculate_sets(rules):
                 if set(rule.expansion[:i]) <= NULLABLE:
                     if update_set(FIRST[rule.origin], FIRST[sym]):
                         changed = True
+                else:
+                    break
 
     # Calculate FOLLOW
     changed = True
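Why the added `break` is safe: prefixes only grow, so once `rule.expansion[:i]` contains a non-nullable symbol, every longer prefix contains it too and the test can never succeed again. The surrounding loop, restated from the hunk:

```python
# Names (rule, NULLABLE, FIRST, update_set, changed) as in the hunk above.
for i, sym in enumerate(rule.expansion):
    if set(rule.expansion[:i]) <= NULLABLE:  # all symbols before sym nullable?
        if update_set(FIRST[rule.origin], FIRST[sym]):
            changed = True
    else:
        break  # some earlier symbol isn't nullable; later positions can't pass
```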
@@ -159,7 +161,7 @@ class GrammarAnalyzer(object):
         self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
 
-    def expand_rule(self, rule, rules_by_origin=None):
+    def expand_rule(self, source_rule, rules_by_origin=None):
         "Returns all init_ptrs accessible by rule (recursive)"
 
         if rules_by_origin is None:
@@ -178,13 +180,7 @@ class GrammarAnalyzer(object):
                 if not new_r.is_term:
                     yield new_r
 
-        for _ in bfs([rule], _expand_rule):
+        for _ in bfs([source_rule], _expand_rule):
             pass
 
         return fzset(init_ptrs)
-
-    def _first(self, r):
-        if r.is_term:
-            return {r}
-        else:
-            return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}
@@ -1029,6 +1029,32 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(res.children, ['ab'])
 
+            grammar = """
+            start: A B | AB
+            A: "a"
+            B.-20: "b"
+            AB.-10: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+            self.assertEqual(res.children, ['a', 'b'])
+
+            grammar = """
+            start: A B | AB
+            A.-99999999999999999999999: "a"
+            B: "b"
+            AB: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+            self.assertEqual(res.children, ['ab'])
+
         def test_import(self):
             grammar = """