@@ -45,6 +45,12 @@ Literals can be one of:
 * `/re with flags/imulx`
 * Literal range: `"a".."z"`, `"1".."9"`, etc.
+### Priority
+Terminals can be assigned a priority only when using a lexer (future versions may support Earley's dynamic lexing).
+Priority can be either positive or negative. If not specified for a terminal, it's assumed to be 1 (i.e. the default).
+#### Notes for when using a lexer:
+When using a lexer (standard or contextual), it is the grammar author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in order according to the following criteria:
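As a usage sketch of the terminal-priority syntax documented above (the `TERM.priority` notation also appears in the tests at the end of this diff), with an illustrative grammar of my own; with a standard lexer, the higher-priority literal wins when literals collide:

```python
from lark import Lark

# Illustrative grammar, not from the diff: "ab" could tokenize as AB,
# or as A followed by B. Raising AB's priority makes the lexer prefer it.
parser = Lark(r"""
    start: (A | B | AB)+
    A: "a"
    B: "b"
    AB.2: "ab"
""", parser='earley', lexer='standard')

print(parser.parse("ab").children)  # expect a single 'ab' token
```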
@@ -90,7 +96,7 @@ Each item is one of:
 * `item*` - Zero or more instances of item
 * `item+` - One or more instances of item
 * `item ~ n` - Exactly *n* instances of item
-* `item ~ n..m` - Between *n* to *m* instances of item
+* `item ~ n..m` - Between *n* and *m* instances of item (not recommended for wide ranges, due to performance issues)
 **Examples:**
 ```perl
@@ -102,6 +108,11 @@ expr: expr operator expr
 four_words: word ~ 4
 ```
+### Priority
+Rules can be assigned a priority only when using Earley (future versions may support LALR as well).
+Priority can be either positive or negative. If not specified for a rule, it's assumed to be 1 (i.e. the default).
 ## Directives
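A hedged sketch of rule priority in action, assuming Earley's default ambiguity handling prefers the higher-priority derivation (grammar and expected output are illustrative):

```python
from lark import Lark

# "ab" is ambiguous: a single "ab", or "a" followed by "b".
parser = Lark(r"""
    start: pair | whole
    pair: "a" "b"
    whole.2: "ab"
""", parser='earley')

tree = parser.parse("ab")
print(tree.children[0].data)  # expect 'whole', the higher-priority rule
```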
@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser
 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitely using `lexer='dynamic'`.
-It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, which tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
 **SPPF & Ambiguity resolution**
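A minimal sketch of the two configurations described above (grammar is illustrative; the `parser`/`lexer` options are Lark's documented constructor arguments):

```python
from lark import Lark

grammar = r"""
    start: WORD+
    WORD: /\w+/
    %ignore " "
"""

# Default for Earley is the dynamic lexer; it can also be requested
# explicitly, letting Earley's parse state drive how regexps match.
dynamic_parser = Lark(grammar, parser='earley', lexer='dynamic')

# Bypass dynamic lexing: tokenize as an independent first step.
# Faster, but tokenization happens without Earley's ambiguity resolution.
standard_parser = Lark(grammar, parser='earley', lexer='standard')

print(dynamic_parser.parse("hello world").children)
```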
@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
 from .lexer import Token
 from .lark import Lark
-__version__ = "0.7.1"
+__version__ = "0.7.2"
@@ -205,6 +205,8 @@ class Lark(Serialize):
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
         self._terminals_dict = {t.name:t for t in self.terminals}
+        # If the user asked to invert the priorities, negate them all here.
+        # This replaces the old 'resolve__antiscore_sum' option.
         if self.options.priority == 'invert':
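A hedged usage sketch of the option handled above, assuming `priority='invert'` is accepted by the Lark constructor as the hunk's comment suggests (grammar is illustrative):

```python
from lark import Lark

grammar = r"""
    start: pair | whole
    pair: "a" "b"
    whole.2: "ab"
"""

# Inverting negates every priority, so the normally-losing derivation
# should win ambiguity resolution instead.
parser = Lark(grammar, parser='earley', priority='invert')
print(parser.parse("ab").children[0].data)  # expect 'pair'
```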
@@ -290,6 +292,10 @@ class Lark(Serialize):
             return self.options.postlex.process(stream)
         return stream
 
+    def get_terminal(self, name):
+        "Get information about a terminal"
+        return self._terminals_dict[name]
+
     def parse(self, text, start=None):
         """Parse the given text, according to the options provided.
@@ -41,6 +41,8 @@ class Pattern(Serialize):
 class PatternStr(Pattern):
+    type = "str"
+
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
@@ -50,15 +52,23 @@ class PatternStr(Pattern):
         max_width = min_width
 
 class PatternRE(Pattern):
+    type = "re"
+
     def to_regexp(self):
         return self._get_flags(self.value)
 
+    _width = None
+    def _get_width(self):
+        if self._width is None:
+            self._width = get_regexp_width(self.to_regexp())
+        return self._width
+
     @property
     def min_width(self):
-        return get_regexp_width(self.to_regexp())[0]
+        return self._get_width()[0]
     @property
     def max_width(self):
-        return get_regexp_width(self.to_regexp())[1]
+        return self._get_width()[1]
 
 class TerminalDef(Serialize):
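The change above caches the regexp-width computation in `_width`, since `min_width`/`max_width` previously re-derived it on every access. A small sketch of the utility being memoized, assuming `get_regexp_width` is importable from `lark.utils` (where it lives in this tree):

```python
from lark.utils import get_regexp_width

# Returns the (min, max) possible match lengths of a regexp; re-parsing
# the pattern on every access is what the new _width cache avoids.
print(get_regexp_width(r"ab?"))  # expect (1, 2)
```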
@@ -88,7 +98,7 @@ class Token(Str):
         self.type = type_
         self.pos_in_stream = pos_in_stream
-        self.value = value
+        self.value = Str(value)
         self.line = line
         self.column = column
         self.end_line = end_line
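A hedged illustration of the apparent intent of the `Str(value)` coercion: a Token passed as `value` should no longer leave a nested Token in `.value` (this assumes `Str` is lark's plain text type, as defined in `lark.utils`):

```python
from lark import Token

inner = Token('WORD', 'hello')
outer = Token('WORD', inner)  # value happens to be another Token

# With the coercion, .value is a plain string, not a nested Token.
assert outer.value == 'hello'
assert not isinstance(outer.value, Token)
```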
@@ -90,7 +90,7 @@ TERMINALS = {
     '_IGNORE': r'%ignore',
     '_DECLARE': r'%declare',
     '_IMPORT': r'%import',
-    'NUMBER': r'\d+',
+    'NUMBER': r'[+-]?\d+',
 }
 
 RULES = {
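The sign prefix is what lets grammar authors write negative priorities (see the `B.-20` test below). A quick self-contained check of the new pattern:

```python
import re

NUMBER = r'[+-]?\d+'

assert re.fullmatch(NUMBER, '-20')   # negative priorities now lex
assert re.fullmatch(NUMBER, '+7')    # explicit plus sign too
assert not re.fullmatch(NUMBER, '--1')
```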
@@ -196,7 +196,7 @@ class EBNF_to_BNF(Transformer_InPlace):
                 mn = mx = int(args[0])
             else:
                 mn, mx = map(int, args)
-                if mx < mn:
+                if mx < mn or mn < 0:
                     raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
             return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
         assert False, op
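A quick check of the new validation; `GrammarError` is importable from the package root per the `__init__.py` hunk above (grammars are illustrative):

```python
from lark import Lark, GrammarError

for bad in ('start: "a" ~ 3..1',    # reversed range
            'start: "a" ~ -1..2'):  # negative count, lexable since the NUMBER change
    try:
        Lark(bad)
        print("unexpectedly accepted:", bad)
    except GrammarError as e:
        print("rejected:", e)
```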
@@ -55,7 +55,7 @@ class LR0ItemSet(object):
 
 def update_set(set1, set2):
-    if not set2:
+    if not set2 or set1 > set2:
         return False
 
     copy = set(set1)
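For readability, the whole helper with the new early exit, reconstructed from the hunk; the tail beyond the shown lines is assumed to follow the usual copy-union-compare pattern:

```python
def update_set(set1, set2):
    # Nothing can be added if set2 is empty, or if set1 is already a
    # proper superset of set2 -- the new test skips the copy in that case.
    if not set2 or set1 > set2:
        return False

    copy = set(set1)
    set1 |= set2
    return set1 != copy  # True iff set1 actually grew
```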
@@ -102,6 +102,8 @@ def calculate_sets(rules):
                 if set(rule.expansion[:i]) <= NULLABLE:
                     if update_set(FIRST[rule.origin], FIRST[sym]):
                         changed = True
+                else:
+                    break
 
     # Calculate FOLLOW
     changed = True
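Why the added `break` is safe: prefixes only grow, so once `rule.expansion[:i]` contains a non-nullable symbol, every longer prefix contains it too and the test can never succeed again. The surrounding loop, restated from the hunk:

```python
# Names (rule, NULLABLE, FIRST, update_set, changed) as in the hunk above.
for i, sym in enumerate(rule.expansion):
    if set(rule.expansion[:i]) <= NULLABLE:  # all symbols before sym nullable?
        if update_set(FIRST[rule.origin], FIRST[sym]):
            changed = True
    else:
        break  # some earlier symbol isn't nullable; later positions can't pass
```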
@@ -159,7 +161,7 @@ class GrammarAnalyzer(object):
         self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
 
-    def expand_rule(self, rule, rules_by_origin=None):
+    def expand_rule(self, source_rule, rules_by_origin=None):
         "Returns all init_ptrs accessible by rule (recursive)"
 
         if rules_by_origin is None:
@@ -178,13 +180,7 @@ class GrammarAnalyzer(object):
                 if not new_r.is_term:
                     yield new_r
 
-        for _ in bfs([rule], _expand_rule):
+        for _ in bfs([source_rule], _expand_rule):
             pass
 
         return fzset(init_ptrs)
-
-    def _first(self, r):
-        if r.is_term:
-            return {r}
-        else:
-            return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}
@@ -1029,6 +1029,32 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(res.children, ['ab'])
 
+            grammar = """
+            start: A B | AB
+            A: "a"
+            B.-20: "b"
+            AB.-10: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+            self.assertEqual(res.children, ['a', 'b'])
+
+            grammar = """
+            start: A B | AB
+            A.-99999999999999999999999: "a"
+            B: "b"
+            AB: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+            self.assertEqual(res.children, ['ab'])
+
         def test_import(self):
             grammar = """