
merge lalr parser

Raekye · 5 years ago · parent commit fe2869bd63
8 changed files with 66 additions and 17 deletions
  1. docs/grammar.md (+12 / -1)
  2. docs/parsers.md (+1 / -1)
  3. lark/__init__.py (+1 / -1)
  4. lark/lark.py (+6 / -0)
  5. lark/lexer.py (+13 / -3)
  6. lark/load_grammar.py (+2 / -2)
  7. lark/parsers/grammar_analysis.py (+5 / -9)
  8. tests/test_parser.py (+26 / -0)

docs/grammar.md (+12 / -1)

```diff
@@ -45,6 +45,12 @@ Literals can be one of:
 * `/re with flags/imulx`
 * Literal range: `"a".."z"`, `"1".."9"`, etc.
 
+### Priority
+
+Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing).
+
+Priority can be either positive or negative. If not specified for a terminal, it's assumed to be 1 (i.e. the default).
+
 #### Notes for when using a lexer:
 
 When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in an order according to the following criteria:
```
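
To make the new section concrete, here is a minimal sketch of terminal priorities under a traditional lexer; the grammar and the `.2` value are my own, not from the commit:

```python
from lark import Lark

# "if" matches both IF and NAME; with a standard/contextual lexer the
# higher-priority terminal wins the collision.
parser = Lark(r"""
start: IF NAME
IF.2: "if"
NAME: /[a-z]+/
%ignore " "
""", parser='lalr')

print(parser.parse("if foo").children)  # IF 'if', then NAME 'foo'
```
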
````diff
@@ -90,7 +96,7 @@ Each item is one of:
 * `item*` - Zero or more instances of item
 * `item+` - One or more instances of item
 * `item ~ n` - Exactly *n* instances of item
-* `item ~ n..m` - Between *n* to *m* instances of item
+* `item ~ n..m` - Between *n* to *m* instances of item (not recommended for wide ranges, due to performance issues)
 
 **Examples:**
 ```perl
````
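
The performance caveat added above exists because `~ n..m` is compiled into one alternative per count (see the `EBNF_to_BNF` change in lark/load_grammar.py below). A small illustration; the grammar is my own:

```python
from lark import Lark

# `WORD ~ 2..3` expands to (WORD WORD | WORD WORD WORD), so a wide
# range such as ~ 1..1000 would generate a thousand alternatives.
parser = Lark(r"""
start: WORD ~ 2..3
%import common.WORD
%ignore " "
""")

print(parser.parse("two words").children)
print(parser.parse("now three words").children)
```
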
````diff
@@ -102,6 +108,11 @@ expr: expr operator expr
 four_words: word ~ 4
 ```
 
+### Priority
+
+Rules can be assigned priority only when using Earley (future versions may support LALR as well).
+
+Priority can be either positive or negative. If not specified for a rule, it's assumed to be 1 (i.e. the default).
+
 
 ## Directives
````
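
A hedged sketch of rule priorities with Earley; the example is mine, and the exact winning tree depends on lark's ambiguity resolution:

```python
from lark import Lark

# "ab" is ambiguous: one `whole`, or a `pair`. Under Earley's default
# resolution, the alternative containing the higher-priority rule
# should win (assumption based on the docs above).
parser = Lark(r"""
start: pair | whole
pair.2: "a" "b"
whole: "ab"
""", parser='earley')

print(parser.parse("ab").pretty())
```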




docs/parsers.md (+1 / -1)

```diff
@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser
 
 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.
 
-It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
 
 **SPPF & Ambiguity resolution**
```
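
For reference, both modes the changed sentence talks about can be selected explicitly. A minimal sketch:

```python
from lark import Lark

grammar = r"""
start: WORD+
%import common.WORD
%ignore " "
"""

# Default: Earley on top of the dynamic (skipping) lexer.
dynamic = Lark(grammar, parser='earley', lexer='dynamic')

# Bypass dynamic lexing: tokenize first, then parse the token stream.
# Faster, but tokenization can no longer be guided by the parse.
standard = Lark(grammar, parser='earley', lexer='standard')

assert dynamic.parse("a b").children == standard.parse("a b").children
```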




lark/__init__.py (+1 / -1)

```diff
@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
 from .lexer import Token
 from .lark import Lark
 
-__version__ = "0.7.1"
+__version__ = "0.7.2"
```

lark/lark.py (+6 / -0)

```diff
@@ -205,6 +205,8 @@ class Lark(Serialize):
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
 
+        self._terminals_dict = {t.name:t for t in self.terminals}
+
         # If the user asked to invert the priorities, negate them all here.
         # This replaces the old 'resolve__antiscore_sum' option.
         if self.options.priority == 'invert':
```
```diff
@@ -290,6 +292,10 @@ class Lark(Serialize):
             return self.options.postlex.process(stream)
         return stream
 
+    def get_terminal(self, name):
+        "Get information about a terminal"
+        return self._terminals_dict[name]
+
     def parse(self, text, start=None):
         """Parse the given text, according to the options provided.
```




lark/lexer.py (+13 / -3)

```diff
@@ -41,6 +41,8 @@ class Pattern(Serialize):
 
 
 class PatternStr(Pattern):
+    type = "str"
+
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
```

```diff
@@ -50,15 +52,23 @@ class PatternStr(Pattern):
     max_width = min_width
 
 class PatternRE(Pattern):
+    type = "re"
+
     def to_regexp(self):
         return self._get_flags(self.value)
 
+    _width = None
+    def _get_width(self):
+        if self._width is None:
+            self._width = get_regexp_width(self.to_regexp())
+        return self._width
+
     @property
     def min_width(self):
-        return get_regexp_width(self.to_regexp())[0]
+        return self._get_width()[0]
     @property
     def max_width(self):
-        return get_regexp_width(self.to_regexp())[1]
 
 
 class TerminalDef(Serialize):
```
```diff
@@ -88,7 +98,7 @@ class Token(Str):
 
         self.type = type_
         self.pos_in_stream = pos_in_stream
-        self.value = value
+        self.value = Str(value)
         self.line = line
         self.column = column
         self.end_line = end_line
```
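
The `PatternRE` hunk above is a compute-once cache: `get_regexp_width` used to run on every `min_width`/`max_width` access. A standalone sketch of the same idiom (names are mine):

```python
class Expensive:
    # Class attribute doubles as a "not computed yet" sentinel; the
    # first call stores an instance attribute that shadows it, so the
    # computation runs at most once per instance.
    _result = None

    def _get_result(self):
        if self._result is None:
            self._result = self._compute()
        return self._result

    def _compute(self):
        print("computing...")
        return (1, 8)

e = Expensive()
assert e._get_result() is e._get_result()  # "computing..." prints once
```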


lark/load_grammar.py (+2 / -2)

```diff
@@ -90,7 +90,7 @@ TERMINALS = {
     '_IGNORE': r'%ignore',
     '_DECLARE': r'%declare',
     '_IMPORT': r'%import',
-    'NUMBER': r'\d+',
+    'NUMBER': r'[+-]?\d+',
 }
 
 RULES = {
```
RULES = { RULES = {
```diff
@@ -196,7 +196,7 @@ class EBNF_to_BNF(Transformer_InPlace):
             mn = mx = int(args[0])
         else:
             mn, mx = map(int, args)
-            if mx < mn:
+            if mx < mn or mn < 0:
                 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
         return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
     assert False, op
```


lark/parsers/grammar_analysis.py (+5 / -9)

```diff
@@ -55,7 +55,7 @@ class LR0ItemSet(object):
 
 
 def update_set(set1, set2):
-    if not set2:
+    if not set2 or set1 > set2:
         return False
 
     copy = set(set1)
```
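
`update_set` merges `set2` into `set1` and reports whether `set1` grew; the new guard skips the work when growth is provably impossible. Reconstructed in full (the last three lines are assumed from the `copy = set(set1)` context):

```python
def update_set(set1, set2):
    # Nothing to add, or set1 already a proper superset: no growth
    # possible, so skip building the snapshot below.
    if not set2 or set1 > set2:
        return False
    copy = set(set1)
    set1 |= set2            # merge in place
    return set1 != copy     # True iff set1 actually grew
```
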
```diff
@@ -102,6 +102,8 @@ def calculate_sets(rules):
                 if set(rule.expansion[:i]) <= NULLABLE:
                     if update_set(FIRST[rule.origin], FIRST[sym]):
                         changed = True
+                else:
+                    break
 
     # Calculate FOLLOW
     changed = True
```
```diff
@@ -159,7 +161,7 @@ class GrammarAnalyzer(object):
 
         self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
 
-    def expand_rule(self, rule, rules_by_origin=None):
+    def expand_rule(self, source_rule, rules_by_origin=None):
         "Returns all init_ptrs accessible by rule (recursive)"
 
         if rules_by_origin is None:
```
```diff
@@ -178,13 +180,7 @@ class GrammarAnalyzer(object):
             if not new_r.is_term:
                 yield new_r
 
-        for _ in bfs([rule], _expand_rule):
+        for _ in bfs([source_rule], _expand_rule):
             pass
 
         return fzset(init_ptrs)
-
-    def _first(self, r):
-        if r.is_term:
-            return {r}
-        else:
-            return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}
```
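
The `else: break` in the second hunk matters because FIRST(origin) may only absorb FIRST(sym) while every symbol before `sym` is nullable; once a non-nullable symbol is reached, later symbols cannot contribute. A self-contained re-derivation of that fixpoint (simplified names, not lark's API):

```python
def first_sets(rules, nullable):
    """rules: (origin, expansion) pairs over string symbols; a symbol
    that never appears as an origin is treated as a terminal."""
    origins = {o for o, _ in rules}
    FIRST = {o: set() for o in origins}
    changed = True
    while changed:
        changed = False
        for origin, expansion in rules:
            for sym in expansion:
                add = FIRST[sym] if sym in origins else {sym}
                if not add <= FIRST[origin]:
                    FIRST[origin] |= add
                    changed = True
                if sym not in nullable:
                    break  # mirrors the added `else: break`
    return FIRST

# S -> A "b",  A -> "a" | (empty): A is nullable, so FIRST(S) = {a, b}
rules = [('S', ('A', 'b')), ('A', ('a',)), ('A', ())]
print(first_sets(rules, nullable={'A'}))  # {'S': {'a', 'b'}, 'A': {'a'}}
```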

tests/test_parser.py (+26 / -0)

```diff
@@ -1029,6 +1029,32 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(res.children, ['ab'])
 
+            grammar = """
+            start: A B | AB
+            A: "a"
+            B.-20: "b"
+            AB.-10: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+            self.assertEqual(res.children, ['a', 'b'])
+
+            grammar = """
+            start: A B | AB
+            A.-99999999999999999999999: "a"
+            B: "b"
+            AB: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+
+            self.assertEqual(res.children, ['ab'])
+
         def test_import(self):
             grammar = """
```

