Ver a proveniência

Improvements and cleanup to the earley parser

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan há 7 anos
ascendente
cometimento
fee18a8d8a
2 ficheiros alterados com 59 adições e 35 eliminações
  1. +30
    -4
      lark/parser_frontends.py
  2. +29
    -31
      lark/parsers/earley.py

+ 30
- 4
lark/parser_frontends.py Ver ficheiro

@@ -1,3 +1,5 @@
import re

from .lexer import Lexer
from .parsers.lalr_analysis import GrammarAnalyzer

@@ -33,22 +35,46 @@ class Earley(WithLexer):
WithLexer.__init__(self, lexer_conf)

rules = [{'name':n,
'symbols': self._process_expansion(x),
'symbols': list(self._prepare_expansion(x)),
'postprocess': getattr(parser_conf.callback, a)}
for n,x,a in parser_conf.rules]

self.parser = earley.Parser(rules, parser_conf.start)

def _prepare_expansion(self, expansion):
    """Adapt a rule expansion for the Earley engine.

    Terminal symbols are wrapped as a ``(name, None)`` pair (no regexp,
    since this frontend lexes before parsing); nonterminals pass through
    unchanged.
    """
    for s in expansion:
        yield (s, None) if is_terminal(s) else s

def parse(self, text):
    """Lex *text* into tokens, parse them, and return the single result.

    Raises AssertionError if the grammar produced more than one parse
    tree — ambiguity is not handled by this frontend yet.
    """
    tokens = list(self.lex(text))
    res = self.parser.parse(tokens)
    # Fixed typo in the error message ("Ambiguious" -> "Ambiguous").
    assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
    return res[0]

@staticmethod
def _process_expansion(x):
return [{'literal': s} if is_terminal(s) else s for s in x]
class Earley2:
    """Scannerless Earley frontend: feeds raw text to the parser and lets
    terminals be matched by their compiled regexps instead of a prior
    lexing pass (contrast with the lexer-based ``Earley`` frontend).
    """

    def __init__(self, lexer_conf, parser_conf):
        # Terminal definitions are needed so each terminal symbol can be
        # paired with a compiled regexp for in-parser matching.
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [{'name': n,
                  'symbols': list(self._prepare_expansion(x)),
                  'postprocess': getattr(parser_conf.callback, a)}
                 for n, x, a in parser_conf.rules]

        self.parser = earley.Parser(rules, parser_conf.start)

    def _prepare_expansion(self, expansion):
        """Yield ``(name, compiled_regexp)`` for terminals, bare symbols
        for nonterminals."""
        for sym in expansion:
            if is_terminal(sym):
                yield sym, re.compile(self.token_by_name[sym].to_regexp())
            else:
                yield sym

    def parse(self, text):
        """Parse raw *text* and return the single resulting tree.

        Raises AssertionError on ambiguous results (not handled yet).
        """
        res = self.parser.parse(text)
        # Fixed typo in the error message ("Ambiguious" -> "Ambiguous").
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

# Maps the parser-algorithm name to its frontend class — presumably keyed
# by the user-facing `parser` option; confirm against the Lark constructor.
ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }

+ 29
- 31
lark/parsers/earley.py Ver ficheiro

@@ -31,37 +31,35 @@ class State(object):
self.is_complete = (self.expect == len(self.rule.symbols))
if not self.is_complete:
self.expect_symbol = self.rule.symbols[self.expect]
self.is_literal = isinstance(self.expect_symbol, dict)
if self.is_literal:
self.expect_symbol = self.expect_symbol['literal']
assert isinstance(self.expect_symbol, STRING_TYPE), self.expect_symbol
self.is_terminal = isinstance(self.expect_symbol, tuple)
else:
self.is_literal = False
self.is_terminal = False

def next_state(self, data):
return State(self.rule, self.expect+1, self.reference, self.data + [data])

def consume_terminal(self, inp):
if not self.is_complete and self.is_literal:
if not self.is_complete and self.is_terminal:
# PORT: originally tests regexp

if self.expect_symbol == inp.type:
if self.expect_symbol[1] is not None:
match = self.expect_symbol[1].match(stream, pos)

if self.expect_symbol[0] == inp.type:
return self.next_state(inp)

def consume_nonterminal(self, inp):
if not self.is_complete and not self.is_literal:
if not self.is_complete and not self.is_terminal:

if self.expect_symbol == inp:
return self.next_state(inp)

def process(self, location, ind, table, rules, added_rules):

if self.is_complete:
# Completed a rule
if self.rule.postprocess:
try:
# self.data = self.rule.postprocess(self.data, self.reference)
# import pdb
# pdb.set_trace()
self.data = self.rule.postprocess(self.data)
except AbortParseMatch:
self.data = MatchFailed
@@ -75,7 +73,7 @@ class State(object):

else:
exp = self.rule.symbols[self.expect]
if isinstance(exp, dict):
if isinstance(exp, tuple):
return

for r in rules[exp]:
@@ -87,19 +85,13 @@ class State(object):
else:
# Empty rule
new_copy = self.consume_nonterminal(r.name)
if r.postprocess:
new_copy.data[-1] = r.postprocess([])
else:
new_copy.data[-1] = []
new_copy.data[-1] = r.postprocess([]) if r.postprocess else []

new_copy.epsilon_closure(location, ind, table)

def epsilon_closure(self, location, ind, table, result=None):
def epsilon_closure(self, location, ind, table):
col = table[location]
if not result:
result = col

result.append(self)
col.append(self)

if not self.is_complete:
for i in xrange(ind):
@@ -117,29 +109,35 @@ class Parser(object):
self.rules_by_name = classify(self.rules, lambda r: r.name)
self.start = start or self.rules[0].name

def advance_to(self, table, n, added_rules):
def advance_to(self, table, added_rules):
n = len(table)-1
for w, s in enumerate(table[n]):
s.process(n, w, table, self.rules_by_name, added_rules)

def parse(self, stream):
initial_rules = set(self.rules_by_name[self.start])
table = [[State(r, 0, 0) for r in initial_rules]]
self.advance_to(table, 0, initial_rules)
self.advance_to(table, initial_rules)

i = 0

for pos, token in enumerate(stream):
table.append([])
while i < len(stream):
col = []

for s in table[pos]:
token = stream[i]
for s in table[-1]:
x = s.consume_terminal(token)
if x:
table[pos + 1].append(x)
col.append(x)

if not col:
expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
raise UnexpectedToken(stream[i], expected, stream, i)

self.advance_to(table, pos + 1, set())
table.append(col)
self.advance_to(table, set())

if not table[-1]:
expected = {s.expect_symbol for s in table[-2] if s.is_literal}
raise UnexpectedToken(stream[pos], expected, stream, pos)
i += 1

res = list(self.finish(table))
if not res:


Carregando…
Cancelar
Guardar