Explorar el Código

Official switched to my Earley implementation

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan hace 7 años
padre
commit
c17558dd91
Se han modificado 4 ficheros con 301 adiciones y 306 borrados
  1. +14
    -18
      lark/parser_frontends.py
  2. +132
    -144
      lark/parsers/earley.py
  3. +0
    -144
      lark/parsers/earley2.py
  4. +155
    -0
      lark/parsers/nearley.py

+ 14
- 18
lark/parser_frontends.py Ver fichero

@@ -1,10 +1,10 @@
import re
import sre_parse

from .lexer import Lexer, ContextualLexer
from .lexer import Lexer, ContextualLexer, Token

from .common import is_terminal, GrammarError, ParserConf
from .parsers import lalr_parser, earley, earley2
from .parsers import lalr_parser, earley, nearley
from .parsers.grammar_analysis import Rule

class WithLexer:
@@ -56,18 +56,14 @@ class Nearley(WithLexer):
WithLexer.__init__(self, lexer_conf)

rules = [{'name':n,
'symbols': list(self._prepare_expansion(x)),
'symbols': self._prepare_expansion(x),
'postprocess': getattr(parser_conf.callback, a)}
for n,x,a in parser_conf.rules]

self.parser = earley.Parser(rules, parser_conf.start)
self.parser = nearley.Parser(rules, parser_conf.start)

def _prepare_expansion(self, expansion):
for sym in expansion:
if is_terminal(sym):
yield sym, None
else:
yield sym
return [(sym, None) if is_terminal(sym) else sym for sym in expansion]

def parse(self, text):
tokens = list(self.lex(text))
@@ -76,14 +72,14 @@ class Nearley(WithLexer):
return res[0]


class MyEarley(WithLexer):
class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf):
WithLexer.__init__(self, lexer_conf)

rules = [(n, self._prepare_expansion(x), a)
for n,x,a in parser_conf.rules]

self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

def _prepare_expansion(self, expansion):
return [(sym,) if is_terminal(sym) else sym for sym in expansion]
@@ -95,7 +91,7 @@ class MyEarley(WithLexer):
return res[0]


class Earley_NoLex:
class Nearley_NoLex:
def __init__(self, lexer_conf, parser_conf):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}

@@ -104,7 +100,7 @@ class Earley_NoLex:
'postprocess': getattr(parser_conf.callback, a)}
for n,x,a in parser_conf.rules]

self.parser = earley.Parser(rules, parser_conf.start)
self.parser = nearley.Parser(rules, parser_conf.start)

def _prepare_expansion(self, expansion):
for sym in expansion:
@@ -123,14 +119,14 @@ class Earley_NoLex:
return res[0]


class MyEarley_NoLex:
class Earley_NoLex:
def __init__(self, lexer_conf, parser_conf):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}

rules = [(n, list(self._prepare_expansion(x)), a)
for n,x,a in parser_conf.rules]

self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

def _prepare_expansion(self, expansion):
for sym in expansion:
@@ -139,18 +135,18 @@ class MyEarley_NoLex:
width = sre_parse.parse(regexp).getwidth()
if not width == (1,1):
raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
yield re.compile(regexp).match
yield (re.compile(regexp).match,)
else:
yield sym

def parse(self, text):
res = self.parser.parse(text)
res = self.parser.parse([Token(x,x) for x in text]) # A little hacky perhaps!
assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
return res[0]

ENGINE_DICT = {
'lalr': LALR,
'earley': MyEarley,
'earley': Earley,
'earley_nolex': Earley_NoLex,
'lalr_contextual_lexer': LALR_ContextualLexer
}

+ 132
- 144
lark/parsers/earley.py Ver fichero

@@ -1,155 +1,143 @@
"My name is Earley"
from ..common import ParseError, UnexpectedToken, is_terminal
from .grammar_analysis import GrammarAnalyzer

from ..utils import classify, STRING_TYPE
from ..common import ParseError, UnexpectedToken
# is_terminal = callable

try:
xrange
except NameError:
xrange = range

class MatchFailed(object):
pass

class AbortParseMatch(Exception):
pass


class Rule(object):
def __init__(self, name, symbols, postprocess):
self.name = name
self.symbols = symbols
self.postprocess = postprocess

class State(object):
def __init__(self, rule, expect, reference, data=None):
class Item:
def __init__(self, rule, ptr, start, data):
self.rule = rule
self.expect = expect
self.reference = reference
self.data = data or []

self.is_complete = (self.expect == len(self.rule.symbols))
if not self.is_complete:
self.expect_symbol = self.rule.symbols[self.expect]
self.is_terminal = isinstance(self.expect_symbol, tuple)
else:
self.is_terminal = False

def next_state(self, data):
return State(self.rule, self.expect+1, self.reference, self.data + [data])

def consume_terminal(self, inp):
if not self.is_complete and self.is_terminal:
# PORT: originally tests regexp

if self.expect_symbol[1] is not None:
match = self.expect_symbol[1].match(inp)
if match:
return self.next_state(inp)

elif self.expect_symbol[0] == inp.type:
return self.next_state(inp)

def consume_nonterminal(self, inp):
if not self.is_complete and not self.is_terminal:

if self.expect_symbol == inp:
return self.next_state(inp)

def process(self, location, ind, table, rules, added_rules):

if self.is_complete:
# Completed a rule
if self.rule.postprocess:
try:
self.data = self.rule.postprocess(self.data)
except AbortParseMatch:
self.data = MatchFailed

if self.data is not MatchFailed:
for s in table[self.reference]:
x = s.consume_nonterminal(self.rule.name)
if x:
x.data[-1] = self.data
x.epsilon_closure(location, ind, table)

else:
exp = self.rule.symbols[self.expect]
if isinstance(exp, tuple):
return

for r in rules[exp]:
assert r.name == exp
if r not in added_rules:
if r.symbols:
added_rules.add(r)
State(r, 0, location).epsilon_closure(location, ind, table)
else:
# Empty rule
new_copy = self.consume_nonterminal(r.name)
new_copy.data[-1] = r.postprocess([]) if r.postprocess else []
self.ptr = ptr
self.start = start
self.data = data

new_copy.epsilon_closure(location, ind, table)
@property
def expect(self):
return self.rule.expansion[self.ptr]

def epsilon_closure(self, location, ind, table):
col = table[location]
col.append(self)
@property
def is_complete(self):
return self.ptr == len(self.rule.expansion)

if not self.is_complete:
for i in xrange(ind):
state = col[i]
if state.is_complete and state.reference == location:
x = self.consume_nonterminal(state.rule.name)
if x:
x.data[-1] = state.data
x.epsilon_closure(location, ind, table)
def advance(self, data):
return Item(self.rule, self.ptr+1, self.start, self.data + [data])

def __eq__(self, other):
return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
def __hash__(self):
return hash((self.rule, self.ptr, self.start))

class Parser(object):
def __init__(self, rules, start=None):
self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
self.rules_by_name = classify(self.rules, lambda r: r.name)
self.start = start or self.rules[0].name

def advance_to(self, table, added_rules):
n = len(table)-1
for w, s in enumerate(table[n]):
s.process(n, w, table, self.rules_by_name, added_rules)
class Parser:
def __init__(self, parser_conf):
self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
self.start = parser_conf.start

self.postprocess = {}
self.predictions = {}
for rule in self.analysis.rules:
if rule.origin != '$root': # XXX kinda ugly
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
self.predictions[rule.origin] = [(x.rule, x.index) for x in self.analysis.expand_rule(rule.origin)]

def parse(self, stream):
initial_rules = set(self.rules_by_name[self.start])
table = [[State(r, 0, 0) for r in initial_rules]]
self.advance_to(table, initial_rules)

i = 0

while i < len(stream):
col = []

token = stream[i]
for s in table[-1]:
x = s.consume_terminal(token)
if x:
col.append(x)

if not col:
expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
raise UnexpectedToken(stream[i], expected, stream, i)

table.append(col)
self.advance_to(table, set())

i += 1

res = list(self.finish(table))
if not res:
raise ParseError('Incomplete parse')
return res

def finish(self, table):
for t in table[-1]:
if (t.rule.name == self.start
and t.expect == len(t.rule.symbols)
and t.reference == 0
and t.data is not MatchFailed):
yield t.data
# Define parser functions

def predict(symbol, i):
assert not is_terminal(symbol), symbol
return {Item(rule, index, i, []) for rule, index in self.predictions[symbol]}

def complete(item, table):
#item.data = (item.rule_ptr.rule, item.data)
item.data = self.postprocess[item.rule](item.data)
return {old_item.advance(item.data) for old_item in table[item.start]
if not old_item.is_complete and old_item.expect == item.rule.origin}

def process_column(i, term):
assert i == len(table)-1
cur_set = table[i]
next_set = set()

to_process = cur_set
while to_process:
new_items = set()
for item in to_process:
if item.is_complete:
new_items |= complete(item, table)
else:
if is_terminal(item.expect):
# scan
match = item.expect[0](term) if callable(item.expect[0]) else item.expect[0] == term
if match:
next_set.add(item.advance(stream[i]))
else:
if item.ptr: # part of an already predicted batch
new_items |= predict(item.expect, i)

to_process = new_items - cur_set # TODO: is this precaution necessary?
cur_set |= to_process


if not next_set and term != '$end':
expect = filter(is_terminal, [x.expect for x in cur_set if not x.is_complete])
raise UnexpectedToken(term, expect, stream, i)

table.append(next_set)

# Main loop starts

table = [predict(self.start, 0)]

for i, char in enumerate(stream):
process_column(i, char.type)

process_column(len(stream), '$end')

# Parse ended. Now build a parse tree
solutions = [n.data for n in table[len(stream)]
if n.is_complete and n.rule.origin==self.start and n.start==0]

if not solutions:
raise ParseError('Incomplete parse: Could not find a solution to input')

return solutions
#return map(self.reduce_solution, solutions)

def reduce_solution(self, solution):
rule, children = solution
children = [self.reduce_solution(c) if isinstance(c, tuple) else c for c in children]
return self.postprocess[rule](children)



from ..common import ParserConf
# A = 'A'.__eq__
# rules = [
# ('a', ['a', A], None),
# ('a', ['a', A, 'a'], None),
# ('a', ['a', A, A, 'a'], None),
# ('a', [A], None),
# ]

# p = Parser(ParserConf(rules, None, 'a'))
# for x in p.parse('AAAA'):
# print '->'
# print x.pretty()

# import re
# NUM = re.compile('[0-9]').match
# ADD = re.compile('[+-]').match
# MUL = re.compile('[*/]').match
# rules = [
# ('sum', ['sum', ADD, 'product'], None),
# ('sum', ['product'], None),
# ('product', ['product', MUL, 'factor'], None),
# ('product', ['factor'], None),
# ('factor', ['('.__eq__, 'sum', ')'.__eq__], None),
# ('factor', ['number'], None),
# ('number', [NUM, 'number'], None),
# ('number', [NUM], None),
# ]

# p = Parser(ParserConf(rules, None, 'sum'))
# # print p.parse('NALNMNANR')
# print p.parse('1+(2*3-4)')[0].pretty()

+ 0
- 144
lark/parsers/earley2.py Ver fichero

@@ -1,144 +0,0 @@
import sys

from ..common import ParseError, UnexpectedToken, is_terminal
from grammar_analysis import GrammarAnalyzer

# is_terminal = callable

class Item:
    """An Earley item: a dotted rule instance in the parse chart.

    rule  -- the grammar rule being matched
    ptr   -- position of the "dot" inside rule.expansion
    start -- index of the chart column where this item began
    data  -- children matched so far (one entry per consumed symbol)
    """
    def __init__(self, rule, ptr, start, data):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.data = data

    @property
    def expect(self):
        # The next symbol after the dot; only valid when not is_complete.
        return self.rule.expansion[self.ptr]

    @property
    def is_complete(self):
        # The dot has moved past the last symbol of the expansion.
        return self.ptr == len(self.rule.expansion)

    def advance(self, data):
        # Return a new item with the dot moved one symbol right and
        # `data` appended as the child matched for that symbol.
        return Item(self.rule, self.ptr+1, self.start, self.data + [data])

    def __eq__(self, other):
        # NOTE: equality/hash deliberately ignore `data`, so items that
        # differ only in matched children collapse in the chart's sets.
        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
    def __hash__(self):
        return hash((self.rule, self.ptr, self.start))


class Parser:
    """Earley parser driven by a GrammarAnalyzer.

    Builds per-rule prediction and postprocess tables at construction
    time, then runs the classic predict/scan/complete chart algorithm
    over a token stream in parse().
    """
    def __init__(self, parser_conf):
        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.start = parser_conf.start

        # rule -> callback used to build the tree node for a completed rule
        self.postprocess = {}
        # nonterminal -> list of (rule, dot-index) items it predicts
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':   # XXX kinda ugly
                self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
                self.predictions[rule.origin] = [(x.rule, x.index) for x in self.analysis.expand_rule(rule.origin)]

    def parse(self, stream):
        # Define parser functions (closures over `table` and `stream`)

        def predict(symbol, i):
            # Seed column i with every dotted rule that can derive `symbol`.
            assert not is_terminal(symbol), symbol
            return {Item(rule, index, i, []) for rule, index in self.predictions[symbol]}

        def complete(item, table):
            #item.data = (item.rule_ptr.rule, item.data)
            # Build the tree node for the finished rule, then advance every
            # item in the origin column that was waiting on this nonterminal.
            item.data = self.postprocess[item.rule](item.data)
            return {old_item.advance(item.data) for old_item in table[item.start]
                    if not old_item.is_complete and old_item.expect == item.rule.origin}

        def process_column(i, term):
            # Run predict/complete to a fixpoint on column i, scanning
            # `term` into next_set, then push next_set as column i+1.
            assert i == len(table)-1
            cur_set = table[i]
            next_set = set()

            to_process = cur_set
            while to_process:
                new_items = set()
                for item in to_process:
                    if item.is_complete:
                        new_items |= complete(item, table)
                    else:
                        if is_terminal(item.expect):
                            # scan: terminal symbols are tuples, compared by
                            # their first element against the token type.
                            if item.expect[0] == term:
                                next_set.add(item.advance(stream[i]))
                        else:
                            if item.ptr: # part of an already predicted batch
                                new_items |= predict(item.expect, i)

                to_process = new_items - cur_set # TODO: is this precaution necessary?
                cur_set |= to_process


            # An empty next column means no item could scan this token.
            if not next_set and term != '$end':
                expect = filter(is_terminal, [x.expect for x in cur_set if not x.is_complete])
                raise UnexpectedToken(term, expect, stream, i)

            table.append(next_set)

        # Main loop starts

        table = [predict(self.start, 0)]

        for i, char in enumerate(stream):
            process_column(i, char.type)

        # Final column is processed against a sentinel end-marker.
        process_column(len(stream), '$end')

        # Parse ended. Now build a parse tree: accept items that span the
        # whole input and derive the start symbol.
        solutions = [n.data for n in table[len(stream)]
                     if n.is_complete and n.rule.origin==self.start and n.start==0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')

        return solutions
        #return map(self.reduce_solution, solutions)

    def reduce_solution(self, solution):
        # Currently unused (see commented return above): folds a
        # (rule, children) tuple tree bottom-up through the callbacks.
        rule, children = solution
        children = [self.reduce_solution(c) if isinstance(c, tuple) else c for c in children]
        return self.postprocess[rule](children)



from ..common import ParserConf
# A = 'A'.__eq__
# rules = [
# ('a', ['a', A], None),
# ('a', ['a', A, 'a'], None),
# ('a', ['a', A, A, 'a'], None),
# ('a', [A], None),
# ]

# p = Parser(ParserConf(rules, None, 'a'))
# for x in p.parse('AAAA'):
# print '->'
# print x.pretty()

# import re
# NUM = re.compile('[0-9]').match
# ADD = re.compile('[+-]').match
# MUL = re.compile('[*/]').match
# rules = [
# ('sum', ['sum', ADD, 'product'], None),
# ('sum', ['product'], None),
# ('product', ['product', MUL, 'factor'], None),
# ('product', ['factor'], None),
# ('factor', ['('.__eq__, 'sum', ')'.__eq__], None),
# ('factor', ['number'], None),
# ('number', [NUM, 'number'], None),
# ('number', [NUM], None),
# ]

# p = Parser(ParserConf(rules, None, 'sum'))
# # print p.parse('NALNMNANR')
# print p.parse('1+(2*3-4)')[0].pretty()

+ 155
- 0
lark/parsers/nearley.py Ver fichero

@@ -0,0 +1,155 @@
"My name is Earley"

from ..utils import classify, STRING_TYPE
from ..common import ParseError, UnexpectedToken

try:
xrange
except NameError:
xrange = range

class MatchFailed(object):
    """Sentinel stored in State.data when a postprocess callback aborts the match."""
    pass

class AbortParseMatch(Exception):
    """Raised inside a rule's postprocess callback to reject an otherwise-complete match."""
    pass


class Rule(object):
    """One grammar production.

    name        -- the nonterminal this production defines
    symbols     -- right-hand-side symbol sequence
    postprocess -- optional callback applied to the matched children
    """

    def __init__(self, name, symbols, postprocess):
        self.name, self.symbols, self.postprocess = name, symbols, postprocess

class State(object):
    """An Earley state (dotted rule) in the nearley-style chart.

    rule      -- the Rule being matched
    expect    -- index of the next symbol to match within rule.symbols
    reference -- chart column where this state began
    data      -- children matched so far (one slot per consumed symbol)
    """
    def __init__(self, rule, expect, reference, data=None):
        self.rule = rule
        self.expect = expect
        self.reference = reference
        self.data = data or []

        # Terminals are represented as tuples: index 0 is the token type,
        # index 1 is an optional matcher object (None when matching by type).
        self.is_complete = (self.expect == len(self.rule.symbols))
        if not self.is_complete:
            self.expect_symbol = self.rule.symbols[self.expect]
            self.is_terminal = isinstance(self.expect_symbol, tuple)
        else:
            self.is_terminal = False

    def next_state(self, data):
        # New state with the dot advanced and `data` recorded as the child.
        return State(self.rule, self.expect+1, self.reference, self.data + [data])

    def consume_terminal(self, inp):
        # Advance over a terminal, or return None if it doesn't match.
        if not self.is_complete and self.is_terminal:
            # PORT: originally tests regexp

            if self.expect_symbol[1] is not None:
                # Matcher present: use its .match() against the input token.
                match = self.expect_symbol[1].match(inp)
                if match:
                    return self.next_state(inp)

            elif self.expect_symbol[0] == inp.type:
                # No matcher: compare the expected token type directly.
                return self.next_state(inp)

    def consume_nonterminal(self, inp):
        # Advance over a nonterminal name, or return None if not expected.
        if not self.is_complete and not self.is_terminal:

            if self.expect_symbol == inp:
                return self.next_state(inp)

    def process(self, location, ind, table, rules, added_rules):
        """Run the completer (if finished) or predictor (if expecting a
        nonterminal) for this state at chart column `location`.

        ind         -- this state's index within the column
        table       -- the chart (list of columns)
        rules       -- mapping nonterminal name -> list of Rules
        added_rules -- rules already predicted in this column (dedup set)
        """

        if self.is_complete:
            # Completed a rule: build its value, unless the callback aborts.
            if self.rule.postprocess:
                try:
                    self.data = self.rule.postprocess(self.data)
                except AbortParseMatch:
                    self.data = MatchFailed

            if self.data is not MatchFailed:
                # Completer: advance every state in the origin column that
                # was waiting on this rule's name, patching in our value.
                for s in table[self.reference]:
                    x = s.consume_nonterminal(self.rule.name)
                    if x:
                        x.data[-1] = self.data
                        x.epsilon_closure(location, ind, table)

        else:
            exp = self.rule.symbols[self.expect]
            if isinstance(exp, tuple):
                # Expecting a terminal: nothing to predict here.
                return

            # Predictor: add each rule for the expected nonterminal once.
            for r in rules[exp]:
                assert r.name == exp
                if r not in added_rules:
                    if r.symbols:
                        added_rules.add(r)
                        State(r, 0, location).epsilon_closure(location, ind, table)
                    else:
                        # Empty rule: complete it immediately in place.
                        new_copy = self.consume_nonterminal(r.name)
                        new_copy.data[-1] = r.postprocess([]) if r.postprocess else []

                        new_copy.epsilon_closure(location, ind, table)

    def epsilon_closure(self, location, ind, table):
        # Append self to column `location`, then retroactively apply any
        # completions already present earlier in the column (indices < ind).
        col = table[location]
        col.append(self)

        if not self.is_complete:
            for i in xrange(ind):
                state = col[i]
                if state.is_complete and state.reference == location:
                    x = self.consume_nonterminal(state.rule.name)
                    if x:
                        x.data[-1] = state.data
                        x.epsilon_closure(location, ind, table)


class Parser(object):
    """nearley-style Earley parser over dict-form rules.

    Each rule dict has keys 'name', 'symbols', and optionally
    'postprocess'; `start` defaults to the first rule's name.
    """
    def __init__(self, rules, start=None):
        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
        self.rules_by_name = classify(self.rules, lambda r: r.name)
        self.start = start or self.rules[0].name

    def advance_to(self, table, added_rules):
        # Run predict/complete on every state of the newest column.
        # Iteration by index tolerates states appended during processing.
        n = len(table)-1
        for w, s in enumerate(table[n]):
            s.process(n, w, table, self.rules_by_name, added_rules)

    def parse(self, stream):
        """Parse a token stream; return the list of complete parses.

        Raises UnexpectedToken when a column scans nothing, and
        ParseError when the input ends without a full parse.
        """
        # Column 0: one state per start rule, then close it.
        initial_rules = set(self.rules_by_name[self.start])
        table = [[State(r, 0, 0) for r in initial_rules]]
        self.advance_to(table, initial_rules)

        i = 0

        while i < len(stream):
            col = []

            # Scan: advance every state that can consume the current token.
            token = stream[i]
            for s in table[-1]:
                x = s.consume_terminal(token)
                if x:
                    col.append(x)

            if not col:
                expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
                raise UnexpectedToken(stream[i], expected, stream, i)

            table.append(col)
            self.advance_to(table, set())

            i += 1

        res = list(self.finish(table))
        if not res:
            raise ParseError('Incomplete parse')
        return res

    def finish(self, table):
        # Yield the value of every state in the last column that spans the
        # whole input, derives the start symbol, and wasn't aborted.
        for t in table[-1]:
            if (t.rule.name == self.start
                and t.expect == len(t.rule.symbols)
                and t.reference == 0
                and t.data is not MatchFailed):
                yield t.data

Cargando…
Cancelar
Guardar