
My Earley parser is working

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
parent
commit
538f944602
5 changed files with 127 additions and 62 deletions
  1. lark/common.py  (+1 -1)
  2. lark/parser_frontends.py  (+56 -4)
  3. lark/parsers/earley2.py  (+68 -55)
  4. lark/parsers/grammar_analysis.py  (+1 -1)
  5. lark/parsers/lalr_analysis.py  (+1 -1)

lark/common.py  (+1 -1)

@@ -28,7 +28,7 @@ class UnexpectedToken(ParseError):
 
 
 def is_terminal(sym):
-    return sym.isupper() or sym[0] == '$'
+    return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'
 
 
 class LexerConf:
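
Note: the added isinstance(sym, tuple) case exists because the new MyEarley frontend (below) wraps every terminal name in a 1-tuple before handing rules to earley2. A minimal sketch of the resulting behavior — the example symbols are illustrative, not taken from this commit:

def is_terminal(sym):
    return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'

assert is_terminal('NUMBER')       # uppercase token name
assert is_terminal(('NUMBER',))    # tuple-wrapped terminal: the newly added case
assert is_terminal('$end')         # internal end-of-input marker
assert not is_terminal('expr')     # lowercase rule name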


lark/parser_frontends.py  (+56 -4)

@@ -3,8 +3,9 @@ import sre_parse
 
 from .lexer import Lexer, ContextualLexer
 
-from .common import is_terminal, GrammarError
-from .parsers import lalr_parser, earley
+from .common import is_terminal, GrammarError, ParserConf
+from .parsers import lalr_parser, earley, earley2
+from .parsers.grammar_analysis import Rule
 
 class WithLexer:
     def __init__(self, lexer_conf):
@@ -50,7 +51,7 @@ class LALR_ContextualLexer:
 
 
 
-class Earley(WithLexer):
+class Nearley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
 
@@ -74,6 +75,26 @@ class Earley(WithLexer):
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
+
+class MyEarley(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
+
+        rules = [(n, self._prepare_expansion(x), a)
+                 for n,x,a in parser_conf.rules]
+
+        self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+
+    def _prepare_expansion(self, expansion):
+        return [(sym,) if is_terminal(sym) else sym for sym in expansion]
+
+    def parse(self, text):
+        tokens = list(self.lex(text))
+        res = self.parser.parse(tokens)
+        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
+        return res[0]
+
+
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
@@ -101,4 +122,35 @@ class Earley_NoLex:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
+
+class MyEarley_NoLex:
+    def __init__(self, lexer_conf, parser_conf):
+        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+
+        rules = [(n, list(self._prepare_expansion(x)), a)
+                 for n,x,a in parser_conf.rules]
+
+        self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+
+    def _prepare_expansion(self, expansion):
+        for sym in expansion:
+            if is_terminal(sym):
+                regexp = self.token_by_name[sym].to_regexp()
+                width = sre_parse.parse(regexp).getwidth()
+                if not width == (1,1):
+                    raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
+                yield re.compile(regexp).match
+            else:
+                yield sym
+
+    def parse(self, text):
+        res = self.parser.parse(text)
+        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
+        return res[0]
+
+ENGINE_DICT = {
+    'lalr': LALR,
+    'earley': MyEarley,
+    'earley_nolex': Earley_NoLex,
+    'lalr_contextual_lexer': LALR_ContextualLexer
+}
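
Note: the load-bearing detail in MyEarley is _prepare_expansion, which wraps terminal names in 1-tuples so that earley2 can tell terminals apart from rule names and match them against token types (the item.expect[0] == term comparison in earley2.py below). A self-contained sketch of just that step, using a hypothetical rule for illustration:

def is_terminal(sym):
    return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'

def _prepare_expansion(expansion):
    # Terminals become 1-tuples; rule names pass through unchanged.
    return [(sym,) if is_terminal(sym) else sym for sym in expansion]

print(_prepare_expansion(['sum', 'PLUS', 'product']))
# -> ['sum', ('PLUS',), 'product']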

lark/parsers/earley2.py  (+68 -55)

@@ -1,67 +1,63 @@
 import sys
 
 from ..common import ParseError, UnexpectedToken, is_terminal
 from grammar_analysis import GrammarAnalyzer
 
 from ..tree import Tree
+# is_terminal = callable
 
 class Item:
-    def __init__(self, rule_ptr, start, data):
-        self.rule_ptr = rule_ptr
+    def __init__(self, rule, ptr, start, data):
+        self.rule = rule
+        self.ptr = ptr
         self.start = start
         self.data = data
 
     @property
     def expect(self):
-        return self.rule_ptr.next
+        return self.rule.expansion[self.ptr]
 
     @property
     def is_complete(self):
-        return self.rule_ptr.is_satisfied
-
-    @property
-    def name(self):
-        return self.rule_ptr.rule.origin
+        return self.ptr == len(self.rule.expansion)
 
     def advance(self, data):
-        return Item(self.rule_ptr.advance(self.expect), self.start, self.data + [data])
+        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
 
     def __eq__(self, other):
-        return self.rule_ptr == other.rule_ptr and self.start == other.start
+        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
     def __hash__(self):
-        return hash((self.rule_ptr, self.start))
-
-    def __repr__(self):
-        return '%s (%s)' % (self.rule_ptr, self.start)
+        return hash((self.rule, self.ptr, self.start))
 
 
 class Parser:
-    def __init__(self, rules, start):
-        self.analyzer = GrammarAnalyzer(rules, start)
-        self.start = start
+    def __init__(self, parser_conf):
+        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.start = parser_conf.start
 
+        self.postprocess = {}
+        self.predictions = {}
+        for rule in self.analysis.rules:
+            if rule.origin != '$root': # XXX kinda ugly
+                self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
+                self.predictions[rule.origin] = [(x.rule, x.index) for x in self.analysis.expand_rule(rule.origin)]
+
     def parse(self, stream):
         # Define parser functions
 
         def predict(symbol, i):
             assert not is_terminal(symbol), symbol
-            return {Item(rp, i, []) for rp in self.analyzer.expand_rule(symbol)}
-
-        def scan(item, inp):
-            if item.expect == inp: # TODO Do a smarter match, i.e. regexp
-                return {item.advance(inp)}
-            else:
-                return set()
+            return {Item(rule, index, i, []) for rule, index in self.predictions[symbol]}
 
         def complete(item, table):
-            name = item.name
-            item.data = Tree(name, item.data)
-            #item.data = (item.rule_ptr.rule, item.data)
+            item.data = self.postprocess[item.rule](item.data)
             return {old_item.advance(item.data) for old_item in table[item.start]
-                    if not old_item.is_complete and old_item.expect == name}
+                    if not old_item.is_complete and old_item.expect == item.rule.origin}
 
-        def process_column(i, char):
-            cur_set = table[-1]
+        def process_column(i, term):
+            assert i == len(table)-1
+            cur_set = table[i]
             next_set = set()
+            table.append(next_set)
 
             to_process = cur_set
             while to_process:
@@ -71,61 +67,78 @@ class Parser:
                         new_items |= complete(item, table)
                     else:
                         if is_terminal(item.expect):
-                            next_set |= scan(item, char)
+                            # scan
+                            if item.expect[0] == term:
+                                next_set.add(item.advance(stream[i]))
                         else:
-                            new_items |= predict(item.expect, i)
+                            if item.ptr: # part of an already predicted batch
+                                new_items |= predict(item.expect, i)
 
-                to_process = new_items - cur_set
+                to_process = new_items - cur_set # TODO: is this precaution necessary?
                 cur_set |= to_process
 
-            if not next_set and char != '$end':
-                expect = filter(is_terminal, [i.expect for i in cur_set if not i.is_complete])
-                raise UnexpectedToken(char, expect, stream, i)
-
+            if not next_set and term != '$end':
+                expect = filter(is_terminal, [x.expect for x in cur_set if not x.is_complete])
+                raise UnexpectedToken(term, expect, stream, i)
 
-            table.append(next_set)
 
         # Main loop starts
 
         table = [predict(self.start, 0)]
 
         for i, char in enumerate(stream):
-            process_column(i, char)
+            process_column(i, char.type)
 
         process_column(len(stream), '$end')
 
         # Parse ended. Now build a parse tree
         solutions = [n.data for n in table[len(stream)]
-                     if n.is_complete and n.name==self.start and n.start==0]
+                     if n.is_complete and n.rule.origin==self.start and n.start==0]
 
         if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')
 
         return solutions
+        #return map(self.reduce_solution, solutions)
+
+    def reduce_solution(self, solution):
+        rule, children = solution
+        children = [self.reduce_solution(c) if isinstance(c, tuple) else c for c in children]
+        return self.postprocess[rule](children)
 
+
+
+
+from ..common import ParserConf
+# A = 'A'.__eq__
 # rules = [
-#     ('a', ['a', 'A']),
-#     ('a', ['a', 'A', 'a']),
-#     ('a', ['a', 'A', 'A', 'a']),
-#     ('a', ['A']),
+#     ('a', ['a', A], None),
+#     ('a', ['a', A, 'a'], None),
+#     ('a', ['a', A, A, 'a'], None),
+#     ('a', [A], None),
 # ]
 
-# p = Parser(rules, 'a')
+# p = Parser(ParserConf(rules, None, 'a'))
 # for x in p.parse('AAAA'):
 #     print '->'
 #     print x.pretty()
 
+# import re
+# NUM = re.compile('[0-9]').match
+# ADD = re.compile('[+-]').match
+# MUL = re.compile('[*/]').match
 # rules = [
-#     ('sum', ['sum', "A", 'product']),
-#     ('sum', ['product']),
-#     ('product', ['product', "M", 'factor']),
-#     ('product', ['factor']),
-#     ('factor', ['L', 'sum', 'R']),
-#     ('factor', ['number']),
-#     ('number', ['N', 'number']),
-#     ('number', ['N']),
+#     ('sum', ['sum', ADD, 'product'], None),
+#     ('sum', ['product'], None),
+#     ('product', ['product', MUL, 'factor'], None),
+#     ('product', ['factor'], None),
+#     ('factor', ['('.__eq__, 'sum', ')'.__eq__], None),
+#     ('factor', ['number'], None),
+#     ('number', [NUM, 'number'], None),
+#     ('number', [NUM], None),
 # ]
 
-# p = Parser(rules, 'sum')
-# print p.parse('NALNMNANR')
+# p = Parser(ParserConf(rules, None, 'sum'))
+# # print p.parse('NALNMNANR')
+# print p.parse('1+(2*3-4)')[0].pretty()
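
Note: the rewritten parse() is a textbook Earley loop: predict expands an expected nonterminal into fresh dotted items (precomputed here via self.predictions), scan advances an item over a matching token, and complete advances every item that was waiting on a finished rule. The stand-alone toy below mirrors the same column mechanics under simplified assumptions (no GrammarAnalyzer, no callbacks, a hard-coded grammar, trees as plain tuples); it is an illustration, not the module's API:

# Toy grammar: sums and products over single-character tokens,
# with terminals written as 1-tuples, as in the frontend above.
RULES = [
    ('sum', ['sum', ('A',), 'product']),
    ('sum', ['product']),
    ('product', ['product', ('M',), 'factor']),
    ('product', ['factor']),
    ('factor', [('N',)]),
]

def is_terminal(sym):
    return isinstance(sym, tuple)

class Item:
    def __init__(self, rule, ptr, start, data):
        self.rule, self.ptr, self.start, self.data = rule, ptr, start, data
    @property
    def expect(self):
        return RULES[self.rule][1][self.ptr]
    @property
    def is_complete(self):
        return self.ptr == len(RULES[self.rule][1])
    def advance(self, data):
        return Item(self.rule, self.ptr + 1, self.start, self.data + [data])
    def __eq__(self, other):
        return (self.rule, self.ptr, self.start) == (other.rule, other.ptr, other.start)
    def __hash__(self):
        return hash((self.rule, self.ptr, self.start))

def parse(stream, start='sum'):
    def predict(symbol, i):
        return {Item(r, 0, i, []) for r, (origin, _) in enumerate(RULES)
                if origin == symbol}

    table = [predict(start, 0)]
    for i, term in enumerate(list(stream) + ['$end']):
        cur_set, next_set = table[i], set()
        to_process = set(cur_set)
        while to_process:
            new_items = set()
            for item in to_process:
                if item.is_complete:
                    # complete: wake up every item waiting on this rule's origin
                    origin = RULES[item.rule][0]
                    tree = (origin, item.data)
                    new_items |= {old.advance(tree) for old in table[item.start]
                                  if not old.is_complete and old.expect == origin}
                elif is_terminal(item.expect):
                    if item.expect[0] == term:            # scan
                        next_set.add(item.advance(term))
                else:
                    new_items |= predict(item.expect, i)  # predict
            to_process = new_items - cur_set
            cur_set |= to_process
        table.append(next_set)
    return [(start, item.data) for item in table[len(stream)]
            if item.is_complete and RULES[item.rule][0] == start and item.start == 0]

print(parse('NANMN'))   # one tuple-tree for "N A (N M N)"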

lark/parsers/grammar_analysis.py  (+1 -1)

@@ -133,7 +133,7 @@ class GrammarAnalyzer(object):
         "Returns all init_ptrs accessible by rule (recursive)"
         init_ptrs = set()
         def _expand_rule(rule):
-            assert not is_terminal(rule)
+            assert not is_terminal(rule), rule
 
             for r in self.rules_by_origin[rule]:
                 init_ptr = RulePtr(r, 0)


lark/parsers/lalr_analysis.py  (+1 -1)

@@ -4,7 +4,7 @@ from collections import defaultdict
 
 from ..utils import classify, classify_bool, bfs, fzset
 from ..common import GrammarError, is_terminal
 
-from grammar_analysis import GrammarAnalyzer
+from .grammar_analysis import GrammarAnalyzer
 
 ACTION_SHIFT = 0
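
Note: this one-line change replaces a Python 2 implicit relative import with the explicit relative form. Both resolve identically under Python 2, but only the explicit spelling also works on Python 3:

# Inside the lark.parsers package:
# from grammar_analysis import GrammarAnalyzer  # implicit relative: works on Python 2 only
from .grammar_analysis import GrammarAnalyzer   # explicit relative: Python 2.5+ and Python 3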


