Dramatically improved memory consumption of Earley parser

I no longer keep the entire table, only the relevant columns.
I let Python handle garbage collection for me.
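
A minimal, self-contained sketch of the effect (toy Column class and my own names, not lark's code): as long as a table list holds every column, none of them can ever be collected; once the parser keeps only the current column, CPython's reference counting frees the rest as the loop advances.

    import weakref

    class Column:
        pass

    # Old style: a table pins every column for the whole parse.
    table = [Column() for _ in range(3)]
    pinned = [weakref.ref(c) for c in table]

    # New style: keep only the current column; older ones become unreachable.
    probes = []
    cur = None
    for _ in range(3):
        cur = Column()                 # rebinding drops the previous column
        probes.append(weakref.ref(cur))

    print([p() is not None for p in pinned])   # [True, True, True]
    print([p() is not None for p in probes])   # [False, False, True]
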
Tag: tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Author: Erez Shinan (8 years ago)
Parent commit: 302dad36fe

1 changed file with 15 additions and 16 deletions:
  lark/parsers/earley.py (+15, -16)

@@ -33,9 +33,9 @@ class Item(object):
         return Item(self.rule, self.ptr+1, self.start, self.data + [data])

     def __eq__(self, other):
-        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
+        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
     def __hash__(self):
-        return hash((self.rule, self.ptr, self.start))
+        return hash((self.rule, self.ptr, id(self.start)))

     def __repr__(self):
         before = map(str, self.rule.expansion[:self.ptr])
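
The first hunk follows from a representation change: Item.start used to be an integer index into the table, and now it is the start Column object itself. Two distinct columns must never compare equal, even if their contents happen to match, so equality switches to 'is' and the hash is derived from id(). A tiny sketch of that invariant (toy Column, not lark's class):

    class Column:
        pass

    c0, c1 = Column(), Column()
    assert c0 is c0            # same column: items starting here can merge
    assert c0 is not c1        # distinct columns never deduplicate
    assert id(c0) != id(c1)    # id() yields hashes consistent with 'is'
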
@@ -115,14 +115,12 @@ class Parser:
             assert not is_terminal(nonterm), nonterm
             return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]

-        def complete(item, table):
+        def complete(item):
             name = item.rule.origin
             item.data = self.postprocess[item.rule](item.data)
-            return [i.advance(item.data) for i in table[item.start].to_predict if i.expect == name]
+            return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]

-        def process_column(i, token):
-            assert i == len(table)-1
-            cur_set = table[i]
+        def process_column(i, token, cur_set):
             next_set = Column()

             while True:
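
With Item.start holding the column directly, complete() in this hunk stops indexing a global table: the items waiting on the completed nonterminal are one attribute access away. A hedged stand-in for that lookup (toy classes, not lark's API):

    class Column:
        def __init__(self):
            self.to_predict = []      # items waiting on some nonterminal here

    class Item:
        def __init__(self, expect, start):
            self.expect, self.start = expect, start

    origin = Column()
    origin.to_predict.append(Item('expr', origin))

    finished = Item(None, origin)     # a completed 'expr' item, say
    waiting = [i for i in finished.start.to_predict if i.expect == 'expr']
    print(len(waiting))               # 1, found without any table
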
@@ -133,9 +131,9 @@ class Parser:
                     break

                 for nonterm in to_predict:
-                    cur_set.add( predict(nonterm, i) )
+                    cur_set.add( predict(nonterm, cur_set) )
                 for item in to_reduce:
-                    cur_set.add( complete(item, table) )
+                    cur_set.add( complete(item) )


             if token is not END_TOKEN:
@@ -148,20 +146,21 @@ class Parser:
                     expect = {i.expect for i in cur_set.to_scan}
                     raise UnexpectedToken(token, expect, stream, i)

-            table.append(next_set)
+            return cur_set, next_set

         # Main loop starts
-        table = [Column()]
-        table[0].add(predict(start, 0))
+        column0 = Column()
+        column0.add(predict(start, column0))

+        cur_set = column0
         for i, char in enumerate(stream):
-            process_column(i, char)
+            _, cur_set = process_column(i, char, cur_set)

-        process_column(len(stream), END_TOKEN)
+        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)

         # Parse ended. Now build a parse tree
-        solutions = [n.data for n in table[len(stream)].to_reduce
-                     if n.rule.origin==start and n.start==0]
+        solutions = [n.data for n in last_set.to_reduce
+                     if n.rule.origin==start and n.start is column0]

         if not solutions:
             raise ParseError('Incomplete parse: Could not find a solution to input')
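
The last hunk completes the lifetime story: only column0 (still needed for the final "n.start is column0" spanning-parse check), the current column, and whatever columns live items still reference through .start stay reachable; everything else is freed while the parse is still running. A toy demonstration of that chain (my names, not lark's):

    import weakref

    class Column:
        pass

    class Item:
        def __init__(self, start):
            self.start = start        # direct reference replaces a table index

    col_a, col_b = Column(), Column()
    live_item = Item(col_a)           # an unfinished item that began in col_a
    probe_a, probe_b = weakref.ref(col_a), weakref.ref(col_b)

    del col_a, col_b                  # the parser has moved past both columns
    print(probe_a() is not None)      # True: kept alive via live_item.start
    print(probe_b() is not None)      # False: unreachable, already collected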

