Преглед изворни кода

Feature: Added explicit ambiguity option for Earley

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan пре 7 година
родитељ
комит
f374e70b2c
6 измењених фајлова са 102 додато и 23 уклоњено
  1. +39
    -0
      examples/fruitflies.py
  2. +1
    -1
      lark/__init__.py
  3. +19
    -6
      lark/lark.py
  4. +14
    -6
      lark/parser_frontends.py
  5. +12
    -10
      lark/parsers/earley.py
  6. +17
    -0
      tests/test_parser.py

+ 39
- 0
examples/fruitflies.py Прегледај датотеку

@@ -0,0 +1,39 @@
#
# This example shows how to get explicit ambiguity from Lark's Earley parser:
# with ambiguity='explicit', every derivation of the input is returned,
# grouped under an "_ambig" tree node.
#

from lark import Lark

grammar = """
sentence: noun verb noun -> simple
| noun verb "like" noun -> comparative

noun: ADJ? NOUN
verb: VERB

NOUN: "flies" | "bananas" | "fruit"
VERB: "like" | "flies"
ADJ: "fruit"

%import common.WS
%ignore WS
"""

parser = Lark(grammar, start='sentence', ambiguity='explicit')

# "flies" can be read as the verb (comparative) or as part of the noun
# "fruit flies" (simple), so the sentence has two derivations.
tree = parser.parse('fruit flies like bananas')
print(tree.pretty())

# Outputs (indentation shows tree nesting):
#
# _ambig
#   comparative
#     noun fruit
#     verb flies
#     noun bananas
#   simple
#     noun
#       fruit
#       flies
#     verb like
#     noun bananas


+ 1
- 1
lark/__init__.py Прегледај датотеку

@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError
from .lark import Lark
from .utils import inline_args

__version__ = "0.2.6"
__version__ = "0.2.7"

+ 19
- 6
lark/lark.py Прегледај датотеку

@@ -27,6 +27,11 @@ class LarkOptions(object):
"contextual": Stronger lexer (only works with parser="lalr")
"auto" (default): Choose for me based on grammar and parser

ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
"resolve": The parser will automatically choose the simplest derivation
(it chooses consistently: greedy for tokens, non-greedy for rules)
"explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
"auto" (default): Use "resolve" when parser="earley"

transformer - Applies the transformer to every parse tree
debug - Affects verbosity (default: False)
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
@@ -49,6 +54,7 @@ class LarkOptions(object):
self.transformer = o.pop('transformer', None)
self.start = o.pop('start', 'start')
self.profile = o.pop('profile', False)
self.ambiguity = o.pop('ambiguity', 'auto')

assert self.parser in ('earley', 'lalr', None)

@@ -119,13 +125,20 @@ class Lark:
assert not self.options.profile, "Feature temporarily disabled"
self.profiler = Profiler() if self.options.profile else None

lexer = self.options.lexer
if lexer == 'auto':
if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
lexer = 'standard'
self.options.lexer = 'standard'
elif self.options.parser == 'earley':
lexer = None
self.options.lexer = lexer
self.options.lexer = None
lexer = self.options.lexer
assert lexer in ('standard', 'contextual', None)

if self.options.ambiguity == 'auto':
if self.options.parser == 'earley':
self.options.ambiguity = 'resolve'
else:
assert self.options.parser == 'earley'
assert self.options.ambiguity in ('resolve', 'explicit', 'auto')

self.grammar = load_grammar(grammar, source)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start)
@@ -155,7 +168,7 @@ class Lark:
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
parser_conf = ParserConf(rules, callback, self.options.start)

return self.parser_class(self.lexer_conf, parser_conf)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)


def lex(self, text):


+ 14
- 6
lark/parser_frontends.py Прегледај датотеку

@@ -20,7 +20,7 @@ class WithLexer:
return stream

class LALR(WithLexer):
def __init__(self, lexer_conf, parser_conf):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf)

self.parser_conf = parser_conf
@@ -31,7 +31,7 @@ class LALR(WithLexer):
return self.parser.parse(tokens)

class LALR_ContextualLexer:
def __init__(self, lexer_conf, parser_conf):
def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf
self.parser_conf = parser_conf

@@ -126,12 +126,16 @@ class OldEarley_NoLex:
return res[0]

class Earley_NoLex:
def __init__(self, lexer_conf, parser_conf):
def __init__(self, lexer_conf, parser_conf, options=None):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}

rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]

self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)
resolve_ambiguity = (options.ambiguity=='resolve') if options else True
self.parser = earley.Parser(rules,
parser_conf.start,
parser_conf.callback,
resolve_ambiguity=resolve_ambiguity)

def _prepare_expansion(self, expansion):
for sym in expansion:
@@ -149,12 +153,16 @@ class Earley_NoLex:
return self.parser.parse(new_text)

class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf)

rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]

self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)
resolve_ambiguity = (options.ambiguity=='resolve') if options else True
self.parser = earley.Parser(rules,
parser_conf.start,
parser_conf.callback,
resolve_ambiguity=resolve_ambiguity)

def _prepare_expansion(self, expansion):
return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]


+ 12
- 10
lark/parsers/earley.py Прегледај датотеку

@@ -101,10 +101,10 @@ class Column:
# XXX Potential bug: What happens if there's ambiguity in an empty rule?
if item.rule.expansion and item in self.completed:
old_tree = self.completed[item].tree
if old_tree.data != 'ambig':
if old_tree.data != '_ambig':
new_tree = old_tree.copy()
new_tree.rule = old_tree.rule
old_tree.set('ambig', [new_tree])
old_tree.set('_ambig', [new_tree])
if item.tree.children[0] is old_tree: # XXX a little hacky!
raise ParseError("Infinite recursion in grammar!")
old_tree.children.append(item.tree)
@@ -125,9 +125,10 @@ class Column:
return bool(self.item_count)

class Parser:
def __init__(self, rules, start, callback):
def __init__(self, rules, start, callback, resolve_ambiguity=True):
self.analysis = GrammarAnalyzer(rules, start)
self.start = start
self.resolve_ambiguity = resolve_ambiguity

self.postprocess = {}
self.predictions = {}
@@ -197,9 +198,11 @@ class Parser:
elif len(solutions) == 1:
tree = solutions[0]
else:
tree = Tree('ambig', solutions)
tree = Tree('_ambig', solutions)

if self.resolve_ambiguity:
ResolveAmbig().visit(tree)

ResolveAmbig().visit(tree)
return ApplyCallbacks(self.postprocess).transform(tree)

@@ -220,9 +223,8 @@ def _compare_rules(rule1, rule2):
assert rule1.origin == rule2.origin
c = compare( len(rule1.expansion), len(rule2.expansion))
if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here
return c
else:
return -c
c = -c
return c

def _compare_drv(tree1, tree2):
if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
@@ -242,8 +244,8 @@ def _compare_drv(tree1, tree2):


class ResolveAmbig(Visitor_NoRecurse):
def ambig(self, tree):
best = max(tree.children, key=cmp_to_key(_compare_drv))
def _ambig(self, tree):
best = min(tree.children, key=cmp_to_key(_compare_drv))
assert best.data == 'drv'
tree.set('drv', best.children)
tree.rule = best.rule # needed for applying callbacks


+ 17
- 0
tests/test_parser.py Прегледај датотеку

@@ -120,6 +120,23 @@ class TestEarley(unittest.TestCase):
empty_tree = Tree('empty', [Tree('empty2', [])])
self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

def test_earley_explicit_ambiguity(self):
    """With ambiguity='explicit', both derivations of 'ab' are returned.

    The input 'ab' matches `start` either as `a b` or as the single
    rule `ab`, so the result must be an '_ambig' node with two children.
    """
    ambiguous_grammar = """
start: a b | ab
a: "a"
b: "b"
ab: "ab"
"""

    earley = Lark(ambiguous_grammar, parser='earley', lexer=None, ambiguity='explicit')
    tree = earley.parse('ab')

    self.assertEqual(tree.data, '_ambig')
    self.assertEqual(len(tree.children), 2)


def _make_parser_test(LEXER, PARSER):
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)


Loading…
Откажи
Сачувај