Browse Source

Merge pull request #1 from erezsh/cyk_after_fixes

Cyk after fixes
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.3
ehudt 6 years ago
committed by GitHub
parent
commit
3034b76ac3
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 326 additions and 347 deletions
  1. +3
    -2
      examples/python2.g
  2. +2
    -1
      examples/python3.g
  3. +24
    -18
      lark/parser_frontends.py
  4. +283
    -323
      lark/parsers/cyk.py
  5. +2
    -2
      lark/tree.py
  6. +1
    -0
      tests/__main__.py
  7. +11
    -1
      tests/test_parser.py

+ 3
- 2
examples/python2.g View File

@@ -100,13 +100,14 @@ comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"
| molecule "[" [subscriptlist] "]" -> getitem
| molecule "." NAME -> getattr
| atom
?atom: "(" [yield_expr|testlist_comp] ")"
?atom: "(" [yield_expr|testlist_comp] ")" -> tuple
| "[" [listmaker] "]"
| "{" [dictorsetmaker] "}"
| "`" testlist1 "`"
| "(" test ")"
| NAME | number | string+
listmaker: test ( list_for | ("," test)* [","] )
?testlist_comp: test ( comp_for | ("," test)* [","] )
?testlist_comp: test ( comp_for | ("," test)+ [","] | ",")
lambdef: "lambda" [paramlist] ":" test
?subscriptlist: subscript ("," subscript)* [","]
subscript: "." "." "." | test | [test] ":" [test] [sliceop]


+ 2
- 1
examples/python3.g View File

@@ -121,12 +121,13 @@ AWAIT: "await"
| "{" [dictorsetmaker] "}" -> dict
| NAME -> var
| number | string+
| "(" test ")"
| "..." -> ellipsis
| "None" -> const_none
| "True" -> const_true
| "False" -> const_false

testlist_comp: (test|star_expr) ( comp_for | ("," (test|star_expr))* [","] )
?testlist_comp: (test|star_expr) ( comp_for | ("," (test|star_expr))+ [","] | ",")
subscriptlist: subscript ("," subscript)* [","]
subscript: test | [test] ":" [test] [sliceop]
sliceop: ":" [test]


+ 24
- 18
lark/parser_frontends.py View File

@@ -1,10 +1,11 @@
import re
from .utils import get_regexp_width

from parsers.grammar_analysis import GrammarAnalyzer
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import Lexer, ContextualLexer, Token

from .common import is_terminal, GrammarError, Terminal_Regexp, Terminal_Token
from .common import GrammarError
from .common import is_terminal, GrammarError
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .tree import Tree

@@ -137,31 +138,36 @@ class XEarley:
return self.parser.parse(text)


class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
self.init_traditional_lexer(lexer_conf)

self.parser = earley.Parser(parser_conf, self.match,
resolve_ambiguity=get_ambiguity_resolver(options))

def match(self, term, token):
return term == token.type

def parse(self, text):
tokens = self.lex(text)
return self.parser.parse(tokens)

class CYK(WithLexer):

def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf)
# TokenDef from synthetic rule to terminal value
self._token_by_name = {t.name: t for t in lexer_conf.tokens}
rules = [(lhs, self._prepare_expansion(rhs), cb, opt) for lhs, rhs, cb, opt in parser_conf.rules]
self._analysis = GrammarAnalyzer(rules, parser_conf.start)
self.init_traditional_lexer(lexer_conf)

self._analysis = GrammarAnalyzer(parser_conf)
self._parser = cyk.Parser(self._analysis.rules, parser_conf.start)

self._postprocess = {}
for rule in self._analysis.rules:
if rule.origin != '$root': # XXX kinda ugly
a = rule.alias
self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a))

def _prepare_expansion(self, expansion):
return [
Terminal_Regexp(sym, self._token_by_name[sym].pattern.to_regexp())
if is_terminal(sym) else sym for sym in expansion
]
a = rule.alias
self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a))

def parse(self, text):
tokenized = [token.value for token in self.lex(text)]
parse = self._parser.parse(tokenized)
tokens = list(self.lex(text))
parse = self._parser.parse(tokens)
parse = self._transform(parse)
return parse



+ 283
- 323
lark/parsers/cyk.py View File

@@ -1,225 +1,192 @@
"""This module implements a CYK parser."""

from collections import defaultdict
import itertools
import re

from ..common import ParseError, Terminal, Terminal_Regexp
from ..common import ParseError, is_terminal
from ..lexer import Token
from ..tree import Tree

def TypeName(x):
return type(x).__name__
try:
xrange
except NameError:
xrange = range

class Symbol(object):
"""Any grammar symbol."""
"""Any grammar symbol."""

def __init__(self, s):
self.s = s
def __init__(self, s):
self.s = s

def __repr__(self):
return '%s(%s)' % (TypeName(self), str(self))
def __repr__(self):
return '%s(%s)' % (type(self).__name__, str(self))

def __str__(self):
return str(self.s)
def __str__(self):
return str(self.s)

def __eq__(self, other):
return str(self) == str(other)
def __eq__(self, other):
return self.s == str(other)

def __ne__(self, other):
return not self.__eq__(other)
def __ne__(self, other):
return not self.__eq__(other)

def __hash__(self):
return hash(TypeName(self) + '&' + self.__str__())
def __hash__(self):
return hash((type(self), str(self.s)))


class T(Symbol):
"""Terminal."""

def __init__(self, s):
super(T, self).__init__(s)
self.regexp = re.compile(s)

def match(self, s):
m = self.regexp.match(s)
return bool(m) and len(m.group(0)) == len(s)
"""Terminal."""

def __eq__(self, other):
return super(T, self).__eq__(other) and isinstance(other, T)
def match(self, s):
return self.s == s.type


class NT(Symbol):
"""Non-terminal."""

def __eq__(self, other):
return super(NT, self).__eq__(other) and isinstance(other, NT)
"""Non-terminal."""
pass


class Rule(object):
"""Context-free grammar rule."""
"""Context-free grammar rule."""

def __init__(self, lhs, rhs, weight, alias):
super(Rule, self).__init__()
assert isinstance(lhs, NT), lhs
assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
self.lhs = lhs
self.rhs = rhs
self.weight = weight
self.alias = alias
def __init__(self, lhs, rhs, weight, alias):
super(Rule, self).__init__()
assert isinstance(lhs, NT), lhs
assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
self.lhs = lhs
self.rhs = rhs
self.weight = weight
self.alias = alias

def __str__(self):
return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))
def __str__(self):
return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))

def __repr__(self):
return str(self)
def __repr__(self):
return str(self)

def __hash__(self):
return hash(self.__repr__())
def __hash__(self):
return hash((self.lhs, tuple(self.rhs)))

def __eq__(self, other):
return self.lhs == other.lhs and self.rhs == other.rhs
def __eq__(self, other):
return self.lhs == other.lhs and self.rhs == other.rhs

def __ne__(self, other):
return not self.__eq__(other)
def __ne__(self, other):
return not (self == other)


class Grammar(object):
"""Context-free grammar."""
"""Context-free grammar."""

def __init__(self, rules):
super(Grammar, self).__init__()
self.rules = sorted(rules, key=lambda x: str(x))
def __init__(self, rules):
self.rules = frozenset(rules)

def __eq__(self, other):
return set(self.rules) == set(other.rules)
def __eq__(self, other):
return self.rules == other.rules

def __str__(self):
return '\n' + '\n'.join(sorted(x.__repr__() for x in self.rules)) + '\n'
def __str__(self):
return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n'

def __repr__(self):
return str(self)
def __repr__(self):
return str(self)


# Parse tree data structures
class RuleNode(object):
"""A node in the parse tree, which also contains the full rhs rule."""

def __init__(self, rule, children, weight=0):
super(RuleNode, self).__init__()
self.rule = rule
self.children = children
self.weight = weight

def __repr__(self):
return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(
str(x) for x in self.children))

def __hash__(self):
return hash(self.__repr__())
"""A node in the parse tree, which also contains the full rhs rule."""

def __init__(self, rule, children, weight=0):
self.rule = rule
self.children = children
self.weight = weight

class Node(object):
"""A node in the parse tree."""
def __repr__(self):
return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children))

def __init__(self, lhs, children):
super(Node, self).__init__()
self.lhs = lhs
self.children = children

def __repr__(self):
return 'Node(%s, [%s])' % (repr(self.lhs), ', '.join(
str(x) for x in self.children))

def __hash__(self):
return hash(self.__repr__())


class Parser(object):
"""Parser wrapper."""

def __init__(self, rules, start):
super(Parser, self).__init__()
self.orig_rules = {rule.alias: rule for rule in rules}
rules = [self._ToRule(rule) for rule in rules]
self.grammar = ToCnf(Grammar(rules))
self.start = NT(start)

def _ToRule(self, lark_rule):
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
return Rule(
NT(lark_rule.origin), [
T(x.data) if (isinstance(x, Terminal_Regexp) or
isinstance(x, Terminal)) else NT(x)
for x in lark_rule.expansion
], weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule.alias)

def parse(self, tokenized): # pylint: disable=invalid-name
"""Parses input, which is a list of tokens."""
table, trees = _Parse(tokenized, self.grammar)
# Check if the parse succeeded.
if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]):
raise ParseError('Parsing failed.')
parse = trees[(0, len(tokenized) - 1)][NT(self.start)]
return self._ToTree(RevertCnf(parse))

def _ToTree(self, rule_node):
"""Converts a RuleNode parse tree to a lark Tree."""
orig_rule = self.orig_rules[rule_node.rule.alias]
children = []
for i, child in enumerate(rule_node.children):
if isinstance(child, RuleNode):
children.append(self._ToTree(child))
elif isinstance(child, Terminal_Regexp):
children.append(Token(orig_rule.expansion[i].name, child.s))
else:
children.append(Token(orig_rule.expansion[i], child.s))
return Tree(orig_rule.origin, children, rule=orig_rule)


def PrintParse(node, indent=0):
if isinstance(node, RuleNode):
print(' ' * (indent * 2) + str(node.rule.lhs))
for child in node.children:
PrintParse(child, indent + 1)
else:
print(' ' * (indent * 2) + str(node.s))


def _Parse(s, g):
"""Parses sentence 's' using CNF grammar 'g'."""
# The CYK table. Indexed with a 2-tuple: (start pos, end pos)
table = defaultdict(set)
# Top-level structure is similar to the CYK table. Each cell is a dict from
# rule name to the best (lightest) tree for that rule.
trees = defaultdict(dict)
# Populate base case with existing terminal production rules
for i, w in enumerate(s):
for terminal, rules in g.terminal_rules.iteritems():
if terminal.match(w):
for rule in rules:
table[(i, i)].add(rule)
if (rule.lhs not in trees[(i, i)] or
rule.weight < trees[(i, i)][rule.lhs].weight):
trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
# Iterate over lengths of sub-sentences
for l in xrange(2, len(s) + 1):
# Iterate over sub-sentences with the given length
for i in xrange(len(s) - l + 1):
# Choose partition of the sub-sentence in [1, l)
for p in xrange(i + 1, i + l):
span1 = (i, p - 1)
span2 = (p, i + l - 1)
for r1, r2 in itertools.product(table[span1], table[span2]):
for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
table[(i, i + l - 1)].add(rule)
r1_tree = trees[span1][r1.lhs]
r2_tree = trees[span2][r2.lhs]
rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
if (rule.lhs not in trees[(i, i + l - 1)] or
rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
return table, trees
"""Parser wrapper."""

def __init__(self, rules, start):
super(Parser, self).__init__()
self.orig_rules = {rule.alias: rule for rule in rules}
rules = [self._to_rule(rule) for rule in rules]
self.grammar = to_cnf(Grammar(rules))
self.start = NT(start)

def _to_rule(self, lark_rule):
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
return Rule(
NT(lark_rule.origin), [
T(x) if is_terminal(x) else NT(x) for x in lark_rule.expansion
], weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule.alias)

def parse(self, tokenized): # pylint: disable=invalid-name
"""Parses input, which is a list of tokens."""
table, trees = _parse(tokenized, self.grammar)
# Check if the parse succeeded.
if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]):
raise ParseError('Parsing failed.')
parse = trees[(0, len(tokenized) - 1)][NT(self.start)]
return self._to_tree(revert_cnf(parse))

def _to_tree(self, rule_node):
"""Converts a RuleNode parse tree to a lark Tree."""
orig_rule = self.orig_rules[rule_node.rule.alias]
children = []
for i, child in enumerate(rule_node.children):
if isinstance(child, RuleNode):
children.append(self._to_tree(child))
else:
assert isinstance(child.s, Token)
children.append(child.s)
return Tree(orig_rule.origin, children, rule=orig_rule)


def print_parse(node, indent=0):
if isinstance(node, RuleNode):
print(' ' * (indent * 2) + str(node.rule.lhs))
for child in node.children:
print_parse(child, indent + 1)
else:
print(' ' * (indent * 2) + str(node.s))


def _parse(s, g):
"""Parses sentence 's' using CNF grammar 'g'."""
# The CYK table. Indexed with a 2-tuple: (start pos, end pos)
table = defaultdict(set)
# Top-level structure is similar to the CYK table. Each cell is a dict from
# rule name to the best (lightest) tree for that rule.
trees = defaultdict(dict)
# Populate base case with existing terminal production rules
for i, w in enumerate(s):
for terminal, rules in g.terminal_rules.items():
if terminal.match(w):
for rule in rules:
table[(i, i)].add(rule)
if (rule.lhs not in trees[(i, i)] or
rule.weight < trees[(i, i)][rule.lhs].weight):
trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)

# Iterate over lengths of sub-sentences
for l in xrange(2, len(s) + 1):
# Iterate over sub-sentences with the given length
for i in xrange(len(s) - l + 1):
# Choose partition of the sub-sentence in [1, l)
for p in xrange(i + 1, i + l):
span1 = (i, p - 1)
span2 = (p, i + l - 1)
for r1, r2 in itertools.product(table[span1], table[span2]):
for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
table[(i, i + l - 1)].add(rule)
r1_tree = trees[span1][r1.lhs]
r2_tree = trees[span2][r2.lhs]
rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
if (rule.lhs not in trees[(i, i + l - 1)]
or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
return table, trees


# This section implements a context-free grammar converter to Chomsky normal form.
@@ -237,165 +204,158 @@ def _Parse(s, g):


class CnfWrapper(object):
"""CNF wrapper for grammar.
"""CNF wrapper for grammar.

Validates that the input grammar is CNF and provides helper data structures.
"""

def __init__(self, grammar):
super(CnfWrapper, self).__init__()
self.grammar = grammar
self.rules = grammar.rules
self.terminal_rules = defaultdict(list)
self.nonterminal_rules = defaultdict(list)
for r in self.rules:
# Validate that the grammar is CNF and populate auxiliary data structures.
assert isinstance(r.lhs, NT), r
assert len(r.rhs) in [1, 2], r
if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
self.terminal_rules[r.rhs[0]].append(r)
elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
self.nonterminal_rules[tuple(r.rhs)].append(r)
else:
assert False, r
def __eq__(self, other):
return self.grammar == other.grammar
def __repr__(self):
return self.grammar.__repr__()
def __init__(self, grammar):
super(CnfWrapper, self).__init__()
self.grammar = grammar
self.rules = grammar.rules
self.terminal_rules = defaultdict(list)
self.nonterminal_rules = defaultdict(list)
for r in self.rules:
# Validate that the grammar is CNF and populate auxiliary data structures.
assert isinstance(r.lhs, NT), r
assert len(r.rhs) in [1, 2], r
if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
self.terminal_rules[r.rhs[0]].append(r)
elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
self.nonterminal_rules[tuple(r.rhs)].append(r)
else:
assert False, r
def __eq__(self, other):
return self.grammar == other.grammar
def __repr__(self):
return repr(self.grammar)


class UnitSkipRule(Rule):
"""A rule that records NTs that were skipped during transformation."""

def __init__(self, lhs, rhs, skipped_rules, weight, alias):
super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
self.skipped_rules = skipped_rules

def __eq__(self, other):
return (super(UnitSkipRule, self).__eq__(other) and
isinstance(other, type(self)) and
self.skipped_rules == other.skipped_rules)


def BuildUnitSkipRule(unit_rule, target_rule):
skipped_rules = []
if isinstance(unit_rule, UnitSkipRule):
skipped_rules += unit_rule.skipped_rules
skipped_rules.append(target_rule)
if isinstance(target_rule, UnitSkipRule):
skipped_rules += target_rule.skipped_rules
return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)
"""A rule that records NTs that were skipped during transformation."""

def __init__(self, lhs, rhs, skipped_rules, weight, alias):
super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
self.skipped_rules = skipped_rules

def GetAnyNtUnitRule(g):
"""Returns a non-terminal unit rule from 'g', or None if there is none."""
for rule in g.rules:
if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
return rule
return None


def RemoveUnitRule(g, rule):
"""Removes 'rule' from 'g' without changing the language produced by 'g'."""
new_rules = [x for x in g.rules if x != rule]
refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
for ref in refs:
new_rules.append(BuildUnitSkipRule(rule, ref))
return Grammar(new_rules)


def Split(rule):
"""Splits a rule whose len(rhs) > 2 into shorter rules."""
# if len(rule.rhs) <= 2:
# return [rule]
rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
rule_name = '__SP_%s' % (rule_str) + '_%d'
new_rules = [Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)]
for i in xrange(1, len(rule.rhs) - 2):
new_rules.append(
Rule(NT(rule_name % i),
[rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split'))
new_rules.append(Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split'))
return new_rules


def Term(g):
"""Applies the TERM rule on 'g' (see top comment)."""
all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
new_rules.extend(v for k, v in t_rules.iteritems() if k in rule.rhs)
else:
new_rules.append(rule)
return Grammar(new_rules)
def __eq__(self, other):
return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules

__hash__ = Rule.__hash__


def build_unit_skiprule(unit_rule, target_rule):
skipped_rules = []
if isinstance(unit_rule, UnitSkipRule):
skipped_rules += unit_rule.skipped_rules
skipped_rules.append(target_rule)
if isinstance(target_rule, UnitSkipRule):
skipped_rules += target_rule.skipped_rules
return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)


def Bin(g):
"""Applies the BIN rule to 'g' (see top comment)."""
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 2:
new_rules.extend(Split(rule))
def get_any_nt_unit_rule(g):
"""Returns a non-terminal unit rule from 'g', or None if there is none."""
for rule in g.rules:
if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
return rule
return None


def _remove_unit_rule(g, rule):
"""Removes 'rule' from 'g' without changing the language produced by 'g'."""
new_rules = [x for x in g.rules if x != rule]
refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
new_rules += [build_unit_skiprule(rule, ref) for ref in refs]
return Grammar(new_rules)


def _split(rule):
"""Splits a rule whose len(rhs) > 2 into shorter rules."""
rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
rule_name = '__SP_%s' % (rule_str) + '_%d'
yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)
for i in xrange(1, len(rule.rhs) - 2):
yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')
yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')


def _term(g):
"""Applies the TERM rule on 'g' (see top comment)."""
all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs)
else:
new_rules.append(rule)
return Grammar(new_rules)


def _bin(g):
"""Applies the BIN rule to 'g' (see top comment)."""
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 2:
new_rules += _split(rule)
else:
new_rules.append(rule)
return Grammar(new_rules)


def _unit(g):
"""Applies the UNIT rule to 'g' (see top comment)."""
nt_unit_rule = get_any_nt_unit_rule(g)
while nt_unit_rule:
g = _remove_unit_rule(g, nt_unit_rule)
nt_unit_rule = get_any_nt_unit_rule(g)
return g


def to_cnf(g):
"""Creates a CNF grammar from a general context-free grammar 'g'."""
g = _unit(_bin(_term(g)))
return CnfWrapper(g)


def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias):
if not skipped_rules:
return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)
else:
new_rules.append(rule)
return Grammar(new_rules)


def Unit(g):
"""Applies the UNIT rule to 'g' (see top comment)."""
nt_unit_rule = GetAnyNtUnitRule(g)
while nt_unit_rule:
g = RemoveUnitRule(g, nt_unit_rule)
nt_unit_rule = GetAnyNtUnitRule(g)
return g


def ToCnf(g):
"""Creates a CNF grammar from a general context-free grammar 'g'."""
g = Unit(Bin(Term(g)))
return CnfWrapper(g)


def UnrollUnitSkipRule(lhs, orig_rhs, skipped_rules, children, weight, alias):
if not skipped_rules:
return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)
else:
weight = weight - skipped_rules[0].weight
return RuleNode(
Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [
UnrollUnitSkipRule(skipped_rules[0].lhs, orig_rhs,
skipped_rules[1:], children,
skipped_rules[0].weight, skipped_rules[0].alias)
], weight=weight)


def RevertCnf(node):
"""Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
if isinstance(node, T):
return node
# Reverts TERM rule.
if node.rule.lhs.s.startswith('__T_'):
return node.children[0]
else:
children = []
reverted_children = [RevertCnf(x) for x in node.children]
for child in reverted_children:
# Reverts BIN rule.
if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'):
children.extend(child.children)
else:
children.append(child)
# Reverts UNIT rule.
if isinstance(node.rule, UnitSkipRule):
return UnrollUnitSkipRule(node.rule.lhs, node.rule.rhs,
node.rule.skipped_rules, children,
node.rule.weight, node.rule.alias)
weight = weight - skipped_rules[0].weight
return RuleNode(
Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [
unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs,
skipped_rules[1:], children,
skipped_rules[0].weight, skipped_rules[0].alias)
], weight=weight)


def revert_cnf(node):
"""Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
if isinstance(node, T):
return node
# Reverts TERM rule.
if node.rule.lhs.s.startswith('__T_'):
return node.children[0]
else:
return RuleNode(node.rule, children)
children = []
for child in map(revert_cnf, node.children):
# Reverts BIN rule.
if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'):
children += child.children
else:
children.append(child)
# Reverts UNIT rule.
if isinstance(node.rule, UnitSkipRule):
return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs,
node.rule.skipped_rules, children,
node.rule.weight, node.rule.alias)
else:
return RuleNode(node.rule, children)

+ 2
- 2
lark/tree.py View File

@@ -22,14 +22,14 @@ class Tree(object):

def _pretty(self, level, indent_str):
if len(self.children) == 1 and not isinstance(self.children[0], Tree):
return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n']
return [ indent_str*level, self._pretty_label(), '\t', '%s' % (self.children[0],), '\n']

l = [ indent_str*level, self._pretty_label(), '\n' ]
for n in self.children:
if isinstance(n, Tree):
l += n._pretty(level+1, indent_str)
else:
l += [ indent_str*(level+1), '%s' % n, '\n' ]
l += [ indent_str*(level+1), '%s' % (n,), '\n' ]

return l



+ 1
- 0
tests/__main__.py View File

@@ -16,6 +16,7 @@ except ImportError:
from .test_parser import (
TestLalrStandard,
TestEarleyStandard,
TestCykStandard,
TestLalrContextual,
TestEarleyScanless,
TestEarleyDynamic,


+ 11
- 1
tests/test_parser.py View File

@@ -382,6 +382,7 @@ def _make_parser_test(LEXER, PARSER):
g.parse(u'\xa3\u0101\u00a3\u0203\n')


@unittest.skipIf(PARSER == 'cyk', "Takes forever")
def test_stack_for_ebnf(self):
"""Verify that stack depth isn't an issue for EBNF grammars"""
g = _Lark(r"""start: a+
@@ -455,6 +456,7 @@ def _make_parser_test(LEXER, PARSER):



@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_empty_expand1_list(self):
g = _Lark(r"""start: list
?list: item*
@@ -473,6 +475,7 @@ def _make_parser_test(LEXER, PARSER):
[list] = r.children
self.assertSequenceEqual([item.data for item in list.children], ())

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_empty_expand1_list_2(self):
g = _Lark(r"""start: list
?list: item* "!"?
@@ -492,6 +495,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertSequenceEqual([item.data for item in list.children], ())


@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_empty_flatten_list(self):
g = _Lark(r"""start: list
list: | item "," list
@@ -645,6 +649,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
self.assertEqual(x.children[0].type, "A")

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_maybe(self):
g = _Lark("""start: ["a"] """)
x = g.parse('a')
@@ -702,6 +707,7 @@ def _make_parser_test(LEXER, PARSER):
# B: A
# """)

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_empty(self):
# Fails an Earley implementation without special handling for empty rules,
# or re-processing of already completed rules.
@@ -732,6 +738,8 @@ def _make_parser_test(LEXER, PARSER):

def test_float_without_lexer(self):
expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
if PARSER == 'cyk':
expected_error = ParseError

g = _Lark("""start: ["+"|"-"] float
float: digit* "." digit+ exp?
@@ -796,6 +804,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, ['a', 'A'])


@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self):
g = """!start: [["A"]]
"""
@@ -1001,6 +1010,7 @@ def _make_parser_test(LEXER, PARSER):


@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions")
@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_ignore(self):
grammar = r"""
COMMENT: /(!|(\/\/))[^\n]*/
@@ -1026,7 +1036,6 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, [])



@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions")
def test_regex_escaping(self):
g = _Lark("start: /[ab]/")
@@ -1075,6 +1084,7 @@ def _make_parser_test(LEXER, PARSER):
# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
('standard', 'earley'),
('standard', 'cyk'),
('dynamic', 'earley'),
('standard', 'lalr'),
('contextual', 'lalr'),


Loading…
Cancel
Save