
Added tests and lots of fixes and refactoring

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 years ago
parent commit cde2b677bb
13 changed files with 747 additions and 290 deletions
  1. +64 -0     examples/json_parser.py
  2. +7 -0      lark/common.py
  3. +5 -9      lark/grammar_analysis.py
  4. +7 -105    lark/lark.py
  5. +177 -175  lark/load_grammar.py
  6. +76 -0     lark/parse_tree_builder.py
  7. +1 -1      lark/parser.py
  8. +31 -0     lark/parser_frontends.py
  9. +0 -0      lark/tests/__init__.py
  10. +14 -0    lark/tests/__main__.py
  11. +326 -0   lark/tests/test_parser.py
  12. +26 -0    lark/tests/test_trees.py
  13. +13 -0    lark/tree.py

+64 -0  examples/json_parser.py

@@ -0,0 +1,64 @@
import sys
from lark.lark import Lark, inline_args
from lark.tree import Transformer

json_grammar = r"""
?start: value

?value: object
      | array
      | string
      | number
      | "true" -> true
      | "false" -> false
      | "null" -> null

array  : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair   : string ":" value

number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
string : /".*?(?<!\\)"/

WS.ignore.newline: /[ \t\n]+/
"""

class TreeToJson(Transformer):
    @inline_args
    def string(self, s):
        return s[1:-1]

    array = list
    pair = tuple
    object = dict
    number = inline_args(float)

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse

def test():
    test_json = '''
        {
            "empty_object" : {},
            "empty_array"  : [],
            "booleans"     : { "YES" : true, "NO" : false },
            "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
            "strings"      : [ "This", [ "And" , "That" ] ],
            "nothing"      : null
        }
    '''

    j = parse(test_json)
    print j
    import json
    assert j == json.loads(test_json)

if __name__ == '__main__':
    test()
    with open(sys.argv[1]) as f:
        print parse(f.read())
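
Because the transformer is handed to Lark directly, it runs during LALR parsing and parse() returns plain Python objects instead of a Tree. A minimal sketch of the post-hoc alternative (hypothetical usage, not part of this commit): build the parser without a transformer, then apply TreeToJson to the resulting tree.

    # Hypothetical usage sketch: parse to a Tree first, transform afterwards.
    tree_parser = Lark(json_grammar)   # no transformer; defaults to the Earley engine
    tree = tree_parser.parse('{"the_answer": [42, true]}')
    assert TreeToJson().transform(tree) == {'the_answer': [42.0, True]}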


+7 -0  lark/common.py

@@ -0,0 +1,7 @@

class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'
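
The convention this helper encodes: token (terminal) names are all-uppercase, rule names are lowercase, and $-prefixed symbols such as $root and $end are internal markers that the analyzer treats like terminals. For example:

    assert is_terminal('NUMBER')         # uppercase name -> token
    assert is_terminal('$end')           # internal $-symbol
    assert not is_terminal('expansion')  # lowercase name -> rule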


+5 -9  lark/grammar_analysis.py

@@ -1,14 +1,9 @@
 from collections import defaultdict, deque
 from utils import classify, classify_bool, bfs, fzset
+from common import GrammarError, is_terminal

 ACTION_SHIFT = 0

-class GrammarError(Exception):
-    pass
-
-def is_terminal(sym):
-    return sym.isupper() or sym[0] == '$'
-
 class Rule(object):
     """
         origin : a symbol
@@ -61,9 +56,10 @@ def update_set(set1, set2):
     return set1 != copy

 class GrammarAnalyzer(object):
-    def __init__(self, rule_tuples):
+    def __init__(self, rule_tuples, start_symbol):
+        self.start_symbol = start_symbol
         rule_tuples = list(rule_tuples)
-        rule_tuples.append(('$root', ['start', '$end']))
+        rule_tuples.append(('$root', [start_symbol, '$end']))
         rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

         self.rules = set()
@@ -78,7 +74,7 @@ class GrammarAnalyzer(object):
             if not (is_terminal(sym) or sym in self.rules_by_origin):
                 raise GrammarError("Using an undefined rule: %s" % sym)

-        self.init_state = self.expand_rule('start')
+        self.init_state = self.expand_rule(start_symbol)

     def expand_rule(self, rule):
         "Returns all init_ptrs accessible by rule (recursive)"


+7 -105  lark/lark.py

@@ -7,8 +7,8 @@ from .load_grammar import load_grammar
 from .tree import Tree, Transformer

 from .lexer import Lexer
-from .grammar_analysis import GrammarAnalyzer, is_terminal
-from . import parser, earley
+from .parse_tree_builder import ParseTreeBuilder
+from .parser_frontends import ENGINE_DICT

 class LarkOptions(object):
     """Specifies the options for Lark
@@ -23,6 +23,7 @@ class LarkOptions(object):
         keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True)
         cache_grammar - Cache the Lark grammar (Default: False)
         postlex - Lexer post-processing (Default: None)
+        start - The start symbol (Default: start)
     """
     __doc__ += OPTIONS_DOC
     def __init__(self, options_dict):
@@ -36,6 +37,7 @@ class LarkOptions(object):
         self.postlex = o.pop('postlex', None)
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
+        self.start = o.pop('start', 'start')

         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
@@ -47,71 +49,8 @@ class LarkOptions(object):
             raise ValueError("Unknown options: %s" % o.keys())


-class Callback(object):
-    pass
-
-
-class RuleTreeToText(Transformer):
-    def expansions(self, x):
-        return x
-    def expansion(self, symbols):
-        return [sym.value for sym in symbols], None
-    def alias(self, ((expansion, _alias), alias)):
-        assert _alias is None, (alias, expansion, '-', _alias)
-        return expansion, alias.value
-
-
-
-def create_rule_handler(expansion, usermethod):
-    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
-                  if not (is_terminal(sym) and sym.startswith('_'))]
-
-    def _build_ast(match):
-        children = []
-        for i, to_expand in to_include:
-            if to_expand:
-                children += match[i].children
-            else:
-                children.append(match[i])
-
-        return usermethod(children)
-    return _build_ast
-
-def create_expand1_tree_builder_function(tree_builder):
-    def f(children):
-        if len(children) == 1:
-            return children[0]
-        else:
-            return tree_builder(children)
-    return f
-
-class LALR:
-    def build_parser(self, rules, callback):
-        ga = GrammarAnalyzer(rules)
-        ga.analyze()
-        return parser.Parser(ga, callback)
-
-class Earley:
-    @staticmethod
-    def _process_expansion(x):
-        return [{'literal': s} if is_terminal(s) else s for s in x]
-
-    def build_parser(self, rules, callback):
-        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
-        return EarleyParser(earley.Parser(rules, 'start'))
-
-class EarleyParser:
-    def __init__(self, parser):
-        self.parser = parser
-
-    def parse(self, text):
-        res = self.parser.parse(text)
-        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        return res[0]
-
-
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
-
 class Lark:
     def __init__(self, grammar, **options):
         """
@@ -147,6 +86,7 @@ class Lark:
         self.lexer = self._build_lexer()
         if not self.options.only_lex:
             self.parser_engine = ENGINE_DICT[self.options.parser]()
+            self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
             self.parser = self._build_parser()

     def _build_lexer(self):
@@ -160,50 +100,12 @@ class Lark:


     def _build_parser(self):
-        transformer = self.options.transformer
-        callback = Callback()
-        rules = []
-        rule_tree_to_text = RuleTreeToText()
-        for origin, tree in self.rules.items():
-            for expansion, alias in rule_tree_to_text.transform(tree):
-                if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)
-
-                expand1 = origin.startswith('?')
-                _origin = origin.lstrip('?*')
-                if alias:
-                    alias = alias.lstrip('*')
-                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
-
-                try:
-                    f = transformer._get_func(alias or _origin)
-                    # f = getattr(transformer, alias or _origin)
-                except AttributeError:
-                    if alias:
-                        f = self._create_tree_builder_function(alias)
-                    else:
-                        f = self._create_tree_builder_function(_origin)
-                if expand1:
-                    f = create_expand1_tree_builder_function(f)
-
-                alias_handler = create_rule_handler(expansion, f)
-
-                assert not hasattr(callback, _alias)
-                setattr(callback, _alias, alias_handler)
-
-                rules.append((_origin, expansion, _alias))
-
-        return self.parser_engine.build_parser(rules, callback)
+        rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
+        return self.parser_engine.build_parser(rules, callback, self.options.start)


     __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

-    def _create_tree_builder_function(self, name):
-        tree_class = self.options.tree_class
-        def f(children):
-            return tree_class(name, children)
-        return f
-
     def lex(self, text):
         stream = self.lexer.lex(text)
         if self.options.postlex:
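
The user-visible payoff of the new start option, taken from test_start in the new test suite below: a grammar no longer needs a rule literally named "start".

    g = Lark("""a: "a" a? """, parser='lalr', start='a')
    g.parse('aaa')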


+177 -175  lark/load_grammar.py

@@ -1,16 +1,18 @@
 import re
 import codecs

-from lexer import Lexer, Token
-from grammar_analysis import GrammarAnalyzer
-from parser import Parser
+from .lexer import Lexer, Token

-from tree import Tree as T, Transformer, InlineTransformer, Visitor
+from .parse_tree_builder import ParseTreeBuilder
+from .parser_frontends import LALR
+from .common import is_terminal, GrammarError
+
+from .tree import Tree as T, Transformer, InlineTransformer, Visitor

 unicode_escape = codecs.getdecoder('unicode_escape')

 _TOKEN_NAMES = {
-    ':' : 'COLON',
+    ':' : '_COLON',
     ',' : 'COMMA',
     ';' : 'SEMICOLON',
     '+' : 'PLUS',
@@ -26,7 +28,7 @@ _TOKEN_NAMES = {
     '<' : 'LESSTHAN',
     '>' : 'MORETHAN',
     '=' : 'EQUAL',
-    '.' : 'DOT',
+    '.' : '_DOT',
     '%' : 'PERCENT',
     '`' : 'BACKQUOTE',
     '^' : 'CIRCUMFLEX',
@@ -34,8 +36,8 @@ _TOKEN_NAMES = {
     '\'' : 'QUOTE',
     '~' : 'TILDE',
     '@' : 'AT',
-    '(' : 'LPAR',
-    ')' : 'RPAR',
+    '(' : '_LPAR',
+    ')' : '_RPAR',
     '{' : 'LBRACE',
     '}' : 'RBRACE',
     '[' : 'LSQB',
@@ -44,151 +46,58 @@ _TOKEN_NAMES = {

 # Grammar Parser
 TOKENS = {
-    'LPAR': '\(',
-    'RPAR': '\)',
-    'LBRA': '\[',
-    'RBRA': '\]',
+    '_LPAR': '\(',
+    '_RPAR': '\)',
+    '_LBRA': '\[',
+    '_RBRA': '\]',
     'OP': '[+*?]',
-    'COLON': ':',
-    'OR': '\|',
-    'DOT': '\.',
+    '_COLON': ':',
+    '_OR': '\|',
+    '_DOT': '\.',
     'RULE': '[_?*]?[a-z][_a-z0-9]*',
     'TOKEN': '_?[A-Z][_A-Z0-9]*',
     'STRING': r'".*?[^\\]"',
     'REGEXP': r"/(.|\n)*?[^\\]/",
-    'NL': r'(\r?\n)+\s*',
+    '_NL': r'(\r?\n)+\s*',
     'WS': r'[ \t]+',
     'COMMENT': r'//[^\n]*\n',
-    'TO': '->'
+    '_TO': '->'
 }

-RULES = [
-    ('start', ['list']),
-    ('list', ['item']),
-    ('list', ['list', 'item']),
-    ('item', ['rule']),
-    ('item', ['token']),
-    ('item', ['NL']),
-
-    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
-    ('expansions', ['expansion']),
-    ('expansions', ['expansions', 'OR', 'expansion']),
-    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),
-
-    ('expansion', ['_expansion']),
-    ('expansion', ['_expansion', 'TO', 'RULE']),
+RULES = {
+    'start': ['list'],
+    'list': ['item', 'list item'],
+    'item': ['rule', 'token', '_NL'],

-    ('_expansion', []),
-    ('_expansion', ['_expansion', 'expr']),
+    'rule': ['RULE _COLON expansions _NL'],
+    'expansions': ['expansion',
+                   'expansions _OR expansion',
+                   'expansions _NL _OR expansion'],

-    ('expr', ['atom']),
-    ('expr', ['atom', 'OP']),
+    'expansion': ['_expansion',
+                  '_expansion _TO RULE'],

-    ('atom', ['LPAR', 'expansions', 'RPAR']),
-    ('atom', ['maybe']),
+    '_expansion': ['', '_expansion expr'],

-    ('atom', ['RULE']),
-    ('atom', ['TOKEN']),
-    ('atom', ['anontoken']),
+    '?expr': ['atom',
+              'atom OP'],

-    ('anontoken', ['tokenvalue']),
+    '?atom': ['_LPAR expansions _RPAR',
+              'maybe',
+              'RULE',
+              'TOKEN',
+              'anontoken'],

-    ('maybe', ['LBRA', 'expansions', 'RBRA']),
+    'anontoken': ['tokenvalue'],

-    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
-    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
-    ('tokenvalue', ['REGEXP']),
-    ('tokenvalue', ['STRING']),
-    ('tokenmods', ['DOT', 'RULE']),
-    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
-]
-
-class SaveDefinitions(object):
-    def __init__(self):
-        self.rules = {}
-        self.token_set = set()
-        self.tokens = []
-        self.i = 0
+    'maybe': ['_LBRA expansions _RBRA'],

+    'token': ['TOKEN _COLON tokenvalue _NL',
+              'TOKEN tokenmods _COLON tokenvalue _NL'],

-    def atom__3(self, _1, value, _2):
-        return value
-    def atom__1(self, value):
-        return value
-
-    def expr__1(self, expr):
-        return expr
-    def expr(self, *x):
-        return T('expr', x)
-
-    def expansion__1(self, expansion):
-        return expansion
-    def expansion__3(self, expansion, _, alias):
-        return T('alias', [expansion, alias])
-    def _expansion(self, *x):
-        return T('expansion', x)
-
-    def expansions(self, *x):
-        items = [i for i in x if isinstance(i, T)]
-        return T('expansions', items)
-
-    def maybe(self, _1, expr, _2):
-        return T('expr', [expr, Token('OP', '?', -1)])
-
-    def rule(self, name, _1, expansion, _2):
-        name = name.value
-        if name in self.rules:
-            raise ValueError("Rule '%s' defined more than once" % name)
-
-        self.rules[name] = expansion
-
-    def token(self, *x):
-        name = x[0].value
-        if name in self.token_set:
-            raise ValueError("Token '%s' defined more than once" % name)
-        self.token_set.add(name)
-
-        if len(x) == 4:
-            self.tokens.append((name, x[2], []))
-        else:
-            self.tokens.append((name, x[3], x[1].children))
-
-    def tokenvalue(self, tokenvalue):
-        return tokenvalue
-
-    def anontoken(self, token):
-        if token.type == 'STRING':
-            value = token.value[1:-1]
-            try:
-                token_name = _TOKEN_NAMES[value]
-            except KeyError:
-                if value.isalnum() and value[0].isalpha():
-                    token_name = value.upper()
-                else:
-                    token_name = 'ANONSTR_%d' % self.i
-                    self.i += 1
-                token_name = '__' + token_name
-
-        elif token.type == 'REGEXP':
-            token_name = 'ANONRE_%d' % self.i
-            self.i += 1
-        else:
-            assert False, x
-
-        if token_name not in self.token_set:
-            self.token_set.add(token_name)
-            self.tokens.append((token_name, token, []))
-
-        return Token('TOKEN', token_name, -1)
-
-    def tokenmods__2(self, _, rule):
-        return T('tokenmods', [rule.value])
-    def tokenmods__3(self, tokenmods, _, rule):
-        return T('tokenmods', tokenmods.children + [rule.value])
-
-    def start(self, *x): pass
-    def list(self, *x): pass
-    def item(self, *x): pass
+    '?tokenvalue': ['REGEXP', 'STRING'],
+    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
+}


 class EBNF_to_BNF(InlineTransformer):
@@ -281,46 +190,110 @@ def dict_update_safe(d1, d2):
         d1[k] = v


-def generate_aliases():
-    sd = SaveDefinitions()
-    for name, expansion in RULES:
-        try:
-            f = getattr(sd, "%s__%s" % (name, len(expansion)))
-        except AttributeError:
-            f = getattr(sd, name)
-        yield name, expansion, f.__name__
+class RuleTreeToText(Transformer):
+    def expansions(self, x):
+        return x
+    def expansion(self, symbols):
+        return [sym.value for sym in symbols], None
+    def alias(self, ((expansion, _alias), alias)):
+        assert _alias is None, (alias, expansion, '-', _alias)
+        return expansion, alias.value
+
+
+class SimplifyTree(InlineTransformer):
+    def maybe(self, expr):
+        return T('expr', [expr, Token('OP', '?', -1)])
+
+    def tokenmods(self, *args):
+        if len(args) == 1:
+            return list(args)
+        tokenmods, value = args
+        return tokenmods + [value]
+
+def get_tokens(tree, token_set):
+    tokens = []
+    for t in tree.find_data('token'):
+        x = t.children
+        name = x[0].value
+        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
+        if name in token_set:
+            raise ValueError("Token '%s' defined more than once" % name)
+        token_set.add(name)
+
+        if len(x) == 2:
+            yield name, x[1], []
+        else:
+            assert len(x) == 3
+            yield name, x[2], x[1]
+
+class ExtractAnonTokens(InlineTransformer):
+    def __init__(self, tokens, token_set):
+        self.tokens = tokens
+        self.token_set = token_set
+        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
+
+    def anontoken(self, token):
+        if token.type == 'STRING':
+            value = token.value[1:-1]
+            try:
+                # If already defined, use the user-defined token name
+                token_name = self.token_reverse[value]
+            except KeyError:
+                # Try to assign an indicative anon-token name, otherwise use a numbered name
+                try:
+                    token_name = _TOKEN_NAMES[value]
+                except KeyError:
+                    if value.isalnum() and value[0].isalpha():
+                        token_name = value.upper()
+                    else:
+                        token_name = 'ANONSTR_%d' % self.i
+                        self.i += 1
+                    token_name = '__' + token_name
+
+        elif token.type == 'REGEXP':
+            token_name = 'ANONRE_%d' % self.i
+            self.i += 1
+        else:
+            assert False, x
+
+        if token_name not in self.token_set:
+            self.token_set.add(token_name)
+            self.tokens.append((token_name, token, []))
+
+        return Token('TOKEN', token_name, -1)


-def inline_args(f):
-    def _f(self, args):
-        return f(*args)
-    return _f
-
 class GrammarLoader:
     def __init__(self):
-        self.rules = list(generate_aliases())
-        self.ga = GrammarAnalyzer(self.rules)
-        self.ga.analyze()
         self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
-        self.simplify_rule = SimplifyRule_Visitor()

-    def _generate_parser_callbacks(self, callbacks):
-        d = {alias: inline_args(getattr(callbacks, alias))
-             for _n, _x, alias in self.rules}
-        return type('Callback', (), d)()
+        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
+        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
+        self.parser = LALR().build_parser(rules, callback, 'start')
+
+        self.simplify_tree = SimplifyTree()
+        self.simplify_rule = SimplifyRule_Visitor()
+        self.rule_tree_to_text = RuleTreeToText()

     def load_grammar(self, grammar_text):
-        sd = SaveDefinitions()
-        c = self._generate_parser_callbacks(sd)
-
-        p = Parser(self.ga, c)
-        p.parse( list(self.lexer.lex(grammar_text+"\n")) )
+        token_stream = list(self.lexer.lex(grammar_text+"\n"))
+        tree = self.simplify_tree.transform( self.parser.parse(token_stream) )
+
+        # =================
+        #  Process Tokens
+        # =================
+
+        token_set = set()
+        tokens = list(get_tokens(tree, token_set))
+        extract_anon = ExtractAnonTokens(tokens, token_set)
+        tree = extract_anon.transform(tree) # Adds to tokens

         # Tokens
         token_ref = {}
         re_tokens = []
         str_tokens = []
-        for name, token, flags in sd.tokens:
+        for name, token, flags in tokens:
             value = token.value[1:-1]
             if '\u' in value:
                 # XXX for now, you can't mix unicode escaping and unicode characters at the same token
@@ -343,43 +316,70 @@ class GrammarLoader:
         re_tokens.sort(key=lambda x:len(x[1]), reverse=True)
         tokens = str_tokens + re_tokens # Order is important!

-        # Rules
+        # =================
+        #  Process Rules
+        # =================
+
         ebnf_to_bnf = EBNF_to_BNF()

-        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
+        rules = {}
+        for rule in tree.find_data('rule'):
+            name, ebnf_tree = rule.children
+            name = name.value
+            if name in rules:
+                raise ValueError("Rule '%s' defined more than once" % name)
+
+            rules[name] = ebnf_to_bnf.transform(ebnf_tree)
+
         dict_update_safe(rules, ebnf_to_bnf.new_rules)

         for r in rules.values():
             self.simplify_rule.visit(r)

+        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}
+
+        # ====================
+        #  Verify correctness
+        # ====================
+        used_symbols = {symbol for expansions in rules.values()
+                        for expansion, _alias in expansions
+                        for symbol in expansion}
+        rule_set = {r.lstrip('?') for r in rules}
+        for sym in used_symbols:
+            if is_terminal(sym):
+                if sym not in token_set:
+                    raise GrammarError("Token '%s' used but not defined" % sym)
+            else:
+                if sym not in rule_set:
+                    raise GrammarError("Rule '%s' used but not defined" % sym)
+
         return tokens, rules

 load_grammar = GrammarLoader().load_grammar



 def test():
     g = """
     start: add

-    # Rules
+    // Rules
     add: mul
        | add _add_sym mul

-    mul: _atom
-       | mul _add_mul _atom
+    mul: [mul _add_mul] _atom

-    neg: "-" _atom
-
-    _atom: neg
-         | number
+    _atom: "-" _atom -> neg
+         | NUMBER
+         | "(" add ")"

-    # Tokens
-    number: /[\d.]+/
+    // Tokens
+    NUMBER: /[\d.]+/
     _add_sym: "+" | "-"
     _add_mul: "*" | "/"

-    WS.ignore: /\s+/
+    WS.ignore.newline: /\s+/
     """

     g2 = """
@@ -389,7 +389,9 @@ def test():
     c: "c"
     d: "+" | "-"
     """
-    load_grammar(g)
+    # print load_grammar(g)
+    print GrammarLoader().load_grammar2(g)


 if __name__ == '__main__':
     test()

+76 -0  lark/parse_tree_builder.py

@@ -0,0 +1,76 @@
from .grammar_analysis import is_terminal

class Callback(object):
    pass


def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])

        return usermethod(children)
    return _build_ast


class ParseTreeBuilder:
    def __init__(self, tree_class):
        self.tree_class = tree_class

    def _create_tree_builder_function(self, name):
        tree_class = self.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def create_tree_builder(self, rules, transformer):
        callback = Callback()
        new_rules = []
        for origin, expansions in rules.items():
            expand1 = origin.startswith('?')
            _origin = origin.lstrip('?*')

            for expansion, alias in expansions:
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)

                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                try:
                    f = transformer._get_func(alias or _origin)
                except AttributeError:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)
                if expand1:
                    f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)

                assert not hasattr(callback, _alias)
                setattr(callback, _alias, alias_handler)

                new_rules.append(( _origin, expansion, _alias ))

        return new_rules, callback
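
The intended calling pattern is the one GrammarLoader.__init__ now uses in load_grammar.py above: rules maps each origin to a list of (expansion, alias) pairs, and the returned (new_rules, callback) pair feeds straight into a parser frontend.

    # As used in load_grammar.py above:
    d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
    rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
    parser = LALR().build_parser(rules, callback, 'start')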



+1 -1  lark/parser.py

@@ -34,7 +34,7 @@ class Parser(object):

res = self.callbacks[rule]([x[0] for x in s])

if rule.origin == 'start':
if rule.origin == self.ga.start_symbol and len(stack) == 1:
return res

_action, new_state = get_action(rule.origin)


+31 -0  lark/parser_frontends.py

@@ -0,0 +1,31 @@
from .grammar_analysis import GrammarAnalyzer
from common import is_terminal
from . import parser, earley

class LALR:
    def build_parser(self, rules, callback, start):
        ga = GrammarAnalyzer(rules, start)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback, start):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, start))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
        return res[0]


ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
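
Lark._build_parser selects an engine through this dict; roughly (paraphrasing lark.py above):

    engine = ENGINE_DICT[options.parser]()   # 'lalr' or 'earley'
    parser = engine.build_parser(rules, callback, options.start)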


+0 -0  lark/tests/__init__.py


+14 -0  lark/tests/__main__.py

@@ -0,0 +1,14 @@
from __future__ import absolute_import, print_function

import unittest
import logging

from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalr
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
unittest.main()
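
With this file in place the whole suite runs as a module: python -m lark.tests.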

+326 -0  lark/tests/test_parser.py

@@ -0,0 +1,326 @@
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.grammar_analysis import GrammarError
from lark.parser import ParseError

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()


class TestLalr(unittest.TestCase):
    def test_basic1(self):
        g = Lark("""start: a+ b a* "b" a*
                    b: "b"
                    a: "a"
                 """, parser='lalr')
        r = g.parse('aaabaab')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
        r = g.parse('aaabaaba')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

        self.assertRaises(ParseError, g.parse, 'aaabaa')

    def test_basic2(self):
        # Multiple parsers and colliding tokens
        g = Lark("""start: B A
                    B: "12"
                    A: "1" """)
        g2 = Lark("""start: B A
                     B: "12"
                     A: "2" """)
        x = g.parse('121')
        assert x.data == 'start' and x.children == ['12', '1'], x
        x = g2.parse('122')
        assert x.data == 'start' and x.children == ['12', '2'], x

    def test_basic3(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    @unittest.skipIf(cStringIO is None, "cStringIO not available")
    def test_stringio_bytes(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_stringio_unicode(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_unicode(self):
        g = Lark(u"""start: UNIA UNIB UNIA
                     UNIA: /\xa3/
                     UNIB: /\u0101/
                  """)
        g.parse(u'\xa3\u0101\u00a3')

    def test_unicode2(self):
        g = Lark(r"""start: UNIA UNIB UNIA UNIC
                     UNIA: /\xa3/
                     UNIB: "a\u0101b\ "
                     UNIC: /a?\u0101c\n/
                  """)
        g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')


    def test_recurse_expansion(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for expansion."""
        g = Lark(r"""start: a | start a
                     a : "a" """)

        # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
        # STree data structures, which uses recursion).
        g.parse("a" * (sys.getrecursionlimit() // 4))

    def test_expand1_lists_with_one_item(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("a")

        # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_expand1_lists_with_one_item_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("a!")

        # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_dont_expand1_lists_with_multiple_items(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    def test_dont_expand1_lists_with_multiple_items_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa!")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))



    def test_empty_expand1_list(self):
        g = Lark(r"""start: list
                     ?list: item*
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    def test_empty_expand1_list_2(self):
        g = Lark(r"""start: list
                     ?list: item* "!"?
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())


    def test_empty_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # Because 'list' is a flatten rule it's top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_single_item_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,")

        # Because 'list' is a flatten rule it's top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item',))

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_multiple_item_flatten_list(self):
        g = Lark(r"""start: list
                     #list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,a,")

        # Because 'list' is a flatten rule it's top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_recurse_flatten(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
        g = Lark(r"""start: a | start a
                     a : A
                     A : "a" """)

        # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
        # STree data structures, which uses recursion).
        g.parse("a" * (sys.getrecursionlimit() // 4))

    def test_token_collision(self):
        g = Lark("""start: "Hello" NAME
                    NAME: /\w+/
                    WS.ignore: /\s+/
                 """, parser='lalr')
        x = g.parse('Hello World')
        self.assertSequenceEqual(x.children, ['World'])

    def test_undefined_rule(self):
        self.assertRaises(GrammarError, Lark, """start: a""", parser='lalr')

    def test_undefined_token(self):
        self.assertRaises(GrammarError, Lark, """start: A""", parser='lalr')

    def test_rule_collision(self):
        g = Lark("""start: "a"+ "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')

    def test_rule_collision2(self):
        g = Lark("""start: "a"* "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')
        x = g.parse('b')

    def test_regex_embed(self):
        g = Lark("""start: A B C
                    A: /a/
                    B: /${A}b/
                    C: /${B}c/
                 """, parser='lalr')
        x = g.parse('aababc')

    def test_token_not_anon(self):
        """Tests that "a" is matched as A, rather than an anonymous token.

        That means that "a" is not filtered out, despite being an 'immediate string'.
        Whether or not this is the intuitive behavior, I'm not sure yet.

        -Erez
        """

        g = Lark("""start: "a"
                    A: "a" """, parser='lalr')
        x = g.parse('a')
        self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
        self.assertEqual(x.children[0].type, "A")

    def test_maybe(self):
        g = Lark("""start: ["a"] """, parser='lalr')
        x = g.parse('a')
        x = g.parse('')

    def test_start(self):
        g = Lark("""a: "a" a? """, parser='lalr', start='a')
        x = g.parse('a')
        x = g.parse('aa')
        x = g.parse('aaa')

if __name__ == '__main__':
    unittest.main()


+26 -0  lark/tests/test_trees.py

@@ -0,0 +1,26 @@
from __future__ import absolute_import

from unittest import TestCase
import logging
import copy
import pickle

from lark.tree import Tree


class TestTrees(TestCase):
    def setUp(self):
        self.tree1 = Tree('a', [Tree(x, y) for x, y in zip('bcd', 'xyz')])

    def test_deepcopy(self):
        assert self.tree1 == copy.deepcopy(self.tree1)

    def test_pickle(self):
        s = copy.deepcopy(self.tree1)
        data = pickle.dumps(s)
        assert pickle.loads(data) == s


if __name__ == '__main__':
    unittest.main()


+13 -0  lark/tree.py

@@ -33,6 +33,19 @@ class Tree(object):
     def __eq__(self, other):
         return self.data == other.data and self.children == other.children

+    def find_pred(self, pred):
+        if pred(self):
+            yield self
+        else:
+            for i, c in enumerate(self.children):
+                if isinstance(c, Tree):
+                    for t in c.find_pred(pred):
+                        yield t
+
+    def find_data(self, data):
+        return self.find_pred(lambda t: t.data == data)
+

     # def find_path(self, pred):
     #     if pred(self):
     #         yield []
