@@ -4,6 +4,7 @@
/lark_parser.egg-info/**
tags
.vscode
.idea
.ropeproject
.cache
/dist
@@ -72,7 +72,7 @@ Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like ba

See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
See more [examples here](https://github.com/lark-parser/lark/tree/master/examples)
@@ -95,7 +95,7 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
- Extensive test suite [](https://codecov.io/gh/erezsh/lark)
- And much more!
See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/Features)
See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)
### Comparison to other libraries
@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
from .lexer import Token
from .lark import Lark
__version__ = "0.7.2"
__version__ = "0.7.4"
@@ -8,7 +8,6 @@ from .exceptions import UnexpectedCharacters, LexError
###{standalone
class Pattern(Serialize):
    __serialize_fields__ = 'value', 'flags'
    def __init__(self, value, flags=()):
        self.value = value
@@ -41,6 +40,8 @@ class Pattern(Serialize):
class PatternStr(Pattern):
    __serialize_fields__ = 'value', 'flags'
    type = "str"
    def to_regexp(self):
@@ -52,6 +53,8 @@ class PatternStr(Pattern):
    max_width = min_width
class PatternRE(Pattern):
    __serialize_fields__ = 'value', 'flags', '_width'
    type = "re"
    def to_regexp(self):
@@ -98,7 +101,7 @@ class Token(Str):
        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = Str(value)
        self.value = value
        self.line = line
        self.column = column
        self.end_line = end_line
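Note: a tiny, untested sketch of what this one-line change means in practice — the token's .value now keeps whatever object was passed in, instead of being coerced to Str:

    from lark.lexer import Token

    tok = Token('INT', 5)
    print(tok)              # '5' -- the Token itself is still a string subclass
    print(type(tok.value))  # previously a Str copy; after this change, the original int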
@@ -265,13 +268,14 @@ def build_mres(terminals, match_whole=False):
    return _build_mres(terminals, len(terminals), match_whole)
def _regexp_has_newline(r):
    """Expressions that may indicate newlines in a regexp:
    r"""Expressions that may indicate newlines in a regexp:
        - newlines (\n)
        - escaped newline (\\n)
        - anything but ([^...])
        - any-char (.) when the flag (?s) exists
        - spaces (\s)
    """
    return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
    return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
class Lexer(object):
    """Lexer interface
@@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list
from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
from .tree import Tree, SlottedTree as ST
@@ -351,7 +351,10 @@ def _fix_escaping(s):
    for n in i:
        w += n
        if n == '\\':
            n2 = next(i)
            try:
                n2 = next(i)
            except StopIteration:
                raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)
            if n2 == '\\':
                w += '\\\\'
            elif n2 not in 'uxnftr':
@@ -451,9 +454,9 @@ class PrepareSymbols(Transformer_InPlace):
        if isinstance(v, Tree):
            return v
        elif v.type == 'RULE':
            return NonTerminal(v.value)
            return NonTerminal(Str(v.value))
        elif v.type == 'TERMINAL':
            return Terminal(v.value, filter_out=v.startswith('_'))
            return Terminal(Str(v.value), filter_out=v.startswith('_'))
        assert False
def _choice_of_rules(rules):
@@ -511,12 +514,12 @@ class Grammar:
        simplify_rule = SimplifyRule_Visitor()
        compiled_rules = []
        for i, rule_content in enumerate(rules):
        for rule_content in rules:
            name, tree, options = rule_content
            simplify_rule.visit(tree)
            expansions = rule_tree_to_text.transform(tree)
            for expansion, alias in expansions:
            for i, (expansion, alias) in enumerate(expansions):
                if alias and name.startswith('_'):
                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
@@ -538,7 +541,7 @@ class Grammar:
        for dups in duplicates.values():
            if len(dups) > 1:
                if dups[0].expansion:
                    raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
                    raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups))
                # Empty rule; assert all other attributes are equal
                assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)
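Note: for reference, a grammar along these lines should now trigger the friendlier message (hedged sketch; both alternatives expand their optional into the same two expansions, so the rule ends up defined twice):

    from lark import Lark
    from lark.exceptions import GrammarError

    try:
        Lark('''
            start: "a" ["b"] "c"
                 | "a" "b"? "c"
        ''')
    except GrammarError as e:
        print(e)   # Rules defined twice ... (Might happen due to colliding expansion of optionals: [] or ?)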
@@ -605,7 +608,9 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
            _, tree, _ = imported_rules[symbol]
        except KeyError:
            raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
        return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL'))
        return _find_used_symbols(tree)
    def get_namespace_name(name):
        try:
@@ -682,6 +687,11 @@ class PrepareGrammar(Transformer_InPlace):
        return name
def _find_used_symbols(tree):
    assert tree.data == 'expansions'
    return {t for x in tree.find_data('expansion')
              for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
class GrammarLoader:
    def __init__(self):
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
@@ -843,9 +853,7 @@ class GrammarLoader:
            rule_names.add(name)
        for name, expansions, _o in rules:
            used_symbols = {t for x in expansions.find_data('expansion')
                            for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
            for sym in used_symbols:
            for sym in _find_used_symbols(expansions):
                if sym.type == 'TERMINAL':
                    if sym not in terminal_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
@@ -118,7 +118,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
class LALR_CustomLexer(LALR_WithLexer):
    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
        self.lexer = lexer_cls(self.lexer_conf)
        self.lexer = lexer_cls(lexer_conf)
        debug = options.debug if options else False
        self.parser = LALR_Parser(parser_conf, debug=debug)
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
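Note: lexer_conf is the constructor argument; self.lexer_conf is only set later by WithLexer.__init__, so the old line would fail with an AttributeError. A minimal sketch of the wiring this fix enables, mirroring the CustomLexer added to the test suite further down:

    from lark import Lark
    from lark.lexer import Lexer, TraditionalLexer

    class MyLexer(Lexer):
        def __init__(self, lexer_conf):
            # delegate to the standard lexer; a real custom lexer would tokenize itself
            self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore,
                                          user_callbacks=lexer_conf.callbacks)
        def lex(self, data):
            return self.lexer.lex(data)

    parser = Lark('start: "a"+', parser='lalr', lexer=MyLexer)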
@@ -139,7 +139,8 @@ class Earley(WithLexer):
        self.init_traditional_lexer()
        resolve_ambiguity = options.ambiguity == 'resolve'
        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
        debug = options.debug if options else False
        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug)
    def match(self, term, token):
        return term.name == token.type
@@ -152,10 +153,12 @@ class XEarley(_ParserFrontend):
        self._prepare_match(lexer_conf)
        resolve_ambiguity = options.ambiguity == 'resolve'
        debug = options.debug if options else False
        self.parser = xearley.Parser(parser_conf,
                                     self.match,
                                     ignore=lexer_conf.ignore,
                                     resolve_ambiguity=resolve_ambiguity,
                                     debug=debug,
                                     **kw
                                     )
@@ -20,10 +20,11 @@ from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor
class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True):
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False):
        analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
@@ -296,6 +297,10 @@ class Parser:
        # symbol should have been completed in the last step of the Earley cycle, and will be in
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            debug_walker = ForestToPyDotVisitor()
            debug_walker.visit(solutions[0], "sppf.png")
        if not solutions:
            expected_tokens = [t.expect for t in to_scan]
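Note: with debug=True the Earley parser now dumps the SPPF of the first solution to "sppf.png". A hedged sketch of how that is reached from the public API (assumes pydot and Graphviz are available for ForestToPyDotVisitor):

    from lark import Lark

    grammar = '''
        start: noun verb | verb noun
        noun: WORD
        verb: WORD
        %import common.WORD
        %import common.WS
        %ignore WS
    '''
    parser = Lark(grammar, parser='earley', debug=True)
    parser.parse('fruit flies')   # writes sppf.png as a side effect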
@@ -122,7 +122,7 @@ class PackedNode(ForestNode):
        ambiguously. Hence, we use the sort order to identify
        the order in which ambiguous children should be considered.
        """
        return self.is_empty, -self.priority, -self.rule.order
        return self.is_empty, -self.priority, self.rule.order
    def __iter__(self):
        return iter([self.left, self.right])
@@ -24,8 +24,8 @@ from .earley_forest import SymbolNode
class Parser(BaseParser):
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False):
        BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity)
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False):
        BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug)
        self.ignore = [Terminal(t) for t in ignore]
        self.complete_lex = complete_lex
@@ -0,0 +1,39 @@
import codecs
import sys
import json
from lark import Lark
from lark.grammar import RuleOptions, Rule
from lark.lexer import TerminalDef
import argparse
argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #description='''Lark Serialization Tool -- Stores Lark's internal state & LALR analysis as a convenient JSON file''')
argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file')
argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)')
argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")', nargs='+')
argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")')
def serialize(infile, outfile, lexer, start):
    lark_inst = Lark(infile, parser="lalr", lexer=lexer, start=start) # TODO contextual
    data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
    outfile.write('{\n')
    outfile.write(' "data": %s,\n' % json.dumps(data))
    outfile.write(' "memo": %s\n' % json.dumps(memo))
    outfile.write('}\n')
def main():
    if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv:
        print("Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file")
        print("")
        argparser.print_help()
    else:
        args = argparser.parse_args()
        serialize(args.grammar_file, args.out, args.lexer, args.start)
if __name__ == '__main__':
    main()
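Note: typical usage of the new tool, either via the CLI entry point or programmatically (a sketch; calc.lark is a hypothetical grammar file):

    import sys
    from lark.tools.serialize import serialize

    # roughly equivalent to: python -m lark.tools.serialize calc.lark -s start -l contextual
    with open('calc.lark') as grammar_file:
        serialize(grammar_file, sys.stdout, lexer='contextual', start='start')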
@@ -56,30 +56,6 @@ class Tree(object):
    def __hash__(self):
        return hash((self.data, tuple(self.children)))
###}
    def expand_kids_by_index(self, *indices):
        "Expand (inline) children at the given indices"
        for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
            kid = self.children[i]
            self.children[i:i+1] = kid.children
    def find_pred(self, pred):
        "Find all nodes where pred(tree) == True"
        return filter(pred, self.iter_subtrees())
    def find_data(self, data):
        "Find all nodes where tree.data == data"
        return self.find_pred(lambda t: t.data == data)
    def scan_values(self, pred):
        for c in self.children:
            if isinstance(c, Tree):
                for t in c.scan_values(pred):
                    yield t
            else:
                if pred(c):
                    yield c
    def iter_subtrees(self):
        # TODO: Re-write as a more efficient version
@@ -102,6 +78,31 @@ class Tree(object):
                yield x
                seen.add(id(x))
    def find_pred(self, pred):
        "Find all nodes where pred(tree) == True"
        return filter(pred, self.iter_subtrees())
    def find_data(self, data):
        "Find all nodes where tree.data == data"
        return self.find_pred(lambda t: t.data == data)
###}
    def expand_kids_by_index(self, *indices):
        "Expand (inline) children at the given indices"
        for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
            kid = self.children[i]
            self.children[i:i+1] = kid.children
    def scan_values(self, pred):
        for c in self.children:
            if isinstance(c, Tree):
                for t in c.scan_values(pred):
                    yield t
            else:
                if pred(c):
                    yield c
    def iter_subtrees_topdown(self):
        stack = [self]
        while stack:
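Note: the methods themselves are unchanged; only their position relative to the ###{standalone block moves (find_pred and find_data move inside it, expand_kids_by_index and scan_values move out). A small usage reminder, as a sketch:

    from lark import Tree, Token

    t = Tree('start', [Tree('assign', [Token('NAME', 'x'), Token('NUMBER', '1')])])
    assert [st.data for st in t.find_data('assign')] == ['assign']
    assert list(t.scan_values(lambda v: v.type == 'NAME')) == ['x']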
@@ -160,7 +160,7 @@ def smart_decorator(f, create_decorator):
    elif isinstance(f, partial):
        # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
        return create_decorator(f.__func__, True)
        return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))
    else:
        return create_decorator(f.__func__.__call__, True)
@@ -172,7 +172,7 @@ import sre_parse
import sre_constants
def get_regexp_width(regexp):
    try:
        return sre_parse.parse(regexp).getwidth()
        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
    except sre_constants.error:
        raise ValueError(regexp)
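Note: for context, what the wrapped stdlib call returns (a sketch; the exact upper bound is sre's MAXREPEAT and can differ between Python versions):

    import sre_parse

    lo, hi = sre_parse.parse('a+').getwidth()
    print(lo, hi)   # e.g. 1 4294967295; the change above normalizes both to plain ints in a list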
@@ -0,0 +1,10 @@
version: 2
mkdocs:
  configuration: mkdocs.yml
  fail_on_warning: false
formats: all
python:
  version: 3.5
@@ -21,6 +21,7 @@ from .test_parser import (
        TestCykStandard,
        TestLalrContextual,
        TestEarleyDynamic,
        TestLalrCustom,
        # TestFullEarleyStandard,
        TestFullEarleyDynamic,
@@ -22,7 +22,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
from lark.tree import Tree
from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule
from lark.lexer import TerminalDef
from lark.lexer import TerminalDef, Lexer, TraditionalLexer
__path__ = os.path.dirname(__file__)
def _read(n, *args):
@@ -431,12 +431,22 @@ def _make_full_earley_test(LEXER):
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
class CustomLexer(Lexer):
    """
    The purpose of this custom lexer is to test the integration,
    so it uses the TraditionalLexer as its implementation, without any custom lexing behaviour.
    """
    def __init__(self, lexer_conf):
        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
    def lex(self, *args, **kwargs):
        return self.lexer.lex(*args, **kwargs)
def _make_parser_test(LEXER, PARSER):
    lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
    def _Lark_open(gfilename, **kwargs):
        return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
@@ -1532,7 +1542,7 @@ def _make_parser_test(LEXER, PARSER):
            parser = _Lark(grammar)
        @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")
        @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
        def test_serialize(self):
            grammar = """
                start: _ANY b "C"
@@ -1558,6 +1568,28 @@ def _make_parser_test(LEXER, PARSER):
            self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
            self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))
        def test_lexer_detect_newline_tokens(self):
            # Detect newlines in regular tokens
            g = _Lark(r"""start: "go" tail*
            !tail : SA "@" | SB "@" | SC "@" | SD "@"
            SA : "a" /\n/
            SB : /b./s
            SC : "c" /[^a-z]/
            SD : "d" /\s/
            """)
            a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
            self.assertEqual(a.line, 2)
            self.assertEqual(b.line, 3)
            self.assertEqual(c.line, 4)
            self.assertEqual(d.line, 5)
            # Detect newlines in ignored tokens
            for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
                g = _Lark('''!start: "a" "a"
                             %ignore {}'''.format(re))
                a, b = g.parse('a\na').children
                self.assertEqual(a.line, 1)
                self.assertEqual(b.line, 2)
    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
@@ -1572,6 +1604,7 @@ _TO_TEST = [
        ('dynamic_complete', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        ('custom', 'lalr'),
        # (None, 'earley'),
]
@@ -4,6 +4,7 @@ import unittest
from unittest import TestCase
import copy
import pickle
import functools
from lark.tree import Tree
from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard
@@ -146,6 +147,22 @@ class TestTrees(TestCase):
        res = T().transform(t)
        self.assertEqual(res, 2.9)
    def test_partial(self):
        tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])])
        def test(prefix, s, postfix):
            return prefix + s.upper() + postfix
        @v_args(inline=True)
        class T(Transformer):
            a = functools.partial(test, "@", postfix="!")
            b = functools.partial(lambda s: s + "!")
        res = T().transform(tree)
        assert res.children == ["@TEST1!", "test2!"]
    def test_discard(self):
        class MyTransformer(Transformer):
            def a(self, args):