diff --git a/README.md b/README.md
index 183a162..596895a 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,20 @@
-# Lark - a modern parsing library
+# Lark - a modern parsing library for Python
 
-Lark is a modern general-purpose parsing library for Python.
+Parse any context-free grammar, FAST and EASY!
 
-It's intended for everyone, from complete beginners to experts in parsing.
+**Beginners**: Forget everything you knew about parsers! Lark's algorithm can quickly parse any grammar you throw at it, no matter how complicated. It also constructs a parse-tree for you.
 
-Lark focuses on simplicity, power, and speed. It lets you choose between two parsing algorithms:
+**Experts**: Lark lets you choose between Earley and LALR(1), to trade off power and speed. It also contains experimental features such as a contextual lexer.
 
- - Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
- - LALR(1): Only LR grammars. Outperforms PLY and most (if not all) other pure-python parsing libraries.
+Lark can:
 
-Both algorithms are written in Python and can be used interchangeably with the same grammar\*. Similarly, the lexer can be turned on/off without changing the grammar. That means you can write your parser without any limitations (just keep it context-free) and optimize it for speed only when you need to.
+ - Parse all context-free grammars, and handle all ambiguity (using Earley)
+ - Build a parse-tree automagically, no construction code required
+ - Outperform PLY (when using LALR(1))
+ - Run on every Python interpreter (it's pure-python)
 
-Lark can automagically build an AST from your grammar, without any more code on your part.
+And many more features. Read ahead and find out.
 
-\* *Both the lexer and the LALR algorithm require certain limitations on the grammar. If you choose to use them, it's better to learn what they are first.*
 
 
 ### Hello World
@@ -152,17 +153,21 @@ You can use the output as a regular python module:
 
 - **Earley** parser
   - Can parse *ALL* context-free grammars
-  - Accepts and resolves ambiguous grammars using a parse forest
-  - Optional lexer
+  - Resolves ambiguous grammars using a parse forest
+  - Automatic & user-defined rule priority for ambiguity resolution
+  - Dynamic lexer
 - **LALR(1)** parser
-  - Standard & Contextual lexers
-- **EBNF** grammar (with a little extra)
+  - Standard lexer (like PLY)
+  - Contextual lexer (can handle some ambiguity and non-determinism)
+- **EBNF** grammar (with a few extra features)
 - Builds a parse-tree (AST) automagically based on the grammar
 - Lexer with regular expressions (regexps)
   - Automatic line & column tracking
   - Automatic token collision resolution (unless both terminals are regexps)
 - **Standard library** of terminals (strings, numbers, names, etc.)
-- Automatic reconstruction of input (experimental, see examples)
+- Experimental features:
+  - Automatic reconstruction of input from parse-tree (see examples)
+  - Import grammars from Nearley.js
 - **Unicode** fully supported
 - Extensive test suite
 - **Python 2 & 3** compatible
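The README's Earley-vs-LALR(1) pitch above comes down to a single constructor argument. A minimal sketch of that choice, assuming the `parser=` option and the `%import common.WORD` grammar syntax used in the README's own Hello World section (the grammar itself is illustrative):

```python
from lark import Lark

# Illustrative grammar; WORD is pulled from Lark's standard terminal library.
grammar = """
start: WORD "," WORD "!"

%import common.WORD
%ignore " "
"""

earley_parser = Lark(grammar)                # Earley is the default: any CFG
lalr_parser = Lark(grammar, parser='lalr')   # LALR(1): restricted grammars, more speed

# The same grammar drives both parsers; the parse-tree is built automatically.
print(earley_parser.parse("Hello, World!").pretty())
print(lalr_parser.parse("Hello, World!").pretty())
```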
diff --git a/examples/json_parser.py b/examples/json_parser.py
index 4f5feaf..aaa3d31 100644
--- a/examples/json_parser.py
+++ b/examples/json_parser.py
@@ -47,12 +47,12 @@ class TreeToJson(Transformer):
     true = lambda self, _: True
     false = lambda self, _: False
 
-# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
-# def parse(x):
-#     return TreeToJson().transform(json_parser.parse(x))
+json_parser = Lark(json_grammar, parser='earley', lexer='standard')
+def parse(x):
+    return TreeToJson().transform(json_parser.parse(x))
 
-json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
-parse = json_parser.parse
+# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
+# parse = json_parser.parse
 
 def test():
     test_json = '''
diff --git a/lark/common.py b/lark/common.py
index f1b6784..4de53f2 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -112,9 +112,10 @@ class Terminal:
 
 class Terminal_Regexp(Terminal):
-    def __init__(self, data):
-        Terminal.__init__(self, data)
-        self.match = re.compile(data).match
+    def __init__(self, name, regexp):
+        Terminal.__init__(self, regexp)
+        self.name = name
+        self.match = re.compile(regexp).match
 
 class Terminal_Token(Terminal):
     def match(self, other):
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 46f75b4..23cf53f 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -79,7 +79,7 @@ class Earley_NoLex:
                 width = sre_parse.parse(regexp).getwidth()
                 if width != (1,1):
                     raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
-                yield Terminal_Regexp(regexp)
+                yield Terminal_Regexp(sym, regexp)
             else:
                 yield sym
 
@@ -114,7 +114,7 @@ class XEarley:
         rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
 
         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
-        ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
+        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
 
         self.parser = xearley.Parser(rules,
                                      parser_conf.start,
@@ -129,7 +129,7 @@ class XEarley:
                 regexp = self.token_by_name[sym].pattern.to_regexp()
                 width = sre_parse.parse(regexp).getwidth()
                 assert width
-                yield Terminal_Regexp(regexp)
+                yield Terminal_Regexp(sym, regexp)
             else:
                 yield sym
 
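The `Terminal_Regexp` signature change above threads each terminal's name through the scanless frontends (`Earley_NoLex`, `XEarley`). A self-contained sketch of the class's new shape, re-implemented here for illustration only:

```python
import re

class Terminal_Regexp:
    """Sketch: a terminal that knows its own name and how to match input."""
    def __init__(self, name, regexp):
        self.name = name                        # e.g. 'NUMBER'; previously not stored
        self.match = re.compile(regexp).match   # bound matcher: match(text, pos)

t = Terminal_Regexp('NUMBER', r'\d')
m = t.match('x42', 1)        # re's match() accepts a starting position
print(t.name, m.group(0))    # -> NUMBER 4
```

With the name attached, the scanner can wrap each regexp match in a named `Token` instead of a bare string, which is what the `xearley.py` hunks below do.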
diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py
index 729c326..ba86c5c 100644
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -21,6 +21,7 @@ from collections import defaultdict
 
 from ..common import ParseError, UnexpectedToken, Terminal
+from ..lexer import Token
 from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
 
@@ -33,6 +34,7 @@ class Parser:
         self.resolve_ambiguity = resolve_ambiguity
         self.ignore = list(ignore)
 
+        self.postprocess = {}
         self.predictions = {}
         for rule in self.analysis.rules:
@@ -46,6 +48,9 @@ class Parser:
         start_symbol = start_symbol or self.start_symbol
         delayed_matches = defaultdict(list)
 
+        text_line = 1
+        text_column = 0
+
         def predict(nonterm, column):
             assert not isinstance(nonterm, Terminal), nonterm
             return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
@@ -78,7 +83,8 @@ class Parser:
             for item in to_scan:
                 m = item.expect.match(stream, i)
                 if m:
-                    delayed_matches[m.end()].append(item.advance(m.group(0)))
+                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                    delayed_matches[m.end()].append(item.advance(t))
 
                     s = m.group(0)
                     for j in range(1, len(s)):
@@ -98,10 +104,16 @@ class Parser:
 
         column = column0
         for i, token in enumerate(stream):
-            predict_and_complete(column)
             column = scan(i, token, column)
 
+            if token == '\n':
+                text_line += 1
+                text_column = 0
+            else:
+                text_column += 1
+
+            predict_and_complete(column)
 
         # Parse ended. Now build a parse tree
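The position bookkeeping added to the parse loop above is easy to check in isolation: lines are 1-based, columns 0-based, and a newline resets the column. A standalone illustration (a hypothetical helper, not library code) that counts positions the same way, reporting each character's position before it is consumed:

```python
def positions(stream):
    """Yield (index, char, line, column) the way the patched scanner counts them."""
    text_line, text_column = 1, 0
    for i, ch in enumerate(stream):
        yield i, ch, text_line, text_column   # position *before* consuming ch
        if ch == '\n':
            text_line += 1
            text_column = 0
        else:
            text_column += 1

print(list(positions("ab\ncd")))
# -> [(0, 'a', 1, 0), (1, 'b', 1, 1), (2, '\n', 1, 2), (3, 'c', 2, 0), (4, 'd', 2, 1)]
```

These are the `text_line`/`text_column` values that end up inside each `Token`, which is how the scanless Earley parser gains the "automatic line & column tracking" the README now advertises.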