@@ -1,19 +1,20 @@
# Lark - a modern parsing library
# Lark - a modern parsing library for Python
Lark is a modern general-purpose parsing library for Python.
Parse any context-free grammar, FAST and EASY!
It's intended for everyone, from complete beginners to experts in parsing.
**Beginners**: Forget everything you knew about parsers! Lark's algorithm can quickly parse any grammar you throw at it, no matter how complicated. It also constructs a parse-tree for you.
Lark focuses on simplicity, power, and speed. It lets you choose between two parsing algorithms:
**Experts**: Lark lets you choose between Earley and LALR(1), to trade-off power and speed. It also contains experimental features such as a contextual-lexer.
- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
- LALR(1): Only LR grammars. Outperforms PLY and most (if not all) other pure-python parsing libraries.
Lark can:
Both algorithms are written in Python and can be used interchangeably with the same grammar\*. Similarly, the lexer can be turned on/off without changing the grammar. That means you can write your parser without any limitations (just keep it context-free) and optimize it for speed only when you need to.
- Parse all context-free grammars, and handle all ambiguity (using Earley)
- Build a parse-tree automagically, no construction code required
- Outperform PLY (when using LALR(1))
- Run on every Python interpreter (it's pure-python)
Lark can automagically build an AST from your grammar, without any more code on your part.
And many more features. Read ahead and find out.
\* *Both the lexer and the LALR algorithm require certain limitations on the grammar. If you choose to use them, it's better to learn what they are first.*
### Hello World
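The intro above says Earley is the default and that the same grammar can be handed to LALR(1) when speed matters. A minimal usage sketch of that choice (the toy grammar and the `%ignore` directive are assumptions for illustration; only the `Lark(...)` constructor and `parse()` call are taken from the examples further down in this diff):

```python
from lark import Lark

# A made-up toy grammar: WORD is a named terminal, "," and "!" are anonymous
# string terminals, and single spaces are ignored (assumed %ignore syntax).
hello_grammar = r'''
    start: WORD "," WORD "!"
    WORD: /\w+/
    %ignore " "
'''

# Earley is the default algorithm and accepts any context-free grammar.
earley_parser = Lark(hello_grammar)
print(earley_parser.parse("Hello, World!"))

# The same grammar is LR, so it can also be parsed with LALR(1) for speed.
lalr_parser = Lark(hello_grammar, parser='lalr')
print(lalr_parser.parse("Hello, World!"))
```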
@@ -152,17 +153,21 @@ You can use the output as a regular python module:
- **Earley** parser
    - Can parse *ALL* context-free grammars
    - Accepts and resolves ambiguous grammars using a parse forest
    - Optional lexer
    - Resolves ambiguous grammars using a parse forest
    - Automatic & user-defined rule priority for ambiguity resolution
    - Dynamic lexer
- **LALR(1)** parser
    - Standard & Contextual lexers
- **EBNF** grammar (with a little extra)
    - Standard lexer (like PLY)
    - Contextual lexer (can handle some ambiguity and non-determinism)
- **EBNF** grammar (with a few extra features)
- Builds a parse-tree (AST) automagically based on the grammar
- Lexer with regular expressions (regexps)
    - Automatic line & column tracking
    - Automatic token collision resolution (unless both terminals are regexps)
- **Standard library** of terminals (strings, numbers, names, etc.)
- Automatic reconstruction of input (experimental, see examples)
- Experimental features:
    - Automatic reconstruction of input from parse-tree (see examples)
    - Import grammars from Nearley.js
- **Unicode** fully supported
- Extensive test suite
- **Python 2 & 3** compatible
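One feature listed above is automatic line & column tracking; the scanner changes later in this diff are what attach that information to each token. A small sketch of how it can surface to the user, assuming tokens expose `line` and `column` attributes as set by the `Token(...)` call in the Earley hunks below (the grammar is invented for illustration):

```python
from lark import Lark

# Invented toy grammar: an assignment of a number to a name.
grammar = r'''
    start: NAME "=" NUMBER
    NAME: /[a-z]+/
    NUMBER: /[0-9]+/
    %ignore " "
'''

tree = Lark(grammar).parse("answer = 42")
for token in tree.children:
    # Each leaf of the parse-tree is a Token that remembers where it matched.
    print(repr(token), token.line, token.column)
```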
@@ -47,12 +47,12 @@ class TreeToJson(Transformer):
    true = lambda self, _: True
    false = lambda self, _: False

# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
# def parse(x):
#     return TreeToJson().transform(json_parser.parse(x))
json_parser = Lark(json_grammar, parser='earley', lexer='standard')
def parse(x):
    return TreeToJson().transform(json_parser.parse(x))

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse
# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
# parse = json_parser.parse

def test():
    test_json = '''
@@ -112,9 +112,10 @@ class Terminal:
class Terminal_Regexp(Terminal):
    def __init__(self, data):
        Terminal.__init__(self, data)
        self.match = re.compile(data).match
    def __init__(self, name, regexp):
        Terminal.__init__(self, regexp)
        self.name = name
        self.match = re.compile(regexp).match

class Terminal_Token(Terminal):
    def match(self, other):
@@ -79,7 +79,7 @@ class Earley_NoLex:
            width = sre_parse.parse(regexp).getwidth()
            if width != (1,1):
                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
            yield Terminal_Regexp(regexp)
            yield Terminal_Regexp(sym, regexp)
        else:
            yield sym
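The hunk above threads the terminal's name into `Terminal_Regexp` while keeping the existing width check: scanless parsing (`lexer=None`) only accepts terminals whose regexps always match exactly one character, which is what `getwidth()` verifies. A standalone sketch of that check using the same `sre_parse` call (illustrative only, not part of the patch; `is_single_char` is a made-up name):

```python
import sre_parse

def is_single_char(regexp):
    # getwidth() returns the (min, max) lengths a match of the pattern can have;
    # scanless parsing needs every terminal to be exactly one character wide.
    return sre_parse.parse(regexp).getwidth() == (1, 1)

print(is_single_char(r'[a-z]'))  # True  - always exactly one character
print(is_single_char(r'abc'))    # False - width is (3, 3)
print(is_single_char(r'a?'))     # False - width is (0, 1)
```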
@@ -114,7 +114,7 @@ class XEarley:
        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
        resolve_ambiguity = (options.ambiguity=='resolve') if options else True
        ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]

        self.parser = xearley.Parser(rules,
                                     parser_conf.start,
@@ -129,7 +129,7 @@ class XEarley:
            regexp = self.token_by_name[sym].pattern.to_regexp()
            width = sre_parse.parse(regexp).getwidth()
            assert width
            yield Terminal_Regexp(regexp)
            yield Terminal_Regexp(sym, regexp)
        else:
            yield sym
@@ -21,6 +21,7 @@
from collections import defaultdict

from ..common import ParseError, UnexpectedToken, Terminal
from ..lexer import Token
from ..tree import Tree

from .grammar_analysis import GrammarAnalyzer
@@ -33,6 +34,7 @@ class Parser:
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = list(ignore)
        self.postprocess = {}
        self.predictions = {}

        for rule in self.analysis.rules:
@@ -46,6 +48,9 @@ class Parser:
        start_symbol = start_symbol or self.start_symbol
        delayed_matches = defaultdict(list)

        text_line = 1
        text_column = 0

        def predict(nonterm, column):
            assert not isinstance(nonterm, Terminal), nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
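The `self.predictions` dict added in the previous hunk lets `predict` look up every rule that derives a given nonterminal and seed a fresh item for it at the current column. A toy, self-contained sketch of that idea (these `Rule`/`Item` namedtuples are invented stand-ins, not Lark's actual classes):

```python
from collections import namedtuple

# Invented stand-ins for the parser's internals, just to show the shape of prediction.
Rule = namedtuple('Rule', 'origin expansion')
Item = namedtuple('Item', 'rule ptr start')

rules = [
    Rule('sum', ['sum', 'PLUS', 'product']),
    Rule('sum', ['product']),
    Rule('product', ['NUMBER']),
]

# Precompute nonterminal -> rules once, like self.predictions in the patch.
predictions = {}
for rule in rules:
    predictions.setdefault(rule.origin, []).append(rule)

def predict(nonterm, column):
    # One fresh item per rule deriving `nonterm`: dot at 0, started at `column`.
    return [Item(rule, 0, column) for rule in predictions[nonterm]]

print(predict('sum', 0))
```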
@@ -78,7 +83,8 @@ class Parser:
            for item in to_scan:
                m = item.expect.match(stream, i)
                if m:
                    delayed_matches[m.end()].append(item.advance(m.group(0)))
                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append(item.advance(t))

                    s = m.group(0)
                    for j in range(1, len(s)):
@@ -98,10 +104,16 @@ class Parser:
        column = column0
        for i, token in enumerate(stream):
            predict_and_complete(column)

            column = scan(i, token, column)
            if token == '\n':
                text_line += 1
                text_column = 0
            else:
                text_column += 1

        predict_and_complete(column)

        # Parse ended. Now build a parse tree
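The loop above is where the new counters advance: position `i` is scanned with the current `text_line`/`text_column`, and only afterwards does a newline bump the line and reset the column. A self-contained sketch of the same bookkeeping (illustrative, not the parser itself):

```python
def positions(stream):
    # Mirrors the counter updates above: report each character's position first,
    # then advance; a newline starts a new line at column 0.
    line, column = 1, 0
    for ch in stream:
        yield ch, line, column
        if ch == '\n':
            line += 1
            column = 0
        else:
            column += 1

for ch, line, column in positions("ab\ncd"):
    print(repr(ch), line, column)
```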