@@ -1,19 +1,20 @@
-# Lark - a modern parsing library
+# Lark - a modern parsing library for Python
 
-Lark is a modern general-purpose parsing library for Python.
+Parse any context-free grammar, FAST and EASY!
+
+It's intended for everyone, from complete beginners to experts in parsing.
+
+**Beginners**: Forget everything you knew about parsers! Lark's algorithm can quickly parse any grammar you throw at it, no matter how complicated. It also constructs a parse-tree for you.
 
-Lark focuses on simplicity, power, and speed. It lets you choose between two parsing algorithms:
+**Experts**: Lark lets you choose between Earley and LALR(1), to trade off power and speed. It also contains experimental features such as a contextual lexer.
 
- - Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
- - LALR(1): Only LR grammars. Outperforms PLY and most (if not all) other pure-python parsing libraries.
+Lark can:
 
-Both algorithms are written in Python and can be used interchangeably with the same grammar\*. Similarly, the lexer can be turned on/off without changing the grammar. That means you can write your parser without any limitations (just keep it context-free) and optimize it for speed only when you need to.
+ - Parse all context-free grammars, and handle all ambiguity (using Earley)
+ - Build a parse-tree automagically, no construction code required
+ - Outperform PLY (when using LALR(1))
+ - Run on every Python interpreter (it's pure-python)
 
-Lark can automagically build an AST from your grammar, without any more code on your part.
+And many more features. Read ahead and find out.
 
-\* *Both the lexer and the LALR algorithm require certain limitations on the grammar. If you choose to use them, it's better to learn what they are first.*
 
 ### Hello World
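The Earley/LALR(1) choice highlighted in the rewritten intro is made through the `parser` argument of the `Lark` constructor. A minimal sketch of the usage the intro promises, with an illustrative stand-in grammar (not taken from this PR):

```python
from lark import Lark

# A stand-in grammar for illustration: greet a name.
grammar = r"""
    start: "hello" NAME
    NAME: /\w+/
    WS: /\s+/
    %ignore WS
"""

# Earley is the default and accepts any context-free grammar.
earley_parser = Lark(grammar)               # parser='earley' by default
# Switching to LALR(1) for speed is a one-argument change.
lalr_parser = Lark(grammar, parser='lalr')

print(earley_parser.parse("hello world").pretty())
print(lalr_parser.parse("hello world").pretty())
```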
@@ -152,17 +153,21 @@ You can use the output as a regular python module:
 - **Earley** parser
     - Can parse *ALL* context-free grammars
-    - Accepts and resolves ambiguous grammars using a parse forest
-    - Optional lexer
+    - Resolves ambiguous grammars using a parse forest
+    - Automatic & user-defined rule priority for ambiguity resolution
+    - Dynamic lexer
 - **LALR(1)** parser
-    - Standard & Contextual lexers
-- **EBNF** grammar (with a little extra)
+    - Standard lexer (like PLY)
+    - Contextual lexer (can handle some ambiguity and non-determinism)
+- **EBNF** grammar (with a few extra features)
 - Builds a parse-tree (AST) automagically based on the grammar
 - Lexer with regular expressions (regexps)
 - Automatic line & column tracking
 - Automatic token collision resolution (unless both terminals are regexps)
 - **Standard library** of terminals (strings, numbers, names, etc.)
-- Automatic reconstruction of input (experimental, see examples)
+- Experimental features:
+    - Automatic reconstruction of input from parse-tree (see examples)
+    - Import grammars from Nearley.js
 - **Unicode** fully supported
 - Extensive test suite
 - **Python 2 & 3** compatible
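Two of the new bullets map onto options visible elsewhere in this diff: the ambiguity bullets correspond to the `ambiguity` option (the code below checks `options.ambiguity == 'resolve'`), and the contextual lexer is selected with `lexer='contextual'`. A hedged sketch of the ambiguity behaviour, assuming the `ambiguity='explicit'` value that Lark exposes alongside `'resolve'`:

```python
from lark import Lark

# "aaa" can be bracketed as (a (a a)) or ((a a) a), so this grammar is ambiguous.
grammar = r"""
    start: x
    x: x x
     | "a"
"""

# Default: the parse forest is collapsed to a single tree ('resolve').
print(Lark(grammar, parser='earley', ambiguity='resolve').parse("aaa").pretty())

# 'explicit' keeps every derivation, grouped under '_ambig' nodes.
print(Lark(grammar, parser='earley', ambiguity='explicit').parse("aaa").pretty())
```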
@@ -47,12 +47,12 @@ class TreeToJson(Transformer):
     true = lambda self, _: True
     false = lambda self, _: False
 
-# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
-# def parse(x):
-#     return TreeToJson().transform(json_parser.parse(x))
+json_parser = Lark(json_grammar, parser='earley', lexer='standard')
+def parse(x):
+    return TreeToJson().transform(json_parser.parse(x))
 
-json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
-parse = json_parser.parse
+# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
+# parse = json_parser.parse
 
 def test():
     test_json = '''
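The swap above toggles between two equivalent configurations: Earley with a separate `transform()` pass, and LALR(1) with a transformer that runs while parsing (which is why `parse` is just `json_parser.parse` in that variant). A self-contained sketch of the same pattern; the grammar and `ToList` transformer here are stand-ins, not the PR's `json_grammar`:

```python
from lark import Lark, Transformer

grammar = r"""
    start: NUMBER ("," NUMBER)*
    NUMBER: /\d+/
    WS: /\s+/
    %ignore WS
"""

class ToList(Transformer):
    def start(self, items):
        return [int(tok) for tok in items]

# Earley with a standard lexer: parse first, transform as a second pass.
earley_parser = Lark(grammar, parser='earley', lexer='standard')
print(ToList().transform(earley_parser.parse("1, 2, 3")))   # [1, 2, 3]

# LALR(1) with an embedded transformer: nodes are transformed during parsing.
lalr_parser = Lark(grammar, parser='lalr', transformer=ToList())
print(lalr_parser.parse("1, 2, 3"))                         # [1, 2, 3]
```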
@@ -112,9 +112,10 @@ class Terminal:
 class Terminal_Regexp(Terminal):
-    def __init__(self, data):
-        Terminal.__init__(self, data)
-        self.match = re.compile(data).match
+    def __init__(self, name, regexp):
+        Terminal.__init__(self, regexp)
+        self.name = name
+        self.match = re.compile(regexp).match
 
 class Terminal_Token(Terminal):
     def match(self, other):
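The new `name` parameter is what the later hunks rely on: a matching terminal can now label the token it produces. A standalone sketch of the patched class (the `Terminal` base is simplified here; in Lark it carries more than a bare `data` field):

```python
import re

class Terminal:                    # simplified stand-in for Lark's Terminal
    def __init__(self, data):
        self.data = data

class Terminal_Regexp(Terminal):
    def __init__(self, name, regexp):
        Terminal.__init__(self, regexp)
        self.name = name                       # terminal name, e.g. 'NUMBER'
        self.match = re.compile(regexp).match  # match(text, pos) -> Match or None

t = Terminal_Regexp('NUMBER', r'\d+')
m = t.match('42 apples', 0)
print(t.name, m.group(0))  # NUMBER 42
```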
@@ -79,7 +79,7 @@ class Earley_NoLex:
                 width = sre_parse.parse(regexp).getwidth()
                 if width != (1,1):
                     raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
-                yield Terminal_Regexp(regexp)
+                yield Terminal_Regexp(sym, regexp)
             else:
                 yield sym
@@ -114,7 +114,7 @@ class XEarley:
         rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
-        ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
+        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
 
         self.parser = xearley.Parser(rules,
                                      parser_conf.start,
@@ -129,7 +129,7 @@ class XEarley:
                 regexp = self.token_by_name[sym].pattern.to_regexp()
                 width = sre_parse.parse(regexp).getwidth()
                 assert width
-                yield Terminal_Regexp(regexp)
+                yield Terminal_Regexp(sym, regexp)
             else:
                 yield sym
@@ -21,6 +21,7 @@
 from collections import defaultdict
 
 from ..common import ParseError, UnexpectedToken, Terminal
+from ..lexer import Token
 from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
@@ -33,6 +34,7 @@ class Parser:
         self.resolve_ambiguity = resolve_ambiguity
         self.ignore = list(ignore)
 
         self.postprocess = {}
         self.predictions = {}
         for rule in self.analysis.rules:
@@ -46,6 +48,9 @@ class Parser:
         start_symbol = start_symbol or self.start_symbol
         delayed_matches = defaultdict(list)
+        text_line = 1
+        text_column = 0
 
         def predict(nonterm, column):
             assert not isinstance(nonterm, Terminal), nonterm
             return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
@@ -78,7 +83,8 @@ class Parser:
             for item in to_scan:
                 m = item.expect.match(stream, i)
                 if m:
-                    delayed_matches[m.end()].append(item.advance(m.group(0)))
+                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                    delayed_matches[m.end()].append(item.advance(t))
 
                     s = m.group(0)
                     for j in range(1, len(s)):
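Note why `item.advance(t)` still works where a plain string was passed before: `lark.lexer.Token` subclasses `str`, so it behaves like the matched text while carrying type and position metadata. A stand-in sketch of that idea (mirroring, not importing, the real class):

```python
class Token(str):
    """Stand-in mirroring lark.lexer.Token, which subclasses str."""
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = super(Token, cls).__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.line = line
        inst.column = column
        return inst

t = Token('NAME', 'world', 6, 1, 6)
print(t == 'world', t.type, t.line, t.column)  # True NAME 1 6
```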
@@ -98,10 +104,16 @@ class Parser:
         column = column0
 
         for i, token in enumerate(stream):
             predict_and_complete(column)
             column = scan(i, token, column)
 
+            if token == '\n':
+                text_line += 1
+                text_column = 0
+            else:
+                text_column += 1
+
         predict_and_complete(column)
 
         # Parse ended. Now build a parse tree
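The bookkeeping added in this hunk is plain character-by-character counting: a token's position is captured in `scan` before the counters advance, and the column resets on every newline. The same logic in isolation:

```python
# Standalone illustration of the line/column tracking added above:
# record (index, line, column) for each character, then advance.
def track_positions(stream):
    text_line = 1
    text_column = 0
    positions = []
    for i, ch in enumerate(stream):
        positions.append((i, text_line, text_column))
        if ch == '\n':
            text_line += 1
            text_column = 0
        else:
            text_column += 1
    return positions

print(track_positions("ab\ncd"))
# [(0, 1, 0), (1, 1, 1), (2, 1, 2), (3, 2, 0), (4, 2, 1)]
```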