@@ -1,19 +1,20 @@ | |||||
# Lark - a modern parsing library | |||||
# Lark - a modern parsing library for Python | |||||
Lark is a modern general-purpose parsing library for Python. | |||||
Parse any context-free grammar, FAST and EASY! | |||||
It's intended for everyone, from complete beginners to experts in parsing. | |||||
**Beginners**: Forget everything you knew about parsers! Lark's algorithm can quickly parse any grammar you throw at it, no matter how complicated. It also constructs a parse-tree for you. | |||||
Lark focuses on simplicity, power, and speed. It lets you choose between two parsing algorithms: | |||||
**Experts**: Lark lets you choose between Earley and LALR(1), to trade-off power and speed. It also contains experimental features such as a contextual-lexer. | |||||
- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default. | |||||
- LALR(1): Only LR grammars. Outperforms PLY and most (if not all) other pure-python parsing libraries. | |||||
Lark can: | |||||
Both algorithms are written in Python and can be used interchangeably with the same grammar\*. Similarly, the lexer can be turned on/off without changing the grammar. That means you can write your parser without any limitations (just keep it context-free) and optimize it for speed only when you need to. | |||||
- Parse all context-free grammars, and handle all ambiguity (using Earley) | |||||
- Build a parse-tree automagically, no construction code required | |||||
- Outperform PLY (when using LALR(1)) | |||||
- Run on every Python interpreter (it's pure-python) | |||||
Lark can automagically build an AST from your grammar, without any more code on your part. | |||||
And many more features. Read ahead and find out. | |||||
\* *Both the lexer and the LALR algorithm require certain limitations on the grammar. If you choose to use them, it's better to learn what they are first.* | |||||
### Hello World | ### Hello World | ||||
@@ -152,17 +153,21 @@ You can use the output as a regular python module: | |||||
- **Earley** parser | - **Earley** parser | ||||
- Can parse *ALL* context-free grammars | - Can parse *ALL* context-free grammars | ||||
- Accepts and resolves ambiguous grammars using a parse forest | |||||
- Optional lexer | |||||
- Resolves ambiguous grammars using a parse forest | |||||
- Automatic & user-defined rule priority for ambiguity resolution | |||||
- Dynamic lexer | |||||
- **LALR(1)** parser | - **LALR(1)** parser | ||||
- Standard & Contextual lexers | |||||
- **EBNF** grammar (with a little extra) | |||||
- Standard lexer (like PLY) | |||||
- Contextual lexer (can handle some ambiguity and non-determinism) | |||||
- **EBNF** grammar (with a few extra features) | |||||
- Builds a parse-tree (AST) automagically based on the grammar | - Builds a parse-tree (AST) automagically based on the grammar | ||||
- Lexer with regular expressions (regexps) | - Lexer with regular expressions (regexps) | ||||
- Automatic line & column tracking | - Automatic line & column tracking | ||||
- Automatic token collision resolution (unless both terminals are regexps) | - Automatic token collision resolution (unless both terminals are regexps) | ||||
- **Standard library** of terminals (strings, numbers, names, etc.) | - **Standard library** of terminals (strings, numbers, names, etc.) | ||||
- Automatic reconstruction of input (experimental, see examples) | |||||
- Experimental features: | |||||
- Automatic reconstruction of input from parse-tree (see examples) | |||||
- Import grammars from Nearley.js | |||||
- **Unicode** fully supported | - **Unicode** fully supported | ||||
- Extensive test suite | - Extensive test suite | ||||
- **Python 2 & 3** compatible | - **Python 2 & 3** compatible | ||||
@@ -47,12 +47,12 @@ class TreeToJson(Transformer): | |||||
true = lambda self, _: True | true = lambda self, _: True | ||||
false = lambda self, _: False | false = lambda self, _: False | ||||
# json_parser = Lark(json_grammar, parser='earley', lexer='standard') | |||||
# def parse(x): | |||||
# return TreeToJson().transform(json_parser.parse(x)) | |||||
json_parser = Lark(json_grammar, parser='earley', lexer='standard') | |||||
def parse(x): | |||||
return TreeToJson().transform(json_parser.parse(x)) | |||||
json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) | |||||
parse = json_parser.parse | |||||
# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) | |||||
# parse = json_parser.parse | |||||
def test(): | def test(): | ||||
test_json = ''' | test_json = ''' | ||||
@@ -112,9 +112,10 @@ class Terminal: | |||||
class Terminal_Regexp(Terminal): | class Terminal_Regexp(Terminal): | ||||
def __init__(self, data): | |||||
Terminal.__init__(self, data) | |||||
self.match = re.compile(data).match | |||||
def __init__(self, name, regexp): | |||||
Terminal.__init__(self, regexp) | |||||
self.name = name | |||||
self.match = re.compile(regexp).match | |||||
class Terminal_Token(Terminal): | class Terminal_Token(Terminal): | ||||
def match(self, other): | def match(self, other): | ||||
@@ -79,7 +79,7 @@ class Earley_NoLex: | |||||
width = sre_parse.parse(regexp).getwidth() | width = sre_parse.parse(regexp).getwidth() | ||||
if width != (1,1): | if width != (1,1): | ||||
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) | raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) | ||||
yield Terminal_Regexp(regexp) | |||||
yield Terminal_Regexp(sym, regexp) | |||||
else: | else: | ||||
yield sym | yield sym | ||||
@@ -114,7 +114,7 @@ class XEarley: | |||||
rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | ||||
resolve_ambiguity = (options.ambiguity=='resolve') if options else True | resolve_ambiguity = (options.ambiguity=='resolve') if options else True | ||||
ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] | |||||
ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] | |||||
self.parser = xearley.Parser(rules, | self.parser = xearley.Parser(rules, | ||||
parser_conf.start, | parser_conf.start, | ||||
@@ -129,7 +129,7 @@ class XEarley: | |||||
regexp = self.token_by_name[sym].pattern.to_regexp() | regexp = self.token_by_name[sym].pattern.to_regexp() | ||||
width = sre_parse.parse(regexp).getwidth() | width = sre_parse.parse(regexp).getwidth() | ||||
assert width | assert width | ||||
yield Terminal_Regexp(regexp) | |||||
yield Terminal_Regexp(sym, regexp) | |||||
else: | else: | ||||
yield sym | yield sym | ||||
@@ -21,6 +21,7 @@ | |||||
from collections import defaultdict | from collections import defaultdict | ||||
from ..common import ParseError, UnexpectedToken, Terminal | from ..common import ParseError, UnexpectedToken, Terminal | ||||
from ..lexer import Token | |||||
from ..tree import Tree | from ..tree import Tree | ||||
from .grammar_analysis import GrammarAnalyzer | from .grammar_analysis import GrammarAnalyzer | ||||
@@ -33,6 +34,7 @@ class Parser: | |||||
self.resolve_ambiguity = resolve_ambiguity | self.resolve_ambiguity = resolve_ambiguity | ||||
self.ignore = list(ignore) | self.ignore = list(ignore) | ||||
self.postprocess = {} | self.postprocess = {} | ||||
self.predictions = {} | self.predictions = {} | ||||
for rule in self.analysis.rules: | for rule in self.analysis.rules: | ||||
@@ -46,6 +48,9 @@ class Parser: | |||||
start_symbol = start_symbol or self.start_symbol | start_symbol = start_symbol or self.start_symbol | ||||
delayed_matches = defaultdict(list) | delayed_matches = defaultdict(list) | ||||
text_line = 1 | |||||
text_column = 0 | |||||
def predict(nonterm, column): | def predict(nonterm, column): | ||||
assert not isinstance(nonterm, Terminal), nonterm | assert not isinstance(nonterm, Terminal), nonterm | ||||
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | ||||
@@ -78,7 +83,8 @@ class Parser: | |||||
for item in to_scan: | for item in to_scan: | ||||
m = item.expect.match(stream, i) | m = item.expect.match(stream, i) | ||||
if m: | if m: | ||||
delayed_matches[m.end()].append(item.advance(m.group(0))) | |||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||||
delayed_matches[m.end()].append(item.advance(t)) | |||||
s = m.group(0) | s = m.group(0) | ||||
for j in range(1, len(s)): | for j in range(1, len(s)): | ||||
@@ -98,10 +104,16 @@ class Parser: | |||||
column = column0 | column = column0 | ||||
for i, token in enumerate(stream): | for i, token in enumerate(stream): | ||||
predict_and_complete(column) | predict_and_complete(column) | ||||
column = scan(i, token, column) | column = scan(i, token, column) | ||||
if token == '\n': | |||||
text_line += 1 | |||||
text_column = 0 | |||||
else: | |||||
text_column += 1 | |||||
predict_and_complete(column) | predict_and_complete(column) | ||||
# Parse ended. Now build a parse tree | # Parse ended. Now build a parse tree | ||||