@@ -1,19 +1,20 @@ | |||
# Lark - a modern parsing library | |||
# Lark - a modern parsing library for Python | |||
Lark is a modern general-purpose parsing library for Python. | |||
Parse any context-free grammar, FAST and EASY! | |||
It's intended for everyone, from complete beginners to experts in parsing. | |||
**Beginners**: Forget everything you knew about parsers! Lark's algorithm can quickly parse any grammar you throw at it, no matter how complicated. It also constructs a parse-tree for you. | |||
Lark focuses on simplicity, power, and speed. It lets you choose between two parsing algorithms: | |||
**Experts**: Lark lets you choose between Earley and LALR(1), to trade off power and speed. It also contains experimental features such as a contextual-lexer. | |||
- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default. | |||
- LALR(1): Only LR grammars. Outperforms PLY and most (if not all) other pure-python parsing libraries. | |||
Lark can: | |||
Both algorithms are written in Python and can be used interchangeably with the same grammar\*. Similarly, the lexer can be turned on/off without changing the grammar. That means you can write your parser without any limitations (just keep it context-free) and optimize it for speed only when you need to. | |||
- Parse all context-free grammars, and handle all ambiguity (using Earley) | |||
- Build a parse-tree automagically, no construction code required | |||
- Outperform PLY (when using LALR(1)) | |||
- Run on every Python interpreter (it's pure-python) | |||
Lark can automagically build an AST from your grammar, without any more code on your part. | |||
And many more features. Read ahead and find out. | |||
\* *Both the lexer and the LALR algorithm require certain limitations on the grammar. If you choose to use them, it's better to learn what they are first.* | |||
### Hello World | |||
@@ -152,17 +153,21 @@ You can use the output as a regular python module: | |||
- **Earley** parser | |||
- Can parse *ALL* context-free grammars | |||
- Accepts and resolves ambiguous grammars using a parse forest | |||
- Optional lexer | |||
- Resolves ambiguous grammars using a parse forest | |||
- Automatic & user-defined rule priority for ambiguity resolution | |||
- Dynamic lexer | |||
- **LALR(1)** parser | |||
- Standard & Contextual lexers | |||
- **EBNF** grammar (with a little extra) | |||
- Standard lexer (like PLY) | |||
- Contextual lexer (can handle some ambiguity and non-determinism) | |||
- **EBNF** grammar (with a few extra features) | |||
- Builds a parse-tree (AST) automagically based on the grammar | |||
- Lexer with regular expressions (regexps) | |||
- Automatic line & column tracking | |||
- Automatic token collision resolution (unless both terminals are regexps) | |||
- **Standard library** of terminals (strings, numbers, names, etc.) | |||
- Automatic reconstruction of input (experimental, see examples) | |||
- Experimental features: | |||
- Automatic reconstruction of input from parse-tree (see examples) | |||
- Import grammars from Nearley.js | |||
- **Unicode** fully supported | |||
- Extensive test suite | |||
- **Python 2 & 3** compatible | |||
@@ -47,12 +47,12 @@ class TreeToJson(Transformer): | |||
true = lambda self, _: True | |||
false = lambda self, _: False | |||
# json_parser = Lark(json_grammar, parser='earley', lexer='standard') | |||
# def parse(x): | |||
# return TreeToJson().transform(json_parser.parse(x)) | |||
json_parser = Lark(json_grammar, parser='earley', lexer='standard') | |||
def parse(x): | |||
return TreeToJson().transform(json_parser.parse(x)) | |||
json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) | |||
parse = json_parser.parse | |||
# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) | |||
# parse = json_parser.parse | |||
def test(): | |||
test_json = ''' | |||
@@ -112,9 +112,10 @@ class Terminal: | |||
class Terminal_Regexp(Terminal): | |||
def __init__(self, data): | |||
Terminal.__init__(self, data) | |||
self.match = re.compile(data).match | |||
def __init__(self, name, regexp): | |||
Terminal.__init__(self, regexp) | |||
self.name = name | |||
self.match = re.compile(regexp).match | |||
class Terminal_Token(Terminal): | |||
def match(self, other): | |||
@@ -79,7 +79,7 @@ class Earley_NoLex: | |||
width = sre_parse.parse(regexp).getwidth() | |||
if width != (1,1): | |||
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) | |||
yield Terminal_Regexp(regexp) | |||
yield Terminal_Regexp(sym, regexp) | |||
else: | |||
yield sym | |||
@@ -114,7 +114,7 @@ class XEarley: | |||
rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | |||
resolve_ambiguity = (options.ambiguity=='resolve') if options else True | |||
ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] | |||
ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] | |||
self.parser = xearley.Parser(rules, | |||
parser_conf.start, | |||
@@ -129,7 +129,7 @@ class XEarley: | |||
regexp = self.token_by_name[sym].pattern.to_regexp() | |||
width = sre_parse.parse(regexp).getwidth() | |||
assert width | |||
yield Terminal_Regexp(regexp) | |||
yield Terminal_Regexp(sym, regexp) | |||
else: | |||
yield sym | |||
@@ -21,6 +21,7 @@ | |||
from collections import defaultdict | |||
from ..common import ParseError, UnexpectedToken, Terminal | |||
from ..lexer import Token | |||
from ..tree import Tree | |||
from .grammar_analysis import GrammarAnalyzer | |||
@@ -33,6 +34,7 @@ class Parser: | |||
self.resolve_ambiguity = resolve_ambiguity | |||
self.ignore = list(ignore) | |||
self.postprocess = {} | |||
self.predictions = {} | |||
for rule in self.analysis.rules: | |||
@@ -46,6 +48,9 @@ class Parser: | |||
start_symbol = start_symbol or self.start_symbol | |||
delayed_matches = defaultdict(list) | |||
text_line = 1 | |||
text_column = 0 | |||
def predict(nonterm, column): | |||
assert not isinstance(nonterm, Terminal), nonterm | |||
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
@@ -78,7 +83,8 @@ class Parser: | |||
for item in to_scan: | |||
m = item.expect.match(stream, i) | |||
if m: | |||
delayed_matches[m.end()].append(item.advance(m.group(0))) | |||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
delayed_matches[m.end()].append(item.advance(t)) | |||
s = m.group(0) | |||
for j in range(1, len(s)): | |||
@@ -98,10 +104,16 @@ class Parser: | |||
column = column0 | |||
for i, token in enumerate(stream): | |||
predict_and_complete(column) | |||
column = scan(i, token, column) | |||
if token == '\n': | |||
text_line += 1 | |||
text_column = 0 | |||
else: | |||
text_column += 1 | |||
predict_and_complete(column) | |||
# Parse ended. Now build a parse tree | |||