@@ -1,19 +1,20 @@
# Lark - a modern parsing library
# Lark - a modern parsing library for Python
Lark is a modern general-purpose parsing library for Python.
Parse any context-free grammar, FAST and EASY!
It's intended for everyone, from complete beginners to experts in parsing.
**Beginners**: Forget everything you knew about parsers! Lark's algorithm can quickly parse any grammar you throw at it, no matter how complicated. It also constructs a parse-tree for you.
Lark focuses on simplicity, power, and speed. It lets you choose between two parsing algorithms:
**Experts**: Lark lets you choose between Earley and LALR(1), to trade-off power and speed. It also contains experimental features such as a contextual-lexer.
- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
- LALR(1): Only LR grammars. Outperforms PLY and most (if not all) other pure-python parsing libraries.
Lark can:
Both algorithms are written in Python and can be used interchangeably with the same grammar\*. Similarly, the lexer can be turned on/off without changing the grammar. That means you can write your parser without any limitations (just keep it context-free) and optimize it for speed only when you need to.
- Parse all context-free grammars, and handle all ambiguity (using Earley)
- Build a parse-tree automagically, no construction code required
- Outperform PLY (when using LALR(1))
- Run on every Python interpreter (it's pure-python)
Lark can automagically build an AST from your grammar, without any more code on your part.
And many more features. Read ahead and find out.
\* *Both the lexer and the LALR algorithm require certain limitations on the grammar. If you choose to use them, it's better to learn what they are first.*
### Hello World
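The intro above says Earley is the default and that the same grammar can be handed to LALR(1) when speed matters. A minimal usage sketch of that choice (the toy grammar and the `%ignore` directive are assumptions for illustration; only the `Lark(...)` constructor and `parse()` call are taken from the examples further down in this diff):

```python
from lark import Lark

# A made-up toy grammar: WORD is a named terminal, "," and "!" are anonymous
# string terminals, and single spaces are ignored (assumed %ignore syntax).
hello_grammar = r'''
    start: WORD "," WORD "!"
    WORD: /\w+/
    %ignore " "
'''

# Earley is the default algorithm and accepts any context-free grammar.
earley_parser = Lark(hello_grammar)
print(earley_parser.parse("Hello, World!"))

# The same grammar is LR, so it can also be parsed with LALR(1) for speed.
lalr_parser = Lark(hello_grammar, parser='lalr')
print(lalr_parser.parse("Hello, World!"))
```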
@@ -152,17 +153,21 @@ You can use the output as a regular python module:
- **Earley** parser
    - Can parse *ALL* context-free grammars
    - Accepts and resolves ambiguous grammars using a parse forest
    - Optional lexer
    - Resolves ambiguous grammars using a parse forest
    - Automatic & user-defined rule priority for ambiguity resolution
    - Dynamic lexer
- **LALR(1)** parser
    - Standard & Contextual lexers
- **EBNF** grammar (with a little extra)
    - Standard lexer (like PLY)
    - Contextual lexer (can handle some ambiguity and non-determinism)
- **EBNF** grammar (with a few extra features)
- Builds a parse-tree (AST) automagically based on the grammar
- Lexer with regular expressions (regexps)
    - Automatic line & column tracking
    - Automatic token collision resolution (unless both terminals are regexps)
- **Standard library** of terminals (strings, numbers, names, etc.)
- Automatic reconstruction of input (experimental, see examples)
- Experimental features:
    - Automatic reconstruction of input from parse-tree (see examples)
    - Import grammars from Nearley.js
- **Unicode** fully supported
- Extensive test suite
- **Python 2 & 3** compatible
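One feature listed above is automatic line & column tracking; the scanner changes later in this diff are what attach that information to each token. A small sketch of how it can surface to the user, assuming tokens expose `line` and `column` attributes as set by the `Token(...)` call in the Earley hunks below (the grammar is invented for illustration):

```python
from lark import Lark

# Invented toy grammar: an assignment of a number to a name.
grammar = r'''
    start: NAME "=" NUMBER
    NAME: /[a-z]+/
    NUMBER: /[0-9]+/
    %ignore " "
'''

tree = Lark(grammar).parse("answer = 42")
for token in tree.children:
    # Each leaf of the parse-tree is a Token that remembers where it matched.
    print(repr(token), token.line, token.column)
```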
@@ -47,12 +47,12 @@ class TreeToJson(Transformer):
    true = lambda self, _: True
    false = lambda self, _: False

# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
# def parse(x):
#     return TreeToJson().transform(json_parser.parse(x))
json_parser = Lark(json_grammar, parser='earley', lexer='standard')
def parse(x):
    return TreeToJson().transform(json_parser.parse(x))

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse
# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
# parse = json_parser.parse

def test():
    test_json = '''
@@ -112,9 +112,10 @@ class Terminal:
class Terminal_Regexp(Terminal):
    def __init__(self, data):
        Terminal.__init__(self, data)
        self.match = re.compile(data).match
    def __init__(self, name, regexp):
        Terminal.__init__(self, regexp)
        self.name = name
        self.match = re.compile(regexp).match

class Terminal_Token(Terminal):
    def match(self, other):
@@ -79,7 +79,7 @@ class Earley_NoLex:
            width = sre_parse.parse(regexp).getwidth()
            if width != (1,1):
                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
            yield Terminal_Regexp(regexp)
            yield Terminal_Regexp(sym, regexp)
        else:
            yield sym
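The hunk above threads the terminal's name into `Terminal_Regexp` while keeping the existing width check: scanless parsing (`lexer=None`) only accepts terminals whose regexps always match exactly one character, which is what `getwidth()` verifies. A standalone sketch of that check using the same `sre_parse` call (illustrative only, not part of the patch; `is_single_char` is a made-up name):

```python
import sre_parse

def is_single_char(regexp):
    # getwidth() returns the (min, max) lengths a match of the pattern can have;
    # scanless parsing needs every terminal to be exactly one character wide.
    return sre_parse.parse(regexp).getwidth() == (1, 1)

print(is_single_char(r'[a-z]'))  # True  - always exactly one character
print(is_single_char(r'abc'))    # False - width is (3, 3)
print(is_single_char(r'a?'))     # False - width is (0, 1)
```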
@@ -114,7 +114,7 @@ class XEarley:
        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
        resolve_ambiguity = (options.ambiguity=='resolve') if options else True
        ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]

        self.parser = xearley.Parser(rules,
                                     parser_conf.start,
@@ -129,7 +129,7 @@ class XEarley:
            regexp = self.token_by_name[sym].pattern.to_regexp()
            width = sre_parse.parse(regexp).getwidth()
            assert width
            yield Terminal_Regexp(regexp)
            yield Terminal_Regexp(sym, regexp)
        else:
            yield sym
@@ -21,6 +21,7 @@
from collections import defaultdict

from ..common import ParseError, UnexpectedToken, Terminal
from ..lexer import Token
from ..tree import Tree

from .grammar_analysis import GrammarAnalyzer
@@ -33,6 +34,7 @@ class Parser:
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = list(ignore)
        self.postprocess = {}
        self.predictions = {}

        for rule in self.analysis.rules:
@@ -46,6 +48,9 @@ class Parser:
        start_symbol = start_symbol or self.start_symbol
        delayed_matches = defaultdict(list)

        text_line = 1
        text_column = 0

        def predict(nonterm, column):
            assert not isinstance(nonterm, Terminal), nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
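The `self.predictions` dict added in the previous hunk lets `predict` look up every rule that derives a given nonterminal and seed a fresh item for it at the current column. A toy, self-contained sketch of that idea (these `Rule`/`Item` namedtuples are invented stand-ins, not Lark's actual classes):

```python
from collections import namedtuple

# Invented stand-ins for the parser's internals, just to show the shape of prediction.
Rule = namedtuple('Rule', 'origin expansion')
Item = namedtuple('Item', 'rule ptr start')

rules = [
    Rule('sum', ['sum', 'PLUS', 'product']),
    Rule('sum', ['product']),
    Rule('product', ['NUMBER']),
]

# Precompute nonterminal -> rules once, like self.predictions in the patch.
predictions = {}
for rule in rules:
    predictions.setdefault(rule.origin, []).append(rule)

def predict(nonterm, column):
    # One fresh item per rule deriving `nonterm`: dot at 0, started at `column`.
    return [Item(rule, 0, column) for rule in predictions[nonterm]]

print(predict('sum', 0))
```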
@@ -78,7 +83,8 @@ class Parser:
            for item in to_scan:
                m = item.expect.match(stream, i)
                if m:
                    delayed_matches[m.end()].append(item.advance(m.group(0)))
                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append(item.advance(t))

                    s = m.group(0)
                    for j in range(1, len(s)):
@@ -98,10 +104,16 @@ class Parser:
        column = column0
        for i, token in enumerate(stream):
            predict_and_complete(column)

            column = scan(i, token, column)
            if token == '\n':
                text_line += 1
                text_column = 0
            else:
                text_column += 1

        predict_and_complete(column)

        # Parse ended. Now build a parse tree
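The loop above is where the new counters advance: position `i` is scanned with the current `text_line`/`text_column`, and only afterwards does a newline bump the line and reset the column. A self-contained sketch of the same bookkeeping (illustrative, not the parser itself):

```python
def positions(stream):
    # Mirrors the counter updates above: report each character's position first,
    # then advance; a newline starts a new line at column 0.
    line, column = 1, 0
    for ch in stream:
        yield ch, line, column
        if ch == '\n':
            line += 1
            column = 0
        else:
            column += 1

for ch, line, column in positions("ab\ncd"):
    print(repr(ch), line, column)
```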