From e7445a11ae4cf8d5f4a284fff9eb2e24d9fe54b9 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 10 Feb 2017 16:10:13 +0200 Subject: [PATCH] Added a docs/reference --- README.md | 1 + docs/json_tutorial.md | 9 ++- docs/reference.md | 165 ++++++++++++++++++++++++++++++++++++++++++ lark/lark.py | 5 +- 4 files changed, 175 insertions(+), 5 deletions(-) create mode 100644 docs/reference.md diff --git a/README.md b/README.md index 0c09541..7c20f86 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt - Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark. - Browse the [examples](/examples), which include a calculator, and a Python-code parser. + - Read the [reference](/docs/reference.md) ## List of Features diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index a3d19da..bf740cc 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -20,13 +20,16 @@ Knowledge assumed: Lark accepts its grammars in a format called [EBNF](https://www.wikiwand.com/en/Extended_Backus%E2%80%93Naur_form). It basically looks like this: - rule_name: some rules and TOKENS - | or others + rule_name : list of rules and TOKENS to match + | another possible list of items + | etc. TOKEN: "some text to match" (*a token is a string or a regular expression*) +The parser will try to match each rule (left-part) by matching its items (right-part) sequentially, trying each alternative (In practice, the parser is predictive so we don't have to try every alternative). + How to structure those rules is beyond the scope of this tutorial, but often it's enough to follow one's intuition. In the case of JSON, the structure is simple: A json document is either a list, or a dictionary, or a string/number/etc. @@ -393,7 +396,7 @@ PyPy is awesome! ### Conclusion -We've brought the run-time down from 36 seconds to 1.4 seconds, in a series of small and simple steps. 
+We've brought the run-time down from 36 seconds to 1.1 seconds, in a series of small and simple steps. Now let's compare the benchmarks in a nicely organized table. diff --git a/docs/reference.md b/docs/reference.md new file mode 100644 index 0000000..fc2b7af --- /dev/null +++ b/docs/reference.md @@ -0,0 +1,165 @@ +# Lark Reference + +## What is Lark? + +Lark is a general-purpose parsing library. It's written in Python, and supports two parsing algorithms: Earley (default) and LALR(1). + +## Grammar + +Lark accepts its grammars in [EBNF](https://www.wikiwand.com/en/Extended_Backus%E2%80%93Naur_form) form. + +The grammar is a list of rules and tokens, each on its own line. + +Rules can be defined on multiple lines when using the *OR* operator ( | ). + +Comments start with // and last to the end of the line (C++ style) + +Lark begins the parse with the rule 'start', unless specified otherwise in the options. + +### Tokens + +Tokens are defined in terms of: + + NAME : "string" or /regexp/ + + NAME.ignore : .. + +.ignore is a flag that drops the token before it reaches the parser (usually whitespace) + +Example: + + IF: "if" + + INTEGER : /[0-9]+/ + + WHITESPACE.ignore: /[ \t\n]+/ + +### Rules + +Each rule is defined in terms of: + + name : list of items to match + | another list of items -> optional_alias + | etc. + +An alias is a name for the specific rule alternative. It affects tree construction. + +An item is a: + + - rule + - token + - (item item ..) - Group items + - [item item ..] - Maybe. Same as: "(item item ..)?" + - item? - Zero or one instances of item ("maybe") + - item\* - Zero or more instances of item + - item+ - One or more instances of item + + +Example: + + float: "-"? DIGIT* "." DIGIT+ exp + | "-"? DIGIT+ exp + + exp: "-"? ("e" | "E") DIGIT+ + + DIGIT: /[0-9]/ + +## Tree Construction + +Lark builds a tree automatically based on the structure of the grammar. It also accepts some hints.
+ +In general, Lark will place each rule as a branch, and its matches as the children of the branch. + +Using item+ or item\* will result in a list of items. + +Example: + + expr: "(" expr ")" + | NAME+ + + NAME: /\w+/ + +Lark will parse "(((hello world)))" as: + + expr + expr + expr + "hello" + "world" + +The brackets do not appear in the tree by design. + +Tokens that won't appear in the tree are: + + - Unnamed strings (like "keyword" or "+") + - Tokens whose name starts with an underscore (like \_DIGIT) + +Tokens that *will* appear in the tree are: + + - Unnamed regular expressions (like /[0-9]/) + - Named tokens whose name starts with a letter (like DIGIT) + +## Shaping the tree + +1. Rules whose name begins with an underscore will be inlined into their containing rule. + +Example: + + start: "(" _greet ")" + _greet: /\w+/ /\w+/ + +Lark will parse "(hello world)" as: + + start + "hello" + "world" + + +2. Rules that receive a question mark (?) at the beginning of their definition, will be inlined if they have a single child. + +Example: + + start: greet greet + ?greet: "(" /\w+/ ")" + | /\w+/ /\w+/ + +Lark will parse "hello world (planet)" as: + + start + greet + "hello" + "world" + "planet" + +3. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option. + +Example: + + start: greet greet + greet: "hello" -> hello + | "world" + +Lark will parse "hello world" as: + + start + hello + greet + +## Lark Options + +When initializing the Lark object, you can provide it with keyword options: + +- start - The start symbol (Default: "start") +- parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") + Note: Both will use Lark's lexer. +- transformer - Applies the transformer to every parse tree (only allowed with parser="lalr") +- only\_lex - Don't build a parser. Useful for debugging (default: False) +- postlex - Lexer post-processing (Default: None) +- profile - Measure run-time usage in Lark.
Read results from the profiler property (Default: False) + +To be supported: + +- debug +- cache\_grammar +- keep\_all\_tokens + diff --git a/lark/lark.py b/lark/lark.py index 8d4ad96..499ecff 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -21,10 +21,11 @@ class LarkOptions(object): transformer - Applies the transformer to every parse tree debug - Affects verbosity (default: False) only_lex - Don't build a parser. Useful for debugging (default: False) - keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True) + keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) cache_grammar - Cache the Lark grammar (Default: False) postlex - Lexer post-processing (Default: None) start - The start symbol (Default: start) + profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) """ __doc__ += OPTIONS_DOC def __init__(self, options_dict): @@ -39,7 +40,7 @@ class LarkOptions(object): self.parser = o.pop('parser', 'earley') self.transformer = o.pop('transformer', None) self.start = o.pop('start', 'start') - self.profile = o.pop('profile', False) # XXX new + self.profile = o.pop('profile', False) assert self.parser in ENGINE_DICT if self.parser == 'earley' and self.transformer: