From e7445a11ae4cf8d5f4a284fff9eb2e24d9fe54b9 Mon Sep 17 00:00:00 2001
From: Erez Shinan <erezshin+git@gmail.com>
Date: Fri, 10 Feb 2017 16:10:13 +0200
Subject: [PATCH] Added a docs/reference

---
 README.md             |   1 +
 docs/json_tutorial.md |   9 ++-
 docs/reference.md     | 165 ++++++++++++++++++++++++++++++++++++++++++
 lark/lark.py          |   5 +-
 4 files changed, 175 insertions(+), 5 deletions(-)
 create mode 100644 docs/reference.md

diff --git a/README.md b/README.md
index 0c09541..7c20f86 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt
 
  - Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
  - Browse the [examples](/examples), which include a calculator, and a Python-code parser.
+ - Read the [reference](/docs/reference.md)
 
 ## List of Features
 
diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md
index a3d19da..bf740cc 100644
--- a/docs/json_tutorial.md
+++ b/docs/json_tutorial.md
@@ -20,13 +20,16 @@ Knowledge assumed:
 
 Lark accepts its grammars in a format called [EBNF](https://www.wikiwand.com/en/Extended_Backus%E2%80%93Naur_form). It basically looks like this:
 
-    rule_name: some rules and TOKENS
-             | or others
+    rule_name : list of rules and TOKENS to match
+              | another possible list of items
+              | etc.
 
     TOKEN: "some text to match"
 
 (*a token is a string or a regular expression*)
 
+The parser will try to match each rule (left-part) by matching its items (right-part) sequentially, trying each alternative (In practice, the parser is predictive so we don't have to try every alternative).
+
 How to structure those rules is beyond the scope of this tutorial, but often it's enough to follow one's intuition.
 
 In the case of JSON, the structure is simple: A json document is either a list, or a dictionary, or a string/number/etc.
@@ -393,7 +396,7 @@ PyPy is awesome!
 
 ### Conclusion
 
-We've brought the run-time down from 36 seconds to 1.4 seconds, in a series of small and simple steps.
+We've brought the run-time down from 36 seconds to 1.1 seconds, in a series of small and simple steps.
 
 Now let's compare the benchmarks in a nicely organized table.
 
diff --git a/docs/reference.md b/docs/reference.md
new file mode 100644
index 0000000..fc2b7af
--- /dev/null
+++ b/docs/reference.md
@@ -0,0 +1,165 @@
+# Lark Reference
+
+## What is Lark?
+
+Lark is a general-purpose parsing library. It's written in Python, and supports two parsing algorithms: Earley (default) and LALR(1).
+
+## Grammar
+
+Lark accepts its grammars in [EBNF](https://www.wikiwand.com/en/Extended_Backus%E2%80%93Naur_form) form.
+
+The grammar is a list of rules and tokens, each on its own line.
+
+Rules can be defined on multiple lines when using the *OR* operator ( | ).
+
+Comments start with // and last to the end of the line (C++ style)
+
+Lark begins the parse with the rule 'start', unless specified otherwise in the options.
+
+### Tokens
+
+Tokens are defined in terms of:
+
+    NAME : "string" or /regexp/
+                   
+    NAME.ignore : ..
+
+.ignore is a flag that drops the token before it reaches the parser (usually whitespace)
+
+Example:
+
+    IF: "if"
+
+    INTEGER : /[0-9]+/
+
+    WHITESPACE.ignore: /[ \t\n]+/
+
+### Rules
+
+Each rule is defined in terms of:
+
+    name : list of items to match
+         | another list of items    -> optional_alias
+         | etc.
+
+An alias is a name for the specific rule alternative. It affects tree construction.
+
+An item is a:
+    
+ - rule
+ - token
+ - (item item ..) - Group items
+ - [item item ..] - Maybe. Same as: "(item item ..)?"
+ - item? - Zero or one instances of item ("maybe")
+ - item\* - Zero or more instances of item
+ - item+ - One or more instances of item
+
+
+Example:
+
+    float: "-"? DIGIT* "." DIGIT+ exp
+         | "-"? DIGIT+ exp
+
+    exp: "-"? ("e" | "E") DIGIT+
+
+    DIGIT: /[0-9]/
+
+## Tree Construction
+
+Lark builds a tree automatically based on the structure of the grammar. It also accepts some hints.
+
+In general, Lark will place each rule as a branch, and its matches as the children of the branch.
+
+Using item+ or item\* will result in a list of items.
+
+Example:
+
+    expr: "(" expr ")"
+        | NAME+
+
+    NAME: /\w+/
+
+Lark will parse "((hello world))" as:
+
+    expr
+        expr
+            expr
+                "hello"
+                "world"
+
+The brackets do not appear in the tree by design.
+
+Tokens that won't appear in the tree are:
+
+ - Unnamed strings (like "keyword" or "+")
+ - Tokens whose name starts with an underscore (like \_DIGIT)
+
+Tokens that *will* appear in the tree are:
+
+ - Unnamed regular expressions (like /[0-9]/)
+ - Named tokens whose name starts with a letter (like DIGIT)
+
+## Shaping the tree
+
+1. Rules whose name begins with an underscore will be inlined into their containing rule.
+
+Example:
+
+    start: "(" _greet ")"
+    _greet: /\w+/ /\w+/
+
+Lark will parse "(hello world)" as:
+
+    start
+        "hello"
+        "world"
+
+
+2. Rules that receive a question mark (?) at the beginning of their definition will be inlined if they have a single child.
+
+Example:
+
+    start: greet greet
+    ?greet: "(" /\w+/ ")"
+          | /\w+/ /\w+/
+
+Lark will parse "hello world (planet)" as:
+
+    start
+        greet
+            "hello"
+            "world"
+        "planet"
+
+3. Aliases - options in a rule can receive an alias. It will then be used as the branch name for the option.
+
+Example:
+
+    start: greet greet
+    greet: "hello" -> hello
+         | "world"
+
+Lark will parse "hello world" as:
+
+    start
+        hello
+        greet
+
+## Lark Options
+
+When initializing the Lark object, you can provide it with keyword options:
+
+- start - The start symbol (Default: "start")
+- parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
+           Note: Both will use Lark's lexer.
+- transformer - Applies the transformer to every parse tree (only allowed with parser="lalr")
+- only\_lex - Don't build a parser. Useful for debugging (default: False)
+- postlex - Lexer post-processing (Default: None)
+- profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False)
+
+To be supported:
+
+- debug
+- cache\_grammar
+- keep\_all\_tokens
+
diff --git a/lark/lark.py b/lark/lark.py
index 8d4ad96..499ecff 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -21,10 +21,11 @@ class LarkOptions(object):
         transformer - Applies the transformer to every parse tree
         debug - Affects verbosity (default: False)
         only_lex - Don't build a parser. Useful for debugging (default: False)
-        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True)
+        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
         cache_grammar - Cache the Lark grammar (Default: False)
         postlex - Lexer post-processing (Default: None)
         start - The start symbol (Default: start)
+        profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False)
     """
     __doc__ += OPTIONS_DOC
     def __init__(self, options_dict):
@@ -39,7 +40,7 @@ class LarkOptions(object):
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
-        self.profile = o.pop('profile', False)  # XXX new
+        self.profile = o.pop('profile', False)
 
         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer: