
Merge pull request #1 from lark-parser/master

Merge from origin.
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.8.2
Yuan authored 5 years ago, committed by GitHub
parent commit c95257c906
53 changed files with 1986 additions and 763 deletions
  1. .gitignore (+1 −0)
  2. README.md (+20 −5)
  3. docs/classes.md (+27 −117)
  4. docs/grammar.md (+77 −5)
  5. docs/how_to_develop.md (+2 −2)
  6. docs/how_to_use.md (+1 −1)
  7. docs/index.md (+3 −2)
  8. docs/json_tutorial.md (+10 −5)
  9. docs/parsers.md (+3 −3)
  10. docs/recipes.md (+6 −6)
  11. docs/tree_construction.md (+24 −0)
  12. docs/visitors.md (+117 −0)
  13. examples/README.md (+1 −0)
  14. examples/json_parser.py (+11 −1)
  15. examples/lark.lark (+6 −4)
  16. examples/python3.lark (+5 −5)
  17. examples/python_bytecode.py (+77 −0)
  18. examples/reconstruct_json.py (+2 −8)
  19. examples/standalone/create_standalone.sh (+1 −0)
  20. examples/standalone/json_parser.py (+418 −197)
  21. lark/__init__.py (+1 −1)
  22. lark/common.py (+1 −0)
  23. lark/exceptions.py (+14 −4)
  24. lark/grammar.py (+4 −2)
  25. lark/lark.py (+41 −40)
  26. lark/lexer.py (+84 −50)
  27. lark/load_grammar.py (+66 −57)
  28. lark/parse_tree_builder.py (+33 −9)
  29. lark/parser_frontends.py (+41 −18)
  30. lark/parsers/cyk.py (+8 −6)
  31. lark/parsers/earley.py (+25 −14)
  32. lark/parsers/earley_forest.py (+3 −3)
  33. lark/parsers/grammar_analysis.py (+46 −14)
  34. lark/parsers/lalr_analysis.py (+199 −52)
  35. lark/parsers/lalr_parser.py (+16 −16)
  36. lark/parsers/xearley.py (+4 −3)
  37. lark/reconstruct.py (+47 −12)
  38. lark/tools/nearley.py (+2 −2)
  39. lark/tools/serialize.py (+39 −0)
  40. lark/tools/standalone.py (+3 −0)
  41. lark/tree.py (+31 −28)
  42. lark/utils.py (+28 −2)
  43. lark/visitors.py (+113 −43)
  44. mkdocs.yml (+1 −0)
  45. readthedocs.yml (+10 −0)
  46. tests/__main__.py (+2 −1)
  47. tests/grammars/test_unicode.lark (+1 −0)
  48. tests/test_nearley/test_nearley.py (+4 −1)
  49. tests/test_parser.py (+239 −6)
  50. tests/test_reconstructor.py (+2 −2)
  51. tests/test_relative_import_unicode.lark (+3 −0)
  52. tests/test_tools.py (+8 −15)
  53. tests/test_trees.py (+55 −1)

.gitignore (+1 −0)

@@ -4,6 +4,7 @@
/lark_parser.egg-info/** /lark_parser.egg-info/**
tags tags
.vscode .vscode
.idea
.ropeproject .ropeproject
.cache .cache
/dist /dist


README.md (+20 −5)

@@ -34,13 +34,16 @@ Lark has no dependencies.


[![Build Status](https://travis-ci.org/lark-parser/lark.svg?branch=master)](https://travis-ci.org/lark-parser/lark) [![Build Status](https://travis-ci.org/lark-parser/lark.svg?branch=master)](https://travis-ci.org/lark-parser/lark)


### Syntax Highlighting (new)
### Syntax Highlighting


Lark now provides syntax highlighting for its grammar files (\*.lark):
Lark provides syntax highlighting for its grammar files (\*.lark):


- [Sublime Text & TextMate](https://github.com/lark-parser/lark_syntax) - [Sublime Text & TextMate](https://github.com/lark-parser/lark_syntax)
- [vscode](https://github.com/lark-parser/vscode-lark) - [vscode](https://github.com/lark-parser/vscode-lark)


### Clones

- [Lerche (Julia)](https://github.com/jamesrhester/Lerche.jl) - an unofficial clone, written entirely in Julia.


### Hello World ### Hello World


@@ -72,7 +75,7 @@ Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like ba


![fruitflies.png](examples/fruitflies.png) ![fruitflies.png](examples/fruitflies.png)


See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
See more [examples here](https://github.com/lark-parser/lark/tree/master/examples)






@@ -95,7 +98,7 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
- Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark) - Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark)
- And much more! - And much more!


See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/Features)
See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)




### Comparison to other libraries ### Comparison to other libraries
@@ -132,9 +135,21 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail


### Projects using Lark ### Projects using Lark


- [storyscript](https://github.com/storyscript/storyscript) - The programming language for Application Storytelling
- [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion. Lark is used to parse the GraphQL schemas definitions.
- [Hypothesis](https://github.com/HypothesisWorks/hypothesis) - Library for property-based testing
- [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration
- [synapse](https://github.com/vertexproject/synapse) - an intelligence analysis platform
- [Datacube-core](https://github.com/opendatacube/datacube-core) - Open Data Cube analyses continental scale Earth Observation data through time
- [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks
- [Torchani](https://github.com/aiqm/torchani) - Accurate Neural Network Potential on PyTorch
- [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands
- [Fabric-SDK-Py](https://github.com/hyperledger/fabric-sdk-py) - Hyperledger fabric SDK with Python 3.x
- [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings
- [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language
- [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer
- [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion (Lark is used to parse the GraphQL schemas definitions)
- [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory



Using Lark? Send me a message and I'll add your project! Using Lark? Send me a message and I'll add your project!




docs/classes.md (+27 −117)

@@ -1,42 +1,42 @@
# Classes - Reference
# Classes Reference


This page details the important classes in Lark. This page details the important classes in Lark.


---- ----


## Lark
## lark.Lark


The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor. The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor.


### Methods

#### \_\_init\_\_(self, grammar, **options) #### \_\_init\_\_(self, grammar, **options)


The Lark class accepts a grammar string or file object, and keyword options: The Lark class accepts a grammar string or file object, and keyword options:


* start - The symbol in the grammar that begins the parse (Default: `"start"`)
* **start** - A list of the rules in the grammar that begin the parse (Default: `["start"]`)


* parser - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`)
* **parser** - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`)


* lexer - Overrides default lexer.
* **lexer** - Overrides default lexer, depending on parser.


* transformer - Applies the transformer instead of building a parse tree (only allowed with parser="lalr")
* **transformer** - Applies the provided transformer instead of building a parse tree (only allowed with parser="lalr")


* postlex - Lexer post-processing (Default: None. only works when lexer is "standard" or "contextual")
* **postlex** - Lexer post-processing (Default: `None`. Only works when lexer is "standard" or "contextual")


* ambiguity (only relevant for earley and cyk)
* **ambiguity** (only relevant for earley and cyk)


* "explicit" - Return all derivations inside an "_ambig" data node. * "explicit" - Return all derivations inside an "_ambig" data node.


* "resolve" - Let the parser choose the best derivation (greedy for tokens, non-greedy for rules. Default) * "resolve" - Let the parser choose the best derivation (greedy for tokens, non-greedy for rules. Default)


* debug - Display warnings (such as Shift-Reduce warnings for LALR)
* **debug** - Display warnings (such as Shift-Reduce warnings for LALR)

* **keep_all_tokens** - Don't throw away any terminals from the tree (Default=`False`)


* keep_all_tokens - Don't throw away any terminals from the tree (Default=False)
* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`False`)


* propagate_positions - Propagate line/column count to tree nodes (default=False)
* **maybe_placeholders** - When `True`, the `[]` operator returns `None` when not matched. When `False`, `[]` behaves like the `?` operator, and returns no value at all, which may be a little faster (default=`False`)


* lexer_callbacks - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information.
* **lexer_callbacks** - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information.


#### parse(self, text) #### parse(self, text)


@@ -50,13 +50,10 @@ If a transformer is supplied to `__init__`, returns whatever is the result of th


The main tree class The main tree class


### Properties

* `data` - The name of the rule or alias * `data` - The name of the rule or alias
* `children` - List of matched sub-rules and terminals * `children` - List of matched sub-rules and terminals
* `meta` - Line & Column numbers, if using `propagate_positions`

### Methods
* `meta` - Line & Column numbers (if `propagate_positions` is enabled)
* meta attributes: `line`, `column`, `start_pos`, `end_line`, `end_column`, `end_pos`


#### \_\_init\_\_(self, data, children) #### \_\_init\_\_(self, data, children)


@@ -92,102 +89,6 @@ Trees can be hashed and compared.


---- ----


## Transformers & Visitors

Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns.

They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each methods accepts the children as an argument. That can be modified using the `v-args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument.

See: https://github.com/lark-parser/lark/blob/master/lark/visitors.py

### Visitors

Visitors visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up, starting with the leaves and ending at the root of the tree.

**Example**
```python
class IncreaseAllNumbers(Visitor):
def number(self, tree):
assert tree.data == "number"
tree.children[0] += 1

IncreaseAllNumbers().visit(parse_tree)
```

There are two classes that implement the visitor interface:

* Visitor - Visit every node (without recursion)

* Visitor_Recursive - Visit every node using recursion. Slightly faster.

### Transformers

Transformers visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree.

Transformers can be used to implement map & reduce patterns.

Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable).

Transformers can be chained into a new transformer by using multiplication.

**Example:**
```python
from lark import Tree, Transformer

class EvalExpressions(Transformer):
def expr(self, args):
return eval(args[0])

t = Tree('a', [Tree('expr', ['1+2'])])
print(EvalExpressions().transform( t ))

# Prints: Tree(a, [3])
```


Here are the classes that implement the transformer interface:

- Transformer - Recursively transforms the tree. This is the one you probably want.
- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances
- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances

### v_args

`v_args` is a decorator.

By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior.

When used on a transformer/visitor class definition, it applies to all the callback methods inside it.

`v_args` accepts one of three flags:

- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists).
- `meta` - Provides two arguments: `children` and `meta` (instead of just the first)
- `tree` - Provides the entire tree as the argument, instead of the children.

Examples:

```python
@v_args(inline=True)
class SolveArith(Transformer):
def add(self, left, right):
return left + right


class ReverseNotation(Transformer_InPlace):
@v_args(tree=True):
def tree_node(self, tree):
tree.children = tree.children[::-1]
```

### Discard

When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent.

## Token ## Token


When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes: When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes:
@@ -198,18 +99,27 @@ When using a lexer, the resulting tokens in the trees will be of the Token class
* `column` - The column of the token in the text (starting with 1) * `column` - The column of the token in the text (starting with 1)
* `end_line` - The line where the token ends * `end_line` - The line where the token ends
* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. * `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5.
* `end_pos` - the index where the token ends (basically pos_in_stream + len(token))

## Transformer
## Visitor
## Interpreter

See the [visitors page](visitors.md)




## UnexpectedInput ## UnexpectedInput


## UnexpectedToken

## UnexpectedException

- `UnexpectedInput` - `UnexpectedInput`
- `UnexpectedToken` - The parser recieved an unexpected token - `UnexpectedToken` - The parser recieved an unexpected token
- `UnexpectedCharacters` - The lexer encountered an unexpected string - `UnexpectedCharacters` - The lexer encountered an unexpected string


After catching one of these exceptions, you may call the following helper methods to create a nicer error message: After catching one of these exceptions, you may call the following helper methods to create a nicer error message:


### Methods

#### get_context(text, span) #### get_context(text, span)


Returns a pretty string pinpointing the error in the text, with `span` amount of context characters around it. Returns a pretty string pinpointing the error in the text, with `span` amount of context characters around it.
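
For reviewers who want to see these helpers in action, here is a minimal sketch (not part of this diff; the grammar and input are invented for illustration) of catching an `UnexpectedInput` subclass and calling `get_context`:

```python
from lark import Lark, UnexpectedInput

# Hypothetical toy grammar, used only to demonstrate get_context()
list_parser = Lark(r"""
    start: "[" [NUMBER ("," NUMBER)*] "]"
    NUMBER: /\d+/
    %ignore " "
""", parser='lalr')

text = "[1, 2, oops]"
try:
    list_parser.parse(text)
except UnexpectedInput as e:
    # Pinpoint the error with 20 characters of surrounding context
    print(e.get_context(text, 20))
```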


docs/grammar.md (+77 −5)

@@ -1,5 +1,13 @@
# Grammar Reference # Grammar Reference


Table of contents:

1. [Definitions](#defs)
1. [Terminals](#terms)
1. [Rules](#rules)
1. [Directives](#dirs)

<a name="defs"></a>
## Definitions ## Definitions


**A grammar** is a list of rules and terminals, that together define a language. **A grammar** is a list of rules and terminals, that together define a language.
@@ -25,6 +33,7 @@ Lark begins the parse with the rule 'start', unless specified otherwise in the o
Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner). Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner).




<a name="terms"></a>
## Terminals ## Terminals


Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals. Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals.
@@ -45,6 +54,16 @@ Literals can be one of:
* `/re with flags/imulx` * `/re with flags/imulx`
* Literal range: `"a".."z"`, `"1".."9"`, etc. * Literal range: `"a".."z"`, `"1".."9"`, etc.


Terminals also support grammar operators, such as `|`, `+`, `*` and `?`.

Terminals are a linear construct, and therefore may not contain themselves (recursion isn't allowed).

### Priority

Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing).

Priority can be either positive or negative. If not specified for a terminal, it's assumed to be 1 (i.e. the default).

#### Notes for when using a lexer: #### Notes for when using a lexer:


When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in an order according to the following criteria: When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in an order according to the following criteria:
@@ -59,11 +78,58 @@ When using a lexer (standard or contextual), it is the grammar-author's responsi
IF: "if" IF: "if"
INTEGER : /[0-9]+/ INTEGER : /[0-9]+/
INTEGER2 : ("0".."9")+ //# Same as INTEGER INTEGER2 : ("0".."9")+ //# Same as INTEGER
DECIMAL.2: INTEGER "." INTEGER //# Will be matched before INTEGER
DECIMAL.2: INTEGER? "." INTEGER //# Will be matched before INTEGER
WHITESPACE: (" " | /\t/ )+ WHITESPACE: (" " | /\t/ )+
SQL_SELECT: "select"i SQL_SELECT: "select"i
``` ```


### Regular expressions & Ambiguity

Each terminal is eventually compiled to a regular expression. All the operators and references inside it are mapped to their respective expressions.

For example, in the following grammar, `A1` and `A2`, are equivalent:
```perl
A1: "a" | "b"
A2: /a|b/
```

This means that inside terminals, Lark cannot detect or resolve ambiguity, even when using Earley.

For example, for this grammar:
```perl
start : (A | B)+
A : "a" | "ab"
B : "b"
```
We get this behavior:

```bash
>>> p.parse("ab")
Tree(start, [Token(A, 'a'), Token(B, 'b')])
```

This is happening because Python's regex engine always returns the first matching option.

If you find yourself in this situation, the recommended solution is to use rules instead.

Example:

```python
>>> p = Lark("""start: (a | b)+
... !a: "a" | "ab"
... !b: "b"
... """, ambiguity="explicit")
>>> print(p.parse("ab").pretty())
_ambig
  start
    a   ab
  start
    a   a
    b   b
```


<a name="rules"></a>
## Rules ## Rules


**Syntax:** **Syntax:**
@@ -85,24 +151,30 @@ Each item is one of:
* `TERMINAL` * `TERMINAL`
* `"string literal"` or `/regexp literal/` * `"string literal"` or `/regexp literal/`
* `(item item ..)` - Group items * `(item item ..)` - Group items
* `[item item ..]` - Maybe. Same as `(item item ..)?`
* `[item item ..]` - Maybe. Same as `(item item ..)?`, but generates `None` if there is no match
* `item?` - Zero or one instances of item ("maybe") * `item?` - Zero or one instances of item ("maybe")
* `item*` - Zero or more instances of item * `item*` - Zero or more instances of item
* `item+` - One or more instances of item * `item+` - One or more instances of item
* `item ~ n` - Exactly *n* instances of item * `item ~ n` - Exactly *n* instances of item
* `item ~ n..m` - Between *n* to *m* instances of item
* `item ~ n..m` - Between *n* to *m* instances of item (not recommended for wide ranges, due to performance issues)


**Examples:** **Examples:**
```perl ```perl
hello_world: "hello" "world" hello_world: "hello" "world"
mul: [mul "*"] number //# Left-recursion is allowed!
mul: (mul "*")? number //# Left-recursion is allowed and encouraged!
expr: expr operator expr expr: expr operator expr
| value //# Multi-line, belongs to expr | value //# Multi-line, belongs to expr


four_words: word ~ 4 four_words: word ~ 4
``` ```


### Priority

Rules can be assigned priority only when using Earley (future versions may support LALR as well).

Priority can be either positive or negative. If not specified for a rule, it's assumed to be 1 (i.e. the default).


<a name="dirs"></a>
## Directives ## Directives


### %ignore ### %ignore
@@ -111,7 +183,7 @@ All occurrences of the terminal will be ignored, and won't be part of the parse.


Using the `%ignore` directive results in a cleaner grammar. Using the `%ignore` directive results in a cleaner grammar.


It's especially important for the LALR(1) algorithm, because adding whitespace (or comments, or other extranous elements) explicitly in the grammar, harms its predictive abilities, which are based on a lookahead of 1.
It's especially important for the LALR(1) algorithm, because adding whitespace (or comments, or other extraneous elements) explicitly in the grammar, harms its predictive abilities, which are based on a lookahead of 1.


**Syntax:** **Syntax:**
```html ```html


docs/how_to_develop.md (+2 −2)

@@ -7,7 +7,7 @@ There are many ways you can help the project:
* Write new grammars for Lark's library * Write new grammars for Lark's library
* Write a blog post introducing Lark to your audience * Write a blog post introducing Lark to your audience
* Port Lark to another language * Port Lark to another language
* Help me with code developemnt
* Help me with code development


If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process. If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process.


@@ -60,4 +60,4 @@ Another way to run the tests is using setup.py:


```bash ```bash
python setup.py test python setup.py test
```
```

docs/how_to_use.md (+1 −1)

@@ -10,7 +10,7 @@ This is the recommended process for working with Lark:


3. Try your grammar in Lark against each input sample. Make sure the resulting parse-trees make sense. 3. Try your grammar in Lark against each input sample. Make sure the resulting parse-trees make sense.


4. Use Lark's grammar features to [[shape the tree|Tree Construction]]: Get rid of superfluous rules by inlining them, and use aliases when specific cases need clarification.
4. Use Lark's grammar features to [shape the tree](tree_construction.md): Get rid of superfluous rules by inlining them, and use aliases when specific cases need clarification.


- You can perform steps 1-4 repeatedly, gradually growing your grammar to include more sentences. - You can perform steps 1-4 repeatedly, gradually growing your grammar to include more sentences.




docs/index.md (+3 −2)

@@ -35,8 +35,8 @@ $ pip install lark-parser
* [Examples](https://github.com/lark-parser/lark/tree/master/examples) * [Examples](https://github.com/lark-parser/lark/tree/master/examples)
* Tutorials * Tutorials
* [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter * [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter
* [How to write a JSON parser](json_tutorial.md)
* External
* [How to write a JSON parser](json_tutorial.md) - Teaches you how to use Lark
* Unofficial
* [Program Synthesis is Possible](https://www.cs.cornell.edu/~asampson/blog/minisynth.html) - Creates a DSL for Z3 * [Program Synthesis is Possible](https://www.cs.cornell.edu/~asampson/blog/minisynth.html) - Creates a DSL for Z3
* Guides * Guides
* [How to use Lark](how_to_use.md) * [How to use Lark](how_to_use.md)
@@ -44,6 +44,7 @@ $ pip install lark-parser
* Reference * Reference
* [Grammar](grammar.md) * [Grammar](grammar.md)
* [Tree Construction](tree_construction.md) * [Tree Construction](tree_construction.md)
* [Visitors & Transformers](visitors.md)
* [Classes](classes.md) * [Classes](classes.md)
* [Cheatsheet (PDF)](lark_cheatsheet.pdf) * [Cheatsheet (PDF)](lark_cheatsheet.pdf)
* Discussion * Discussion


docs/json_tutorial.md (+10 −5)

@@ -230,7 +230,8 @@ from lark import Transformer
class MyTransformer(Transformer): class MyTransformer(Transformer):
def list(self, items): def list(self, items):
return list(items) return list(items)
def pair(self, (k,v)):
def pair(self, key_value):
k, v = key_value
return k, v return k, v
def dict(self, items): def dict(self, items):
return dict(items) return dict(items)
@@ -251,9 +252,11 @@ Also, our definitions of list and dict are a bit verbose. We can do better:
from lark import Transformer from lark import Transformer


class TreeToJson(Transformer): class TreeToJson(Transformer):
def string(self, (s,)):
def string(self, s):
(s,) = s
return s[1:-1] return s[1:-1]
def number(self, (n,)):
def number(self, n):
(n,) = n
return float(n) return float(n)


list = list list = list
@@ -315,9 +318,11 @@ json_grammar = r"""
""" """


class TreeToJson(Transformer): class TreeToJson(Transformer):
def string(self, (s,)):
def string(self, s):
(s,) = s
return s[1:-1] return s[1:-1]
def number(self, (n,)):
def number(self, n):
(n,) = n
return float(n) return float(n)


list = list list = list


docs/parsers.md (+3 −3)

@@ -5,9 +5,9 @@ Lark implements the following parsing algorithms: Earley, LALR(1), and CYK


An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar at O(n^3), and O(n^2) when the grammar is unambiguous. It can parse most LR grammars at O(n). Most programming languages are LR, and can be parsed at a linear time. An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar at O(n^3), and O(n^2) when the grammar is unambiguous. It can parse most LR grammars at O(n). Most programming languages are LR, and can be parsed at a linear time.


Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitely using `lexer='dynamic'`.
Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.


It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`


**SPPF & Ambiguity resolution** **SPPF & Ambiguity resolution**


@@ -21,7 +21,7 @@ Lark provides the following options to combat ambiguity:


1) Lark will choose the best derivation for you (default). Users can choose between different disambiguation strategies, and can prioritize (or demote) individual rules over others, using the rule-priority syntax. 1) Lark will choose the best derivation for you (default). Users can choose between different disambiguation strategies, and can prioritize (or demote) individual rules over others, using the rule-priority syntax.


2) Users may choose to recieve the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs.
2) Users may choose to receive the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs.


3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. Future versions of Lark intend to improve and simplify this interface. 3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. Future versions of Lark intend to improve and simplify this interface.




docs/recipes.md (+6 −6)

@@ -19,18 +19,18 @@ It only works with the standard and contextual lexers.
### Example 1: Replace string values with ints for INT tokens ### Example 1: Replace string values with ints for INT tokens


```python ```python
from lark import Lark, Token
from lark import Lark, Transformer


def tok_to_int(tok):
"Convert the value of `tok` from string to int, while maintaining line number & column."
# tok.type == 'INT'
return Token.new_borrow_pos(tok.type, int(tok), tok)
class T(Transformer):
def INT(self, tok):
"Convert the value of `tok` from string to int, while maintaining line number & column."
return tok.update(value=int(tok))


parser = Lark(""" parser = Lark("""
start: INT* start: INT*
%import common.INT %import common.INT
%ignore " " %ignore " "
""", parser="lalr", lexer_callbacks = {'INT': tok_to_int})
""", parser="lalr", transformer=T())


print(parser.parse('3 14 159')) print(parser.parse('3 14 159'))
``` ```


docs/tree_construction.md (+24 −0)

@@ -7,6 +7,12 @@ For example, the rule `node: child1 child2` will create a tree node with two chi


Using `item+` or `item*` will result in a list of items, equivalent to writing `item item item ..`. Using `item+` or `item*` will result in a list of items, equivalent to writing `item item item ..`.


Using `item?` will return the item if it matched, or nothing.

If `maybe_placeholders=False` (the default), then `[]` behaves like `()?`.

If `maybe_placeholders=True`, then using `[item]` will return the item if it matched, or the value `None`, if it didn't.

### Terminals ### Terminals


Terminals are always values in the tree, never branches. Terminals are always values in the tree, never branches.
@@ -23,6 +29,24 @@ Lark filters out certain types of terminals by default, considering them punctua
- Unnamed regular expressions (like `/[0-9]/`) - Unnamed regular expressions (like `/[0-9]/`)
- Named terminals whose name starts with a letter (like `DIGIT`) - Named terminals whose name starts with a letter (like `DIGIT`)


Note: Terminals composed of literals and other terminals always include the entire match without filtering any part.

**Example:**
```
start: PNAME pname

PNAME: "(" NAME ")"
pname: "(" NAME ")"

NAME: /\w+/
%ignore /\s+/
```
Lark will parse "(Hello) (World)" as:

    start
      (Hello)
      pname
        World

Rules prefixed with `!` will retain all their literals regardless. Rules prefixed with `!` will retain all their literals regardless.
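
As a side note for reviewers, the `maybe_placeholders` behaviour described above can be demonstrated with a tiny, invented grammar (this sketch is not part of the diff):

```python
from lark import Lark

# Hypothetical grammar: the NUMBER after "a" is optional
grammar = r"""
    start: "a" [NUMBER]
    NUMBER: /\d+/
    %ignore " "
"""

print(Lark(grammar, maybe_placeholders=False).parse("a").children)  # []
print(Lark(grammar, maybe_placeholders=True).parse("a").children)   # [None]
```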






docs/visitors.md (+117 −0)

@@ -0,0 +1,117 @@
## Transformers & Visitors

Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns.

They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each method accepts the children as an argument. That can be modified using the `v_args` decorator, which allows inlining the arguments (akin to `*args`), or adding the tree's `meta` property as an argument.

See: <a href="https://github.com/lark-parser/lark/blob/master/lark/visitors.py">visitors.py</a>

### Visitors

Visitors visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up, starting with the leaves and ending at the root of the tree.

**Example**
```python
class IncreaseAllNumbers(Visitor):
    def number(self, tree):
        assert tree.data == "number"
        tree.children[0] += 1

IncreaseAllNumbers().visit(parse_tree)
```

There are two classes that implement the visitor interface:

* Visitor - Visit every node (without recursion)

* Visitor_Recursive - Visit every node using recursion. Slightly faster.

### Transformers

Transformers visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree.

Transformers can be used to implement map & reduce patterns.

Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable).

Transformers can be chained into a new transformer by using multiplication.

`Transformer` can do anything `Visitor` can do, but because it reconstructs the tree, it is slightly less efficient.


**Example:**
```python
from lark import Tree, Transformer

class EvalExpressions(Transformer):
    def expr(self, args):
        return eval(args[0])

t = Tree('a', [Tree('expr', ['1+2'])])
print(EvalExpressions().transform( t ))

# Prints: Tree(a, [3])
```

All these classes implement the transformer interface:

- Transformer - Recursively transforms the tree. This is the one you probably want.
- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances
- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances

### visit_tokens

By default, transformers only visit rules. `visit_tokens=True` will tell Transformer to visit tokens as well. This is a slightly slower alternative to `lexer_callbacks`, but it's easier to maintain and works for all algorithms (even when there isn't a lexer).

Example:

```python
class T(Transformer):
    INT = int
    NUMBER = float
    def NAME(self, name):
        return lookup_dict.get(name, name)


T(visit_tokens=True).transform(tree)
```


### v_args

`v_args` is a decorator.

By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior.

When used on a transformer/visitor class definition, it applies to all the callback methods inside it.

`v_args` accepts one of three flags:

- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists).
- `meta` - Provides two arguments: `children` and `meta` (instead of just the first)
- `tree` - Provides the entire tree as the argument, instead of the children.

Examples:

```python
@v_args(inline=True)
class SolveArith(Transformer):
    def add(self, left, right):
        return left + right


class ReverseNotation(Transformer_InPlace):
    @v_args(tree=True)
    def tree_node(self, tree):
        tree.children = tree.children[::-1]
```

### Discard

When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent.
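
The new page shows no snippet for `Discard`, so here is a small illustrative sketch (not part of the diff; the `comment` rule and the input are invented):

```python
from lark import Lark
from lark.visitors import Transformer, Discard

class DropComments(Transformer):
    def comment(self, children):
        # Raising Discard removes this node from its parent's children
        raise Discard

parser = Lark(r"""
    start: (word | comment)+
    comment: "#" WORD
    word: WORD
    WORD: /\w+/
    %ignore " "
""")

print(DropComments().transform(parser.parse("hello #skipme world")).pretty())
```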



examples/README.md (+1 −0)

@@ -27,6 +27,7 @@ For example, the following will parse all the Python files in the standard libra


- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser - [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser
- [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!)
- [python\_bytecode.py](python_bytecode.py) - A toy example showing how to compile Python directly to bytecode
- [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language - [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language
- [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language - [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language
- [custom\_lexer.py](custom_lexer.py) - Demonstrates using a custom lexer to parse a non-textual stream of data - [custom\_lexer.py](custom_lexer.py) - Demonstrates using a custom lexer to parse a non-textual stream of data


examples/json_parser.py (+11 −1)

@@ -49,11 +49,21 @@ class TreeToJson(Transformer):
false = lambda self, _: False false = lambda self, _: False




### Create the JSON parser with Lark, using the Earley algorithm
# json_parser = Lark(json_grammar, parser='earley', lexer='standard') # json_parser = Lark(json_grammar, parser='earley', lexer='standard')
# def parse(x): # def parse(x):
# return TreeToJson().transform(json_parser.parse(x)) # return TreeToJson().transform(json_parser.parse(x))


json_parser = Lark(json_grammar, parser='lalr', lexer='standard', transformer=TreeToJson())
### Create the JSON parser with Lark, using the LALR algorithm
json_parser = Lark(json_grammar, parser='lalr',
# Using the standard lexer isn't required, and isn't usually recommended.
# But, it's good enough for JSON, and it's slightly faster.
lexer='standard',
# Disabling propagate_positions and placeholders slightly improves speed
propagate_positions=False,
maybe_placeholders=False,
# Using an internal transformer is faster and more memory efficient
transformer=TreeToJson())
parse = json_parser.parse parse = json_parser.parse






examples/lark.lark (+6 −4)

@@ -10,10 +10,12 @@ token: TOKEN priority? ":" expansions _NL
priority: "." NUMBER priority: "." NUMBER


statement: "%ignore" expansions _NL -> ignore statement: "%ignore" expansions _NL -> ignore
| "%import" import_args ["->" name] _NL -> import
| "%import" import_path ["->" name] _NL -> import
| "%import" import_path name_list _NL -> multi_import
| "%declare" name+ -> declare | "%declare" name+ -> declare


import_args: "."? name ("." name)*
!import_path: "."? name ("." name)*
name_list: "(" name ("," name)* ")"


?expansions: alias (_VBAR alias)* ?expansions: alias (_VBAR alias)*


@@ -33,7 +35,7 @@ name: RULE
| TOKEN | TOKEN


_VBAR: _NL? "|" _VBAR: _NL? "|"
OP: /[+*][?]?|[?](?![a-z])/
OP: /[+*]|[?](?![a-z])/
RULE: /!?[_?]?[a-z][_a-z0-9]*/ RULE: /!?[_?]?[a-z][_a-z0-9]*/
TOKEN: /_?[A-Z][_A-Z0-9]*/ TOKEN: /_?[A-Z][_A-Z0-9]*/
STRING: _STRING "i"? STRING: _STRING "i"?
@@ -44,7 +46,7 @@ _NL: /(\r?\n)+\s*/
%import common.INT -> NUMBER %import common.INT -> NUMBER
%import common.WS_INLINE %import common.WS_INLINE


COMMENT: "//" /[^\n]/*
COMMENT: /\s*/ "//" /[^\n]/*


%ignore WS_INLINE %ignore WS_INLINE
%ignore COMMENT %ignore COMMENT
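
For context, the new `import_path`/`name_list` rules above are what allow a single `%import` statement to pull in several names. A hedged sketch (the grammar content is invented, not taken from this diff):

```python
from lark import Lark

# Hypothetical grammar using the multi-import form of %import
parser = Lark(r"""
    start: INT+
    %import common (INT, WS)
    %ignore WS
""")

print(parser.parse("1 2 3"))
```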

examples/python3.lark (+5 −5)

@@ -81,7 +81,7 @@ with_item: test ["as" expr]
except_clause: "except" [test ["as" NAME]] except_clause: "except" [test ["as" NAME]]
suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT


?test: or_test ["if" or_test "else" test] | lambdef
?test: or_test ("if" or_test "else" test)? | lambdef
?test_nocond: or_test | lambdef_nocond ?test_nocond: or_test | lambdef_nocond
lambdef: "lambda" [varargslist] ":" test lambdef: "lambda" [varargslist] ":" test
lambdef_nocond: "lambda" [varargslist] ":" test_nocond lambdef_nocond: "lambda" [varargslist] ":" test_nocond
@@ -107,7 +107,7 @@ star_expr: "*" expr
// sake of a __future__ import described in PEP 401 (which really works :-) // sake of a __future__ import described in PEP 401 (which really works :-)
!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" !_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"


?power: await_expr ["**" factor]
?power: await_expr ("**" factor)?
?await_expr: AWAIT? atom_expr ?await_expr: AWAIT? atom_expr
AWAIT: "await" AWAIT: "await"


@@ -137,7 +137,7 @@ dictorsetmaker: ( ((test ":" test | "**" expr) (comp_for | ("," (test ":" test |


classdef: "class" NAME ["(" [arguments] ")"] ":" suite classdef: "class" NAME ["(" [arguments] ")"] ":" suite


arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])?
| starargs | starargs
| kwargs | kwargs
| test comp_for | test comp_for
@@ -145,7 +145,7 @@ arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs]
kwargs: "**" test kwargs: "**" test


?argvalue: test ["=" test]
?argvalue: test ("=" test)?






@@ -178,7 +178,7 @@ HEX_NUMBER.2: /0x[\da-f]*/i
OCT_NUMBER.2: /0o[0-7]*/i OCT_NUMBER.2: /0o[0-7]*/i
BIN_NUMBER.2 : /0b[0-1]*/i BIN_NUMBER.2 : /0b[0-1]*/i
FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
IMAG_NUMBER.2: /\d+j|${FLOAT_NUMBER}j/i
IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i


%ignore /[\t \f]+/ // WS %ignore /[\t \f]+/ // WS
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT %ignore /\\[\t \f]*\r?\n/ // LINE_CONT


examples/python_bytecode.py (+77 −0)

@@ -0,0 +1,77 @@
#
# This is a toy example that compiles Python directly to bytecode, without generating an AST.
# It currently only works for very very simple Python code.
#
# It requires the 'bytecode' library. You can get it using
#
# $ pip install bytecode
#

from lark import Lark, Transformer, v_args
from lark.indenter import Indenter

from bytecode import Instr, Bytecode

class PythonIndenter(Indenter):
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8


@v_args(inline=True)
class Compile(Transformer):
    def number(self, n):
        return [Instr('LOAD_CONST', int(n))]
    def string(self, s):
        return [Instr('LOAD_CONST', s[1:-1])]
    def var(self, n):
        return [Instr('LOAD_NAME', n)]

    def arith_expr(self, a, op, b):
        # TODO support chain arithmetic
        assert op == '+'
        return a + b + [Instr('BINARY_ADD')]

    def arguments(self, args):
        return args

    def funccall(self, name, args):
        return name + args + [Instr('CALL_FUNCTION', 1)]

    @v_args(inline=False)
    def file_input(self, stmts):
        return sum(stmts, []) + [Instr("RETURN_VALUE")]

    def expr_stmt(self, lval, rval):
        # TODO more complicated than that
        name ,= lval
        assert name.name == 'LOAD_NAME'    # XXX avoid with another layer of abstraction
        return rval + [Instr("STORE_NAME", name.arg)]

    def __default__(self, *args):
        assert False, args


python_parser3 = Lark.open('python3.lark', rel_to=__file__, start='file_input',
                           parser='lalr', postlex=PythonIndenter(),
                           transformer=Compile(), propagate_positions=False)

def compile_python(s):
    insts = python_parser3.parse(s+"\n")
    return Bytecode(insts).to_code()

code = compile_python("""
a = 3
b = 5
print("Hello World!")
print(a+(b+2))
print((a+b)+2)
""")
exec(code)
# -- Output --
# Hello World!
# 10
# 10

examples/reconstruct_json.py (+2 −8)

@@ -25,15 +25,9 @@ test_json = '''


def test_earley(): def test_earley():


json_parser = Lark(json_grammar)
json_parser = Lark(json_grammar, maybe_placeholders=False)
tree = json_parser.parse(test_json) tree = json_parser.parse(test_json)


# print ('@@', tree.pretty())
# for x in tree.find_data('true'):
# x.data = 'false'
# # x.children[0].value = '"HAHA"'


new_json = Reconstructor(json_parser).reconstruct(tree) new_json = Reconstructor(json_parser).reconstruct(tree)
print (new_json) print (new_json)
print (json.loads(new_json) == json.loads(test_json)) print (json.loads(new_json) == json.loads(test_json))
@@ -41,7 +35,7 @@ def test_earley():


def test_lalr(): def test_lalr():


json_parser = Lark(json_grammar, parser='lalr')
json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
tree = json_parser.parse(test_json) tree = json_parser.parse(test_json)


new_json = Reconstructor(json_parser).reconstruct(tree) new_json = Reconstructor(json_parser).reconstruct(tree)


examples/standalone/create_standalone.sh (+1 −0)

@@ -1 +1,2 @@
#!/bin/sh
PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py

examples/standalone/json_parser.py (+418 −197)
File diff suppressed because it is too large


lark/__init__.py (+1 −1)

@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
from .lexer import Token from .lexer import Token
from .lark import Lark from .lark import Lark


__version__ = "0.7.0"
__version__ = "0.8.1"

lark/common.py (+1 −0)

@@ -20,6 +20,7 @@ class LexerConf(Serialize):


class ParserConf: class ParserConf:
def __init__(self, rules, callbacks, start): def __init__(self, rules, callbacks, start):
assert isinstance(start, list)
self.rules = rules self.rules = rules
self.callbacks = callbacks self.callbacks = callbacks
self.start = start self.start = start


lark/exceptions.py (+14 −4)

@@ -13,6 +13,14 @@ class ParseError(LarkError):
class LexError(LarkError): class LexError(LarkError):
pass pass


class UnexpectedEOF(ParseError):
def __init__(self, expected):
self.expected = expected

message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
super(UnexpectedEOF, self).__init__(message)


class UnexpectedInput(LarkError): class UnexpectedInput(LarkError):
pos_in_stream = None pos_in_stream = None


@@ -52,7 +60,7 @@ class UnexpectedInput(LarkError):




class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)


self.line = line self.line = line
@@ -65,6 +73,8 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
message += '\n\n' + self.get_context(seq) message += '\n\n' + self.get_context(seq)
if allowed: if allowed:
message += '\nExpecting: %s\n' % allowed message += '\nExpecting: %s\n' % allowed
if token_history:
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)


super(UnexpectedCharacters, self).__init__(message) super(UnexpectedCharacters, self).__init__(message)


@@ -87,10 +97,10 @@ class UnexpectedToken(ParseError, UnexpectedInput):
super(UnexpectedToken, self).__init__(message) super(UnexpectedToken, self).__init__(message)


class VisitError(LarkError): class VisitError(LarkError):
def __init__(self, tree, orig_exc):
self.tree = tree
def __init__(self, rule, obj, orig_exc):
self.obj = obj
self.orig_exc = orig_exc self.orig_exc = orig_exc


message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc)
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message) super(VisitError, self).__init__(message)
###} ###}

lark/grammar.py (+4 −2)

@@ -3,6 +3,8 @@ from .utils import Serialize
###{standalone ###{standalone


class Symbol(Serialize): class Symbol(Serialize):
__slots__ = ('name',)

is_term = NotImplemented is_term = NotImplemented


def __init__(self, name): def __init__(self, name):
@@ -79,7 +81,7 @@ class Rule(Serialize):
self.expansion = expansion self.expansion = expansion
self.alias = alias self.alias = alias
self.order = order self.order = order
self.options = options
self.options = options or RuleOptions()
self._hash = hash((self.origin, tuple(self.expansion))) self._hash = hash((self.origin, tuple(self.expansion)))


def _deserialize(self): def _deserialize(self):
@@ -101,4 +103,4 @@ class Rule(Serialize):






###}
###}

lark/lark.py (+41 −40)

@@ -1,8 +1,6 @@
from __future__ import absolute_import from __future__ import absolute_import


import os import os
import time
from collections import defaultdict
from io import open from io import open


from .utils import STRING_TYPE, Serialize, SerializeMemoizer from .utils import STRING_TYPE, Serialize, SerializeMemoizer
@@ -43,8 +41,7 @@ class LarkOptions(Serialize):
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
cache_grammar - Cache the Lark grammar (Default: False) cache_grammar - Cache the Lark grammar (Default: False)
postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
start - The start symbol (Default: start)
profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False)
start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start")
priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
@@ -63,12 +60,12 @@ class LarkOptions(Serialize):
'lexer': 'auto', 'lexer': 'auto',
'transformer': None, 'transformer': None,
'start': 'start', 'start': 'start',
'profile': False,
'priority': 'auto', 'priority': 'auto',
'ambiguity': 'auto', 'ambiguity': 'auto',
'propagate_positions': False, 'propagate_positions': False,
'lexer_callbacks': {}, 'lexer_callbacks': {},
'maybe_placeholders': False, 'maybe_placeholders': False,
'edit_terminals': None,
} }


def __init__(self, options_dict): def __init__(self, options_dict):
@@ -85,6 +82,9 @@ class LarkOptions(Serialize):


options[name] = value options[name] = value


if isinstance(options['start'], STRING_TYPE):
options['start'] = [options['start']]

self.__dict__['options'] = options self.__dict__['options'] = options


assert self.parser in ('earley', 'lalr', 'cyk', None) assert self.parser in ('earley', 'lalr', 'cyk', None)
@@ -97,7 +97,11 @@ class LarkOptions(Serialize):
raise ValueError("Unknown options: %s" % o.keys()) raise ValueError("Unknown options: %s" % o.keys())


def __getattr__(self, name): def __getattr__(self, name):
return self.options[name]
try:
return self.options[name]
except KeyError as e:
raise AttributeError(e)

def __setattr__(self, name, value): def __setattr__(self, name, value):
assert name in self.options assert name in self.options
self.options[name] = value self.options[name] = value
@@ -110,30 +114,6 @@ class LarkOptions(Serialize):
return cls(data) return cls(data)




class Profiler:
def __init__(self):
self.total_time = defaultdict(float)
self.cur_section = '__init__'
self.last_enter_time = time.time()

def enter_section(self, name):
cur_time = time.time()
self.total_time[self.cur_section] += cur_time - self.last_enter_time
self.last_enter_time = cur_time
self.cur_section = name

def make_wrapper(self, name, f):
def wrapper(*args, **kwargs):
last_section = self.cur_section
self.enter_section(name)
try:
return f(*args, **kwargs)
finally:
self.enter_section(last_section)

return wrapper


class Lark(Serialize): class Lark(Serialize):
def __init__(self, grammar, **options): def __init__(self, grammar, **options):
""" """
@@ -161,9 +141,6 @@ class Lark(Serialize):
if self.options.cache_grammar: if self.options.cache_grammar:
raise NotImplementedError("Not available yet") raise NotImplementedError("Not available yet")


assert not self.options.profile, "Feature temporarily disabled"
# self.profiler = Profiler() if self.options.profile else None

if self.options.lexer == 'auto': if self.options.lexer == 'auto':
if self.options.parser == 'lalr': if self.options.parser == 'lalr':
self.options.lexer = 'contextual' self.options.lexer = 'contextual'
@@ -200,22 +177,37 @@ class Lark(Serialize):
self.grammar = load_grammar(grammar, self.source) self.grammar = load_grammar(grammar, self.source)


# Compile the EBNF grammar into BNF # Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile()
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)

if self.options.edit_terminals:
for t in self.terminals:
self.options.edit_terminals(t)

self._terminals_dict = {t.name:t for t in self.terminals}


# If the user asked to invert the priorities, negate them all here. # If the user asked to invert the priorities, negate them all here.
# This replaces the old 'resolve__antiscore_sum' option. # This replaces the old 'resolve__antiscore_sum' option.
if self.options.priority == 'invert': if self.options.priority == 'invert':
for rule in self.rules: for rule in self.rules:
if rule.options and rule.options.priority is not None:
if rule.options.priority is not None:
rule.options.priority = -rule.options.priority rule.options.priority = -rule.options.priority
# Else, if the user asked to disable priorities, strip them from the # Else, if the user asked to disable priorities, strip them from the
# rules. This allows the Earley parsers to skip an extra forest walk # rules. This allows the Earley parsers to skip an extra forest walk
# for improved performance, if you don't need them (or didn't specify any). # for improved performance, if you don't need them (or didn't specify any).
elif self.options.priority == None: elif self.options.priority == None:
for rule in self.rules: for rule in self.rules:
if rule.options and rule.options.priority is not None:
if rule.options.priority is not None:
rule.options.priority = None rule.options.priority = None
self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks)

# TODO Deprecate lexer_callbacks?
lexer_callbacks = dict(self.options.lexer_callbacks)
if self.options.transformer:
t = self.options.transformer
for term in self.terminals:
if hasattr(t, term.name):
lexer_callbacks[term.name] = getattr(t, term.name)

self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks)


if self.options.parser: if self.options.parser:
self.parser = self._build_parser() self.parser = self._build_parser()
@@ -287,8 +279,17 @@ class Lark(Serialize):
return self.options.postlex.process(stream) return self.options.postlex.process(stream)
return stream return stream


def parse(self, text):
"Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
return self.parser.parse(text)
def get_terminal(self, name):
"Get information about a terminal"
return self._terminals_dict[name]

def parse(self, text, start=None):
"""Parse the given text, according to the options provided.

The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option).

Returns a tree, unless specified otherwise.
"""
return self.parser.parse(text, start=start)


###} ###}
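Taken together, the lark.py changes let one Lark instance serve several start symbols and expose its terminal definitions. A rough usage sketch; the grammar, rule names and terminal names below are made up for illustration:

```python
from lark import Lark

grammar = """
greeting: "hello" NAME
farewell: "bye" NAME
NAME: /[a-z]+/
%ignore " "
"""

# 'start' may be a list of rule names; parse() then selects one with start=...
parser = Lark(grammar, parser='lalr', start=['greeting', 'farewell'])
print(parser.parse("hello world", start='greeting'))
print(parser.parse("bye world", start='farewell'))

# New helper for inspecting a terminal defined in the grammar.
print(parser.get_terminal('NAME'))
```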

+ 84  - 50   lark/lexer.py


@@ -3,12 +3,11 @@
import re import re


from .utils import Str, classify, get_regexp_width, Py36, Serialize from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken


###{standalone ###{standalone


class Pattern(Serialize): class Pattern(Serialize):
__serialize_fields__ = 'value', 'flags'


def __init__(self, value, flags=()): def __init__(self, value, flags=()):
self.value = value self.value = value
@@ -41,6 +40,10 @@ class Pattern(Serialize):




class PatternStr(Pattern): class PatternStr(Pattern):
__serialize_fields__ = 'value', 'flags'

type = "str"

def to_regexp(self): def to_regexp(self):
return self._get_flags(re.escape(self.value)) return self._get_flags(re.escape(self.value))


@@ -50,15 +53,25 @@ class PatternStr(Pattern):
max_width = min_width max_width = min_width


class PatternRE(Pattern): class PatternRE(Pattern):
__serialize_fields__ = 'value', 'flags', '_width'

type = "re"

def to_regexp(self): def to_regexp(self):
return self._get_flags(self.value) return self._get_flags(self.value)


_width = None
def _get_width(self):
if self._width is None:
self._width = get_regexp_width(self.to_regexp())
return self._width

@property @property
def min_width(self): def min_width(self):
return get_regexp_width(self.to_regexp())[0]
return self._get_width()[0]
@property @property
def max_width(self): def max_width(self):
return get_regexp_width(self.to_regexp())[1]
return self._get_width()[1]




class TerminalDef(Serialize): class TerminalDef(Serialize):
@@ -77,9 +90,9 @@ class TerminalDef(Serialize):




class Token(Str): class Token(Str):
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')


def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None):
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
try: try:
self = super(Token, cls).__new__(cls, value) self = super(Token, cls).__new__(cls, value)
except UnicodeDecodeError: except UnicodeDecodeError:
@@ -93,11 +106,19 @@ class Token(Str):
self.column = column self.column = column
self.end_line = end_line self.end_line = end_line
self.end_column = end_column self.end_column = end_column
self.end_pos = end_pos
return self return self


def update(self, type_=None, value=None):
return Token.new_borrow_pos(
type_ if type_ is not None else self.type,
value if value is not None else self.value,
self
)

@classmethod @classmethod
def new_borrow_pos(cls, type_, value, borrow_t): def new_borrow_pos(cls, type_, value, borrow_t):
return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column)
return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)


def __reduce__(self): def __reduce__(self):
return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
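Tokens now record end_pos and can be rebuilt with update() while keeping all position attributes. A small sketch; the terminal name and positions are illustrative:

```python
from lark import Token

tok = Token('WORD', 'hello', pos_in_stream=0, line=1, column=1,
            end_line=1, end_column=6, end_pos=5)

# update() returns a new Token, optionally overriding type or value,
# and borrows every position attribute (including end_pos) from the original.
upper = tok.update(value=tok.upper())
assert upper == 'HELLO' and upper.end_pos == tok.end_pos
```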
@@ -149,38 +170,38 @@ class _Lex:
newline_types = frozenset(newline_types) newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types) ignore_types = frozenset(ignore_types)
line_ctr = LineCounter() line_ctr = LineCounter()
last_token = None


while line_ctr.char_pos < len(stream): while line_ctr.char_pos < len(stream):
lexer = self.lexer lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, line_ctr.char_pos)
if not m:
continue

t = None
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
yield t
else:
if type_ in lexer.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t)
res = lexer.match(stream, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])


line_ctr.feed(value, type_ in newline_types)
if t:
t.end_line = line_ctr.line
t.end_column = line_ctr.column
value, type_ = res


break
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
line_ctr.feed(value, type_ in newline_types)
t.end_line = line_ctr.line
t.end_column = line_ctr.column
t.end_pos = line_ctr.char_pos
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
yield t
last_token = t
else: else:
allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state)
if type_ in lexer.callback:
t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t2)
line_ctr.feed(value, type_ in newline_types)






class UnlessCallback: class UnlessCallback:
@@ -253,23 +274,21 @@ def build_mres(terminals, match_whole=False):
return _build_mres(terminals, len(terminals), match_whole) return _build_mres(terminals, len(terminals), match_whole)


def _regexp_has_newline(r): def _regexp_has_newline(r):
"""Expressions that may indicate newlines in a regexp:
r"""Expressions that may indicate newlines in a regexp:
- newlines (\n) - newlines (\n)
- escaped newline (\\n) - escaped newline (\\n)
- anything but ([^...]) - anything but ([^...])
- any-char (.) when the flag (?s) exists - any-char (.) when the flag (?s) exists
- spaces (\s)
""" """
return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)


class Lexer(object): class Lexer(object):
"""Lexer interface """Lexer interface


Method Signatures: Method Signatures:
lex(self, stream) -> Iterator[Token] lex(self, stream) -> Iterator[Token]

set_parser_state(self, state) # Optional
""" """
set_parser_state = NotImplemented
lex = NotImplemented lex = NotImplemented




@@ -284,7 +303,7 @@ class TraditionalLexer(Lexer):
for t in terminals: for t in terminals:
try: try:
re.compile(t.pattern.to_regexp()) re.compile(t.pattern.to_regexp())
except:
except re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))


if t.pattern.min_width == 0: if t.pattern.min_width == 0:
@@ -314,6 +333,11 @@ class TraditionalLexer(Lexer):


self.mres = build_mres(terminals) self.mres = build_mres(terminals)


def match(self, stream, pos):
for mre, type_from_index in self.mres:
m = mre.match(stream, pos)
if m:
return m.group(0), type_from_index[m.lastindex]


def lex(self, stream): def lex(self, stream):
return _Lex(self).lex(stream, self.newline_types, self.ignore_types) return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
@@ -322,6 +346,7 @@ class TraditionalLexer(Lexer):




class ContextualLexer(Lexer): class ContextualLexer(Lexer):

def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
tokens_by_name = {} tokens_by_name = {}
for t in terminals: for t in terminals:
@@ -344,16 +369,25 @@ class ContextualLexer(Lexer):


self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)


self.set_parser_state(None) # Needs to be set on the outside

def set_parser_state(self, state):
self.parser_state = state

def lex(self, stream):
l = _Lex(self.lexers[self.parser_state], self.parser_state)
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
l.lexer = self.lexers[self.parser_state]
l.state = self.parser_state
def lex(self, stream, get_parser_state):
parser_state = get_parser_state()
l = _Lex(self.lexers[parser_state], parser_state)
try:
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
parser_state = get_parser_state()
l.lexer = self.lexers[parser_state]
l.state = parser_state # For debug only, no need to worry about multithreading
except UnexpectedCharacters as e:
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
# but not in the current context.
# This tests the input against the global context, to provide a nicer error.
root_match = self.root_lexer.match(stream, e.pos_in_stream)
if not root_match:
raise

value, type_ = root_match
t = Token(type_, value, e.pos_in_stream, e.line, e.column)
raise UnexpectedToken(t, e.allowed, state=e.state)


###} ###}
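For users of the contextual lexer, the practical effect of the new error path is that input matching a terminal that exists in the grammar but is not legal at the current position now raises UnexpectedToken instead of UnexpectedCharacters, so the two cases can be handled separately. A hedged sketch with an invented grammar:

```python
from lark import Lark
from lark.exceptions import UnexpectedToken, UnexpectedCharacters

parser = Lark('''
start: NAME "=" NUMBER
NAME: /[a-z]+/
NUMBER: /[0-9]+/
%ignore " "
''', parser='lalr', lexer='contextual')

try:
    parser.parse("42 = 42")                  # NUMBER is defined, but not allowed here
except UnexpectedToken as err:
    print("unexpected token:", err.token)    # e.g. Token(NUMBER, '42')
except UnexpectedCharacters as err:          # still raised when nothing matches at all
    print("bad input at line", err.line, "column", err.column)
```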

+ 66  - 57   lark/load_grammar.py

@@ -2,17 +2,17 @@


import os.path import os.path
import sys import sys
from ast import literal_eval
from copy import copy, deepcopy from copy import copy, deepcopy
from io import open


from .utils import bfs
from .utils import bfs, eval_escaping
from .lexer import Token, TerminalDef, PatternStr, PatternRE from .lexer import Token, TerminalDef, PatternStr, PatternRE


from .parse_tree_builder import ParseTreeBuilder from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer from .parser_frontends import LALR_TraditionalLexer
from .common import LexerConf, ParserConf from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list
from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken


from .tree import Tree, SlottedTree as ST from .tree import Tree, SlottedTree as ST
@@ -73,11 +73,12 @@ TERMINALS = {
'_RPAR': r'\)', '_RPAR': r'\)',
'_LBRA': r'\[', '_LBRA': r'\[',
'_RBRA': r'\]', '_RBRA': r'\]',
'OP': '[+*][?]?|[?](?![a-z])',
'OP': '[+*]|[?](?![a-z])',
'_COLON': ':', '_COLON': ':',
'_COMMA': ',', '_COMMA': ',',
'_OR': r'\|', '_OR': r'\|',
'_DOT': r'\.',
'_DOT': r'\.(?!\.)',
'_DOTDOT': r'\.\.',
'TILDE': '~', 'TILDE': '~',
'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*',
@@ -85,12 +86,12 @@ TERMINALS = {
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*', '_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+', 'WS': r'[ \t]+',
'COMMENT': r'//[^\n]*',
'COMMENT': r'\s*//[^\n]*',
'_TO': '->', '_TO': '->',
'_IGNORE': r'%ignore', '_IGNORE': r'%ignore',
'_DECLARE': r'%declare', '_DECLARE': r'%declare',
'_IMPORT': r'%import', '_IMPORT': r'%import',
'NUMBER': r'\d+',
'NUMBER': r'[+-]?\d+',
} }


RULES = { RULES = {
@@ -112,7 +113,7 @@ RULES = {
'?expr': ['atom', '?expr': ['atom',
'atom OP', 'atom OP',
'atom TILDE NUMBER', 'atom TILDE NUMBER',
'atom TILDE NUMBER _DOT _DOT NUMBER',
'atom TILDE NUMBER _DOTDOT NUMBER',
], ],


'?atom': ['_LPAR expansions _RPAR', '?atom': ['_LPAR expansions _RPAR',
@@ -130,7 +131,7 @@ RULES = {
'?name': ['RULE', 'TERMINAL'], '?name': ['RULE', 'TERMINAL'],


'maybe': ['_LBRA expansions _RBRA'], 'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOT _DOT STRING'],
'range': ['STRING _DOTDOT STRING'],


'term': ['TERMINAL _COLON expansions _NL', 'term': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'], 'TERMINAL _DOT NUMBER _COLON expansions _NL'],
@@ -196,7 +197,7 @@ class EBNF_to_BNF(Transformer_InPlace):
mn = mx = int(args[0]) mn = mx = int(args[0])
else: else:
mn, mx = map(int, args) mn, mx = map(int, args)
if mx < mn:
if mx < mn or mn < 0:
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
assert False, op assert False, op
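This check guards the grammar-side repetition operator `~`. A tiny sketch of what it accepts and rejects; the rule and terminal names are invented:

```python
from lark import Lark

parser = Lark('''
start: DIGIT ~ 3 "-" DIGIT ~ 4
DIGIT: "0".."9"
''', parser='lalr')
print(parser.parse("212-1000"))

# A reversed range such as  DIGIT ~ 5..2, or a negative count such as  DIGIT ~ -1
# (possible now that NUMBER accepts a sign), raises GrammarError at load time.
```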
@@ -205,7 +206,7 @@ class EBNF_to_BNF(Transformer_InPlace):
keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens


def will_not_get_removed(sym): def will_not_get_removed(sym):
if isinstance(sym, NonTerminal):
if isinstance(sym, NonTerminal):
return not sym.name.startswith('_') return not sym.name.startswith('_')
if isinstance(sym, Terminal): if isinstance(sym, Terminal):
return keep_all_tokens or not sym.filter_out return keep_all_tokens or not sym.filter_out
@@ -345,28 +346,6 @@ def _rfind(s, choices):






def _fix_escaping(s):
w = ''
i = iter(s)
for n in i:
w += n
if n == '\\':
n2 = next(i)
if n2 == '\\':
w += '\\\\'
elif n2 not in 'uxnftr':
w += '\\'
w += n2
w = w.replace('\\"', '"').replace("'", "\\'")

to_eval = "u'''%s'''" % w
try:
s = literal_eval(to_eval)
except SyntaxError as e:
raise ValueError(s, e)

return s



def _literal_to_pattern(literal): def _literal_to_pattern(literal):
v = literal.value v = literal.value
@@ -379,7 +358,7 @@ def _literal_to_pattern(literal):
assert v[0] == v[-1] and v[0] in '"/' assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1] x = v[1:-1]


s = _fix_escaping(x)
s = eval_escaping(x)


if literal.type == 'STRING': if literal.type == 'STRING':
s = s.replace('\\\\', '\\') s = s.replace('\\\\', '\\')
@@ -397,7 +376,7 @@ class PrepareLiterals(Transformer_InPlace):
assert start.type == end.type == 'STRING' assert start.type == end.type == 'STRING'
start = start.value[1:-1] start = start.value[1:-1]
end = end.value[1:-1] end = end.value[1:-1]
assert len(_fix_escaping(start)) == len(_fix_escaping(end)) == 1, (start, end, len(_fix_escaping(start)), len(_fix_escaping(end)))
assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1, (start, end, len(eval_escaping(start)), len(eval_escaping(end)))
regexp = '[%s-%s]' % (start, end) regexp = '[%s-%s]' % (start, end)
return ST('pattern', [PatternRE(regexp)]) return ST('pattern', [PatternRE(regexp)])


@@ -451,9 +430,9 @@ class PrepareSymbols(Transformer_InPlace):
if isinstance(v, Tree): if isinstance(v, Tree):
return v return v
elif v.type == 'RULE': elif v.type == 'RULE':
return NonTerminal(v.value)
return NonTerminal(Str(v.value))
elif v.type == 'TERMINAL': elif v.type == 'TERMINAL':
return Terminal(v.value, filter_out=v.startswith('_'))
return Terminal(Str(v.value), filter_out=v.startswith('_'))
assert False assert False


def _choice_of_rules(rules): def _choice_of_rules(rules):
@@ -465,7 +444,7 @@ class Grammar:
self.rule_defs = rule_defs self.rule_defs = rule_defs
self.ignore = ignore self.ignore = ignore


def compile(self):
def compile(self, start):
# We change the trees in-place (to support huge grammars) # We change the trees in-place (to support huge grammars)
# So deepcopy allows calling compile more than once. # So deepcopy allows calling compile more than once.
term_defs = deepcopy(list(self.term_defs)) term_defs = deepcopy(list(self.term_defs))
@@ -476,7 +455,7 @@ class Grammar:
# =================== # ===================


# Convert terminal-trees to strings/regexps # Convert terminal-trees to strings/regexps
transformer = PrepareLiterals() * TerminalTreeToPattern()
for name, (term_tree, priority) in term_defs: for name, (term_tree, priority) in term_defs:
if term_tree is None: # Terminal added through %declare if term_tree is None: # Terminal added through %declare
continue continue
@@ -484,7 +463,8 @@ class Grammar:
if len(expansions) == 1 and not expansions[0].children: if len(expansions) == 1 and not expansions[0].children:
raise GrammarError("Terminals cannot be empty (%s)" % name) raise GrammarError("Terminals cannot be empty (%s)" % name)


terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
transformer = PrepareLiterals() * TerminalTreeToPattern()
terminals = [TerminalDef(name, transformer.transform( term_tree ), priority)
for name, (term_tree, priority) in term_defs if term_tree] for name, (term_tree, priority) in term_defs if term_tree]


# ================= # =================
@@ -498,7 +478,8 @@ class Grammar:
ebnf_to_bnf = EBNF_to_BNF() ebnf_to_bnf = EBNF_to_BNF()
rules = [] rules = []
for name, rule_tree, options in rule_defs: for name, rule_tree, options in rule_defs:
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None
ebnf_to_bnf.prefix = name
tree = transformer.transform(rule_tree) tree = transformer.transform(rule_tree)
res = ebnf_to_bnf.transform(tree) res = ebnf_to_bnf.transform(tree)
rules.append((name, res, options)) rules.append((name, res, options))
@@ -511,18 +492,18 @@ class Grammar:


simplify_rule = SimplifyRule_Visitor() simplify_rule = SimplifyRule_Visitor()
compiled_rules = [] compiled_rules = []
for i, rule_content in enumerate(rules):
for rule_content in rules:
name, tree, options = rule_content name, tree, options = rule_content
simplify_rule.visit(tree) simplify_rule.visit(tree)
expansions = rule_tree_to_text.transform(tree) expansions = rule_tree_to_text.transform(tree)


for expansion, alias in expansions:
for i, (expansion, alias) in enumerate(expansions):
if alias and name.startswith('_'): if alias and name.startswith('_'):
raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))


empty_indices = [x==_EMPTY for i, x in enumerate(expansion)]
empty_indices = [x==_EMPTY for x in expansion]
if any(empty_indices): if any(empty_indices):
exp_options = copy(options) if options else RuleOptions()
exp_options = copy(options) or RuleOptions()
exp_options.empty_indices = empty_indices exp_options.empty_indices = empty_indices
expansion = [x for x in expansion if x!=_EMPTY] expansion = [x for x in expansion if x!=_EMPTY]
else: else:
@@ -538,7 +519,8 @@ class Grammar:
for dups in duplicates.values(): for dups in duplicates.values():
if len(dups) > 1: if len(dups) > 1:
if dups[0].expansion: if dups[0].expansion:
raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)"
% ''.join('\n * %s' % i for i in dups))


# Empty rule; assert all other attributes are equal # Empty rule; assert all other attributes are equal
assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)
@@ -546,6 +528,19 @@ class Grammar:
# Remove duplicates # Remove duplicates
compiled_rules = list(set(compiled_rules)) compiled_rules = list(set(compiled_rules))



# Filter out unused rules
while True:
c = len(compiled_rules)
used_rules = {s for r in compiled_rules
for s in r.expansion
if isinstance(s, NonTerminal)
and s != r.origin}
used_rules |= {NonTerminal(s) for s in start}
compiled_rules = [r for r in compiled_rules if r.origin in used_rules]
if len(compiled_rules) == c:
break

# Filter out unused terminals # Filter out unused terminals
used_terms = {t.name for r in compiled_rules used_terms = {t.name for r in compiled_rules
for t in r.expansion for t in r.expansion
@@ -563,13 +558,13 @@ def import_grammar(grammar_path, base_paths=[]):
for import_path in import_paths: for import_path in import_paths:
with suppress(IOError): with suppress(IOError):
joined_path = os.path.join(import_path, grammar_path) joined_path = os.path.join(import_path, grammar_path)
with open(joined_path) as f:
with open(joined_path, encoding='utf8') as f:
text = f.read() text = f.read()
grammar = load_grammar(text, joined_path) grammar = load_grammar(text, joined_path)
_imported_grammars[grammar_path] = grammar _imported_grammars[grammar_path] = grammar
break break
else: else:
open(grammar_path)
open(grammar_path, encoding='utf8')
assert False assert False


return _imported_grammars[grammar_path] return _imported_grammars[grammar_path]
@@ -592,7 +587,9 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
_, tree, _ = imported_rules[symbol] _, tree, _ = imported_rules[symbol]
except KeyError: except KeyError:
raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace)) raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL'))

return _find_used_symbols(tree)



def get_namespace_name(name): def get_namespace_name(name):
try: try:
@@ -620,11 +617,10 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):




def resolve_term_references(term_defs): def resolve_term_references(term_defs):
# TODO Cycles detection
# TODO Solve with transitive closure (maybe) # TODO Solve with transitive closure (maybe)


token_dict = {k:t for k, (t,_p) in term_defs}
assert len(token_dict) == len(term_defs), "Same name defined twice?"
term_dict = {k:t for k, (t,_p) in term_defs}
assert len(term_dict) == len(term_defs), "Same name defined twice?"


while True: while True:
changed = False changed = False
@@ -637,11 +633,21 @@ def resolve_term_references(term_defs):
if item.type == 'RULE': if item.type == 'RULE':
raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
if item.type == 'TERMINAL': if item.type == 'TERMINAL':
exp.children[0] = token_dict[item]
term_value = term_dict[item]
assert term_value is not None
exp.children[0] = term_value
changed = True changed = True
if not changed: if not changed:
break break


for name, term in term_dict.items():
if term: # Not just declared
for child in term.children:
ids = [id(x) for x in child.iter_subtrees()]
if id(term) in ids:
raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)


def options_from_rule(name, *x): def options_from_rule(name, *x):
if len(x) > 1: if len(x) > 1:
priority, expansions = x priority, expansions = x
@@ -669,6 +675,11 @@ class PrepareGrammar(Transformer_InPlace):
return name return name




def _find_used_symbols(tree):
assert tree.data == 'expansions'
return {t for x in tree.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

class GrammarLoader: class GrammarLoader:
def __init__(self): def __init__(self):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
@@ -678,7 +689,7 @@ class GrammarLoader:
callback = ParseTreeBuilder(rules, ST).create_callback() callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT']) lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])


parser_conf = ParserConf(rules, callback, 'start')
parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf) self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)


self.canonize_tree = CanonizeTree() self.canonize_tree = CanonizeTree()
@@ -830,9 +841,7 @@ class GrammarLoader:
rule_names.add(name) rule_names.add(name)


for name, expansions, _o in rules: for name, expansions, _o in rules:
used_symbols = {t for x in expansions.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
for sym in used_symbols:
for sym in _find_used_symbols(expansions):
if sym.type == 'TERMINAL': if sym.type == 'TERMINAL':
if sym not in terminal_names: if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name)) raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))


+ 33  - 9   lark/parse_tree_builder.py

@@ -2,6 +2,8 @@ from .exceptions import GrammarError
from .lexer import Token from .lexer import Token
from .tree import Tree from .tree import Tree
from .visitors import InlineTransformer # XXX Deprecated from .visitors import InlineTransformer # XXX Deprecated
from .visitors import Transformer_InPlace
from .visitors import _vargs_meta, _vargs_meta_inline


###{standalone ###{standalone
from functools import partial, wraps from functools import partial, wraps
@@ -27,7 +29,7 @@ class PropagatePositions:


if isinstance(res, Tree): if isinstance(res, Tree):
for c in children: for c in children:
if isinstance(c, Tree) and c.children and not c.meta.empty:
if isinstance(c, Tree) and not c.meta.empty:
res.meta.line = c.meta.line res.meta.line = c.meta.line
res.meta.column = c.meta.column res.meta.column = c.meta.column
res.meta.start_pos = c.meta.start_pos res.meta.start_pos = c.meta.start_pos
@@ -41,7 +43,7 @@ class PropagatePositions:
break break


for c in reversed(children): for c in reversed(children):
if isinstance(c, Tree) and c.children and not c.meta.empty:
if isinstance(c, Tree) and not c.meta.empty:
res.meta.end_line = c.meta.end_line res.meta.end_line = c.meta.end_line
res.meta.end_column = c.meta.end_column res.meta.end_column = c.meta.end_column
res.meta.end_pos = c.meta.end_pos res.meta.end_pos = c.meta.end_pos
@@ -50,7 +52,7 @@ class PropagatePositions:
elif isinstance(c, Token): elif isinstance(c, Token):
res.meta.end_line = c.end_line res.meta.end_line = c.end_line
res.meta.end_column = c.end_column res.meta.end_column = c.end_column
res.meta.end_pos = c.pos_in_stream + len(c.value)
res.meta.end_pos = c.end_pos
res.meta.empty = False res.meta.empty = False
break break


@@ -193,6 +195,23 @@ def ptb_inline_args(func):
return func(*children) return func(*children)
return f return f


def inplace_transformer(func):
@wraps(func)
def f(children):
# function name in a Transformer is a rule name.
tree = Tree(func.__name__, children)
return func(tree)
return f

def apply_visit_wrapper(func, name, wrapper):
if wrapper is _vargs_meta or wrapper is _vargs_meta_inline:
raise NotImplementedError("Meta args not supported for internal transformer")
@wraps(func)
def f(children):
return wrapper(func, name, children, None)
return f


class ParseTreeBuilder: class ParseTreeBuilder:
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False):
self.tree_class = tree_class self.tree_class = tree_class
@@ -206,12 +225,12 @@ class ParseTreeBuilder:
def _init_builders(self, rules): def _init_builders(self, rules):
for rule in rules: for rule in rules:
options = rule.options options = rule.options
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
expand_single_child = options.expand1 if options else False
keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens
expand_single_child = options.expand1


wrapper_chain = list(filter(None, [ wrapper_chain = list(filter(None, [
(expand_single_child and not rule.alias) and ExpandSingleChild, (expand_single_child and not rule.alias) and ExpandSingleChild,
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None),
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None),
self.propagate_positions and PropagatePositions, self.propagate_positions and PropagatePositions,
self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
])) ]))
@@ -227,10 +246,15 @@ class ParseTreeBuilder:
user_callback_name = rule.alias or rule.origin.name user_callback_name = rule.alias or rule.origin.name
try: try:
f = getattr(transformer, user_callback_name) f = getattr(transformer, user_callback_name)
assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer"
# XXX InlineTransformer is deprecated! # XXX InlineTransformer is deprecated!
if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer):
f = ptb_inline_args(f)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
f = apply_visit_wrapper(f, user_callback_name, wrapper)
else:
if isinstance(transformer, InlineTransformer):
f = ptb_inline_args(f)
elif isinstance(transformer, Transformer_InPlace):
f = inplace_transformer(f)
except AttributeError: except AttributeError:
f = partial(self.tree_class, user_callback_name) f = partial(self.tree_class, user_callback_name)




+ 41  - 18   lark/parser_frontends.py

@@ -44,18 +44,28 @@ def get_frontend(parser, lexer):
raise ValueError('Unknown parser: %s' % parser) raise ValueError('Unknown parser: %s' % parser)




class _ParserFrontend(Serialize):
def _parse(self, input, start, *args):
if start is None:
start = self.start
if len(start) > 1:
raise ValueError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
return self.parser.parse(input, start, *args)




class WithLexer(Serialize):
class WithLexer(_ParserFrontend):
lexer = None lexer = None
parser = None parser = None
lexer_conf = None lexer_conf = None
start = None


__serialize_fields__ = 'parser', 'lexer_conf'
__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf, __serialize_namespace__ = LexerConf,


def __init__(self, lexer_conf, parser_conf, options=None): def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex self.postlex = lexer_conf.postlex


@classmethod @classmethod
@@ -65,18 +75,17 @@ class WithLexer(Serialize):
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
inst.init_lexer() inst.init_lexer()
return inst return inst
def _serialize(self, data, memo): def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize(memo) data['parser'] = data['parser'].serialize(memo)


def lex(self, text):
stream = self.lexer.lex(text)
def lex(self, *args):
stream = self.lexer.lex(*args)
return self.postlex.process(stream) if self.postlex else stream return self.postlex.process(stream) if self.postlex else stream


def parse(self, text):
def parse(self, text, start=None):
token_stream = self.lex(text) token_stream = self.lex(text)
sps = self.lexer.set_parser_state
return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
return self._parse(token_stream, start)


def init_traditional_lexer(self): def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
@@ -104,14 +113,24 @@ class LALR_ContextualLexer(LALR_WithLexer):
ignore=self.lexer_conf.ignore, ignore=self.lexer_conf.ignore,
always_accept=always_accept, always_accept=always_accept,
user_callbacks=self.lexer_conf.callbacks) user_callbacks=self.lexer_conf.callbacks)


def parse(self, text, start=None):
parser_state = [None]
def set_parser_state(s):
parser_state[0] = s

token_stream = self.lex(text, lambda: parser_state[0])
return self._parse(token_stream, start, set_parser_state)
###} ###}


class LALR_CustomLexer(LALR_WithLexer): class LALR_CustomLexer(LALR_WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
pass # TODO
self.lexer = lexer_cls(lexer_conf)
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)


def init_lexer(self):
self.lexer = lexer_cls(self.lexer_conf)


def tokenize_text(text): def tokenize_text(text):
line = 1 line = 1
@@ -128,22 +147,26 @@ class Earley(WithLexer):
self.init_traditional_lexer() self.init_traditional_lexer()


resolve_ambiguity = options.ambiguity == 'resolve' resolve_ambiguity = options.ambiguity == 'resolve'
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
debug = options.debug if options else False
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug)


def match(self, term, token): def match(self, term, token):
return term.name == token.type return term.name == token.type




class XEarley:
class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw): def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens} self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start


self._prepare_match(lexer_conf) self._prepare_match(lexer_conf)
resolve_ambiguity = options.ambiguity == 'resolve' resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
self.parser = xearley.Parser(parser_conf, self.parser = xearley.Parser(parser_conf,
self.match, self.match,
ignore=lexer_conf.ignore, ignore=lexer_conf.ignore,
resolve_ambiguity=resolve_ambiguity, resolve_ambiguity=resolve_ambiguity,
debug=debug,
**kw **kw
) )


@@ -166,8 +189,8 @@ class XEarley:


self.regexps[t.name] = re.compile(regexp) self.regexps[t.name] = re.compile(regexp)


def parse(self, text):
return self.parser.parse(text)
def parse(self, text, start):
return self._parse(text, start)


class XEarley_CompleteLex(XEarley): class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw): def __init__(self, *args, **kw):
@@ -182,13 +205,13 @@ class CYK(WithLexer):
self.init_traditional_lexer() self.init_traditional_lexer()


self._analysis = GrammarAnalyzer(parser_conf) self._analysis = GrammarAnalyzer(parser_conf)
self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
self.parser = cyk.Parser(parser_conf.rules)


self.callbacks = parser_conf.callbacks self.callbacks = parser_conf.callbacks


def parse(self, text):
def parse(self, text, start):
tokens = list(self.lex(text)) tokens = list(self.lex(text))
parse = self._parser.parse(tokens)
parse = self._parse(tokens, start)
parse = self._transform(parse) parse = self._transform(parse)
return parse return parse




+ 8  - 6   lark/parsers/cyk.py

@@ -84,12 +84,11 @@ class RuleNode(object):
class Parser(object): class Parser(object):
"""Parser wrapper.""" """Parser wrapper."""


def __init__(self, rules, start):
def __init__(self, rules):
super(Parser, self).__init__() super(Parser, self).__init__()
self.orig_rules = {rule: rule for rule in rules} self.orig_rules = {rule: rule for rule in rules}
rules = [self._to_rule(rule) for rule in rules] rules = [self._to_rule(rule) for rule in rules]
self.grammar = to_cnf(Grammar(rules)) self.grammar = to_cnf(Grammar(rules))
self.start = NT(start)


def _to_rule(self, lark_rule): def _to_rule(self, lark_rule):
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" """Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
@@ -97,16 +96,19 @@ class Parser(object):
assert all(isinstance(x, Symbol) for x in lark_rule.expansion) assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
return Rule( return Rule(
lark_rule.origin, lark_rule.expansion, lark_rule.origin, lark_rule.expansion,
weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0,
weight=lark_rule.options.priority if lark_rule.options.priority else 0,
alias=lark_rule) alias=lark_rule)


def parse(self, tokenized): # pylint: disable=invalid-name
def parse(self, tokenized, start): # pylint: disable=invalid-name
"""Parses input, which is a list of tokens.""" """Parses input, which is a list of tokens."""
assert start
start = NT(start)

table, trees = _parse(tokenized, self.grammar) table, trees = _parse(tokenized, self.grammar)
# Check if the parse succeeded. # Check if the parse succeeded.
if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]):
if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]):
raise ParseError('Parsing failed.') raise ParseError('Parsing failed.')
parse = trees[(0, len(tokenized) - 1)][self.start]
parse = trees[(0, len(tokenized) - 1)][start]
return self._to_tree(revert_cnf(parse)) return self._to_tree(revert_cnf(parse))


def _to_tree(self, rule_node): def _to_tree(self, rule_node):


+ 25  - 14   lark/parsers/earley.py

@@ -10,20 +10,22 @@ is better documented here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
""" """


import logging
from collections import deque from collections import deque


from ..visitors import Transformer_InPlace, v_args from ..visitors import Transformer_InPlace, v_args
from ..exceptions import ParseError, UnexpectedToken
from ..exceptions import UnexpectedEOF, UnexpectedToken
from .grammar_analysis import GrammarAnalyzer from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal from ..grammar import NonTerminal
from .earley_common import Item, TransitiveItem from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor


class Parser: class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True):
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False):
analysis = GrammarAnalyzer(parser_conf) analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity self.resolve_ambiguity = resolve_ambiguity
self.debug = debug


self.FIRST = analysis.FIRST self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE self.NULLABLE = analysis.NULLABLE
@@ -43,13 +45,9 @@ class Parser:
# the priorities will be stripped from all rules before they reach us, allowing us to # the priorities will be stripped from all rules before they reach us, allowing us to
# skip the extra tree walk. We'll also skip this if the user just didn't specify priorities # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
# on any rules. # on any rules.
if self.forest_sum_visitor is None and rule.options and rule.options.priority is not None:
self.forest_sum_visitor = ForestSumVisitor()
if self.forest_sum_visitor is None and rule.options.priority is not None:
self.forest_sum_visitor = ForestSumVisitor


if resolve_ambiguity:
self.forest_tree_visitor = ForestToTreeVisitor(self.callbacks, self.forest_sum_visitor)
else:
self.forest_tree_visitor = ForestToAmbiguousTreeVisitor(self.callbacks, self.forest_sum_visitor)
self.term_matcher = term_matcher self.term_matcher = term_matcher




@@ -272,9 +270,11 @@ class Parser:


## Column is now the final column in the parse. ## Column is now the final column in the parse.
assert i == len(columns)-1 assert i == len(columns)-1
return to_scan


def parse(self, stream, start_symbol=None):
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
def parse(self, stream, start):
assert start, start
start_symbol = NonTerminal(start)


columns = [set()] columns = [set()]
to_scan = set() # The scan buffer. 'Q' in E.Scott's paper. to_scan = set() # The scan buffer. 'Q' in E.Scott's paper.
@@ -289,22 +289,33 @@ class Parser:
else: else:
columns[0].add(item) columns[0].add(item)


self._parse(stream, columns, to_scan, start_symbol)
to_scan = self._parse(stream, columns, to_scan, start_symbol)


# If the parse was successful, the start # If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in # symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree. # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
if self.debug:
from .earley_forest import ForestToPyDotVisitor
try:
debug_walker = ForestToPyDotVisitor()
except ImportError:
logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
else:
debug_walker.visit(solutions[0], "sppf.png")



if not solutions: if not solutions:
expected_tokens = [t.expect for t in to_scan] expected_tokens = [t.expect for t in to_scan]
# raise ParseError('Incomplete parse: Could not find a solution to input')
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
raise UnexpectedEOF(expected_tokens)
elif len(solutions) > 1: elif len(solutions) > 1:
assert False, 'Earley should not generate multiple start symbol items!' assert False, 'Earley should not generate multiple start symbol items!'


# Perform our SPPF -> AST conversion using the right ForestVisitor. # Perform our SPPF -> AST conversion using the right ForestVisitor.
return self.forest_tree_visitor.visit(solutions[0])
forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else ForestToAmbiguousTreeVisitor
forest_tree_visitor = forest_tree_visitor_cls(self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor())

return forest_tree_visitor.visit(solutions[0])




class ApplyCallbacks(Transformer_InPlace): class ApplyCallbacks(Transformer_InPlace):
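When debug is enabled, the Earley frontend now tries to render the SPPF of a successful parse to sppf.png through pydot, and merely logs a warning if pydot is missing. A sketch; the grammar is invented and the optional pydot dependency is assumed for the image:

```python
from lark import Lark

parser = Lark('''
start: "a" start | "a"
''', parser='earley', debug=True)

# On success, an 'sppf.png' visualisation of the shared packed parse forest
# is written to the working directory, provided pydot is importable.
parser.parse("aaa")
```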


+ 3  - 3   lark/parsers/earley_forest.py

@@ -122,7 +122,7 @@ class PackedNode(ForestNode):
ambiguously. Hence, we use the sort order to identify ambiguously. Hence, we use the sort order to identify
the order in which ambiguous children should be considered. the order in which ambiguous children should be considered.
""" """
return self.is_empty, -self.priority, -self.rule.order
return self.is_empty, -self.priority, self.rule.order


def __iter__(self): def __iter__(self):
return iter([self.left, self.right]) return iter([self.left, self.right])
@@ -195,7 +195,7 @@ class ForestVisitor(object):
continue continue


if id(next_node) in visiting: if id(next_node) in visiting:
raise ParseError("Infinite recursion in grammar!")
raise ParseError("Infinite recursion in grammar, in rule '%s'!" % next_node.s.name)


input_stack.append(next_node) input_stack.append(next_node)
continue continue
@@ -250,7 +250,7 @@ class ForestSumVisitor(ForestVisitor):
return iter(node.children) return iter(node.children)


def visit_packed_node_out(self, node): def visit_packed_node_out(self, node):
priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options and node.rule.options.priority else 0
priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
priority += getattr(node.right, 'priority', 0) priority += getattr(node.right, 'priority', 0)
priority += getattr(node.left, 'priority', 0) priority += getattr(node.left, 'priority', 0)
node.priority = priority node.priority = priority


+ 46  - 14   lark/parsers/grammar_analysis.py

@@ -1,4 +1,4 @@
from collections import Counter
from collections import Counter, defaultdict


from ..utils import bfs, fzset, classify from ..utils import bfs, fzset, classify
from ..exceptions import GrammarError from ..exceptions import GrammarError
@@ -37,8 +37,22 @@ class RulePtr(object):
return hash((self.rule, self.index)) return hash((self.rule, self.index))




# state generation ensures no duplicate LR0ItemSets
class LR0ItemSet(object):
__slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')

def __init__(self, kernel, closure):
self.kernel = fzset(kernel)
self.closure = fzset(closure)
self.transitions = {}
self.lookaheads = defaultdict(set)

def __repr__(self):
return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))


def update_set(set1, set2): def update_set(set1, set2):
if not set2:
if not set2 or set1 > set2:
return False return False


copy = set(set1) copy = set(set1)
@@ -85,6 +99,8 @@ def calculate_sets(rules):
if set(rule.expansion[:i]) <= NULLABLE: if set(rule.expansion[:i]) <= NULLABLE:
if update_set(FIRST[rule.origin], FIRST[sym]): if update_set(FIRST[rule.origin], FIRST[sym]):
changed = True changed = True
else:
break


# Calculate FOLLOW # Calculate FOLLOW
changed = True changed = True
@@ -109,7 +125,10 @@ class GrammarAnalyzer(object):
def __init__(self, parser_conf, debug=False): def __init__(self, parser_conf, debug=False):
self.debug = debug self.debug = debug


rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])]
root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
for start in parser_conf.start}

rules = parser_conf.rules + list(root_rules.values())
self.rules_by_origin = classify(rules, lambda r: r.origin) self.rules_by_origin = classify(rules, lambda r: r.origin)


if len(rules) != len(set(rules)): if len(rules) != len(set(rules)):
@@ -121,17 +140,37 @@ class GrammarAnalyzer(object):
if not (sym.is_term or sym in self.rules_by_origin): if not (sym.is_term or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation


self.start_state = self.expand_rule(NonTerminal('$root'))
self.start_states = {start: self.expand_rule(root_rule.origin)
for start, root_rule in root_rules.items()}

self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
for start, root_rule in root_rules.items()}

lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
for start in parser_conf.start}

lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
assert(len(lr0_rules) == len(set(lr0_rules)))

self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)

# cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
for start, root_rule in lr0_root_rules.items()}


self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)


def expand_rule(self, rule):
def expand_rule(self, source_rule, rules_by_origin=None):
"Returns all init_ptrs accessible by rule (recursive)" "Returns all init_ptrs accessible by rule (recursive)"

if rules_by_origin is None:
rules_by_origin = self.rules_by_origin

init_ptrs = set() init_ptrs = set()
def _expand_rule(rule): def _expand_rule(rule):
assert not rule.is_term, rule assert not rule.is_term, rule


for r in self.rules_by_origin[rule]:
for r in rules_by_origin[rule]:
init_ptr = RulePtr(r, 0) init_ptr = RulePtr(r, 0)
init_ptrs.add(init_ptr) init_ptrs.add(init_ptr)


@@ -140,14 +179,7 @@ class GrammarAnalyzer(object):
if not new_r.is_term: if not new_r.is_term:
yield new_r yield new_r


for _ in bfs([rule], _expand_rule):
for _ in bfs([source_rule], _expand_rule):
pass pass


return fzset(init_ptrs) return fzset(init_ptrs)

def _first(self, r):
if r.is_term:
return {r}
else:
return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}


+ 199  - 52   lark/parsers/lalr_analysis.py

@@ -7,12 +7,12 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
# Email : erezshin@gmail.com # Email : erezshin@gmail.com


import logging import logging
from collections import defaultdict
from collections import defaultdict, deque


from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
from ..exceptions import GrammarError from ..exceptions import GrammarError


from .grammar_analysis import GrammarAnalyzer, Terminal
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
from ..grammar import Rule from ..grammar import Rule


###{standalone ###{standalone
@@ -28,11 +28,12 @@ class Action:
Shift = Action('Shift') Shift = Action('Shift')
Reduce = Action('Reduce') Reduce = Action('Reduce')



class ParseTable: class ParseTable:
def __init__(self, states, start_state, end_state):
def __init__(self, states, start_states, end_states):
self.states = states self.states = states
self.start_state = start_state
self.end_state = end_state
self.start_states = start_states
self.end_states = end_states


def serialize(self, memo): def serialize(self, memo):
tokens = Enumerator() tokens = Enumerator()
@@ -47,8 +48,8 @@ class ParseTable:
return { return {
'tokens': tokens.reversed(), 'tokens': tokens.reversed(),
'states': states, 'states': states,
'start_state': self.start_state,
'end_state': self.end_state,
'start_states': self.start_states,
'end_states': self.end_states,
} }


@classmethod @classmethod
@@ -59,7 +60,7 @@ class ParseTable:
for token, (action, arg) in actions.items()} for token, (action, arg) in actions.items()}
for state, actions in data['states'].items() for state, actions in data['states'].items()
} }
return cls(states, data['start_state'], data['end_state'])
return cls(states, data['start_states'], data['end_states'])




class IntParseTable(ParseTable): class IntParseTable(ParseTable):
@@ -76,66 +77,212 @@ class IntParseTable(ParseTable):
int_states[ state_to_idx[s] ] = la int_states[ state_to_idx[s] ] = la




start_state = state_to_idx[parse_table.start_state]
end_state = state_to_idx[parse_table.end_state]
return cls(int_states, start_state, end_state)
start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
return cls(int_states, start_states, end_states)


###} ###}



# digraph and traverse, see The Theory and Practice of Compiler Writing

# computes F(x) = G(x) union (union { F(y) | x R y }), i.e. the closure of G over the relation R
# X: nodes
# R: relation (function mapping node -> list of nodes that satisfy the relation)
# G: set valued function
def digraph(X, R, G):
F = {}
S = []
N = {}
for x in X:
N[x] = 0
for x in X:
# this is always true for the first iteration, but N[x] may be updated in traverse below
if N[x] == 0:
traverse(x, S, N, X, R, G, F)
return F

# x: single node
# S: stack
# N: weights
# X: nodes
# R: relation (see above)
# G: set valued function
# F: set valued function we are computing (map of input -> output)
def traverse(x, S, N, X, R, G, F):
S.append(x)
d = len(S)
N[x] = d
F[x] = G[x]
for y in R[x]:
if N[y] == 0:
traverse(y, S, N, X, R, G, F)
n_x = N[x]
assert(n_x > 0)
n_y = N[y]
assert(n_y != 0)
if (n_y > 0) and (n_y < n_x):
N[x] = n_y
F[x].update(F[y])
if N[x] == d:
f_x = F[x]
while True:
z = S.pop()
N[z] = -1
F[z] = f_x
if z == x:
break
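digraph/traverse implement the DeRemer-Pennello closure step: they compute the smallest F with F(x) ⊇ G(x) and F(x) ⊇ F(y) whenever x R y, collapsing strongly connected components of R along the way. A standalone toy run of the same helper; the node names and sets are made up, and note that the sets in G are updated in place (which is why the analyzer passes defaultdict(set) values):

```python
from collections import defaultdict
from lark.parsers.lalr_analysis import digraph

# x R y:  a -> b -> c, plus a two-node cycle d <-> e
R = {'a': ['b'], 'b': ['c'], 'c': [], 'd': ['e'], 'e': ['d']}
G = defaultdict(set, {'a': {1}, 'b': {2}, 'c': {3}, 'd': {4}, 'e': {5}})

F = digraph(list(R), R, G)
# F == {'a': {1, 2, 3}, 'b': {2, 3}, 'c': {3}, 'd': {4, 5}, 'e': {4, 5}}
```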


class LALR_Analyzer(GrammarAnalyzer): class LALR_Analyzer(GrammarAnalyzer):
def __init__(self, parser_conf, debug=False):
GrammarAnalyzer.__init__(self, parser_conf, debug)
self.nonterminal_transitions = []
self.directly_reads = defaultdict(set)
self.reads = defaultdict(set)
self.includes = defaultdict(set)
self.lookback = defaultdict(set)



def compute_lookahead(self):
self.end_states = []
def compute_lr0_states(self):
self.lr0_states = set()
# map of kernels to LR0ItemSets
cache = {}


self.states = {}
def step(state): def step(state):
lookahead = defaultdict(list)
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
for rp in sat:
for term in self.FOLLOW.get(rp.rule.origin, ()):
lookahead[term].append((Reduce, rp.rule))
_, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)


d = classify(unsat, lambda rp: rp.next) d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items(): for sym, rps in d.items():
rps = {rp.advance(sym) for rp in rps}
kernel = fzset({rp.advance(sym) for rp in rps})
new_state = cache.get(kernel, None)
if new_state is None:
closure = set(kernel)
for rp in kernel:
if not rp.is_satisfied and not rp.next.is_term:
closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
new_state = LR0ItemSet(kernel, closure)
cache[kernel] = new_state

state.transitions[sym] = new_state
yield new_state


for rp in set(rps):
if not rp.is_satisfied and not rp.next.is_term:
rps |= self.expand_rule(rp.next)
self.lr0_states.add(state)


new_state = fzset(rps)
lookahead[sym].append((Shift, new_state))
if sym == Terminal('$END'):
self.end_states.append( new_state )
yield new_state
for _ in bfs(self.lr0_start_states.values(), step):
pass


for k, v in lookahead.items():
if len(v) > 1:
def compute_reads_relations(self):
# handle start state
for root in self.lr0_start_states.values():
assert(len(root.kernel) == 1)
for rp in root.kernel:
assert(rp.index == 0)
self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])

for state in self.lr0_states:
seen = set()
for rp in state.closure:
if rp.is_satisfied:
continue
s = rp.next
# if s is not a nonterminal
if s not in self.lr0_rules_by_origin:
continue
if s in seen:
continue
seen.add(s)
nt = (state, s)
self.nonterminal_transitions.append(nt)
dr = self.directly_reads[nt]
r = self.reads[nt]
next_state = state.transitions[s]
for rp2 in next_state.closure:
if rp2.is_satisfied:
continue
s2 = rp2.next
# if s2 is a terminal
if s2 not in self.lr0_rules_by_origin:
dr.add(s2)
if s2 in self.NULLABLE:
r.add((next_state, s2))

def compute_includes_lookback(self):
for nt in self.nonterminal_transitions:
state, nonterminal = nt
includes = []
lookback = self.lookback[nt]
for rp in state.closure:
if rp.rule.origin != nonterminal:
continue
# traverse the states for rp(.rule)
state2 = state
for i in range(rp.index, len(rp.rule.expansion)):
s = rp.rule.expansion[i]
nt2 = (state2, s)
state2 = state2.transitions[s]
if nt2 not in self.reads:
continue
for j in range(i + 1, len(rp.rule.expansion)):
if not rp.rule.expansion[j] in self.NULLABLE:
break
else:
includes.append(nt2)
# state2 is at the final state for rp.rule
if rp.index == 0:
for rp2 in state2.closure:
if (rp2.rule == rp.rule) and rp2.is_satisfied:
lookback.add((state2, rp2.rule))
for nt2 in includes:
self.includes[nt2].add(nt)

def compute_lookaheads(self):
read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads)
follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets)

for nt, lookbacks in self.lookback.items():
for state, rule in lookbacks:
for s in follow_sets[nt]:
state.lookaheads[s].add(rule)

def compute_lalr1_states(self):
m = {}
for state in self.lr0_states:
actions = {}
for la, next_state in state.transitions.items():
actions[la] = (Shift, next_state.closure)
for la, rules in state.lookaheads.items():
if len(rules) > 1:
raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ])))
if la in actions:
if self.debug: if self.debug:
logging.warn("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name)
for act, arg in v:
logging.warn(' * %s: %s', act, arg)
for x in v:
# XXX resolving shift/reduce into shift, like PLY
# Give a proper warning
if x[0] is Shift:
lookahead[k] = [x]

for k, v in lookahead.items():
if not len(v) == 1:
raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v])))

self.states[state] = {k.name:v[0] for k, v in lookahead.items()}

for _ in bfs([self.start_state], step):
pass
logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logging.warning(' * %s', list(rules)[0])
else:
actions[la] = (Reduce, list(rules)[0])
m[state] = { k.name: v for k, v in actions.items() }


self.end_state ,= self.end_states
states = { k.closure: v for k, v in m.items() }


self._parse_table = ParseTable(self.states, self.start_state, self.end_state)
# compute end states
end_states = {}
for state in states:
for rp in state:
for start in self.lr0_start_states:
if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
assert(not start in end_states)
end_states[start] = state

_parse_table = ParseTable(states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states)


if self.debug: if self.debug:
self.parse_table = self._parse_table
self.parse_table = _parse_table
else: else:
self.parse_table = IntParseTable.from_ParseTable(self._parse_table)

self.parse_table = IntParseTable.from_ParseTable(_parse_table)

def compute_lalr(self):
self.compute_lr0_states()
self.compute_reads_relations()
self.compute_includes_lookback()
self.compute_lookaheads()
self.compute_lalr1_states()
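
Note: `compute_lookaheads` above feeds the `reads`/`includes` relations into a `digraph` fixed-point routine, i.e. the DeRemer–Pennello set-propagation algorithm. A minimal, self-contained sketch of that idea (names and structure here are illustrative, not Lark's actual helper):

```python
def digraph(nodes, relation, base_sets):
    """Propagate base_sets along `relation` until, at the fixed point,
    F[x] = base_sets[x] | union(F[y] for every y related to x).
    Nodes in one strongly connected component end up sharing a single set."""
    F = {}
    stack = []
    depth = {x: 0 for x in nodes}   # 0 = unvisited, inf = finished

    def traverse(x):
        stack.append(x)
        d = len(stack)
        depth[x] = d
        F[x] = set(base_sets.get(x, ()))
        for y in relation.get(x, ()):   # assumes every related y is itself a node
            if depth[y] == 0:
                traverse(y)
            depth[x] = min(depth[x], depth[y])
            F[x] |= F[y]
        if depth[x] == d:               # x is the root of its SCC: close the component
            while True:
                z = stack.pop()
                depth[z] = float('inf')
                F[z] = F[x]
                if z == x:
                    break

    for x in nodes:
        if depth[x] == 0:
            traverse(x)
    return F
```

In the diff above, the nodes are the `nonterminal_transitions`, the relations are `reads` and `includes`, and the base sets are `directly_reads` and the resulting read sets.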

+ 16
- 16
lark/parsers/lalr_parser.py View File

@@ -6,16 +6,15 @@ from ..exceptions import UnexpectedToken
from ..lexer import Token from ..lexer import Token
from ..utils import Enumerator, Serialize from ..utils import Enumerator, Serialize


from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable
from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable




###{standalone ###{standalone
class LALR_Parser(object): class LALR_Parser(object):
def __init__(self, parser_conf, debug=False): def __init__(self, parser_conf, debug=False):
assert all(r.options is None or r.options.priority is None
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
analysis = LALR_Analyzer(parser_conf, debug=debug) analysis = LALR_Analyzer(parser_conf, debug=debug)
analysis.compute_lookahead()
analysis.compute_lalr()
callbacks = parser_conf.callbacks callbacks = parser_conf.callbacks


self._parse_table = analysis.parse_table self._parse_table = analysis.parse_table
@@ -39,19 +38,22 @@ class LALR_Parser(object):
class _Parser: class _Parser:
def __init__(self, parse_table, callbacks): def __init__(self, parse_table, callbacks):
self.states = parse_table.states self.states = parse_table.states
self.start_state = parse_table.start_state
self.end_state = parse_table.end_state
self.start_states = parse_table.start_states
self.end_states = parse_table.end_states
self.callbacks = callbacks self.callbacks = callbacks


def parse(self, seq, set_state=None):
def parse(self, seq, start, set_state=None):
token = None token = None
stream = iter(seq) stream = iter(seq)
states = self.states states = self.states


state_stack = [self.start_state]
start_state = self.start_states[start]
end_state = self.end_states[start]

state_stack = [start_state]
value_stack = [] value_stack = []


if set_state: set_state(self.start_state)
if set_state: set_state(start_state)


def get_action(token): def get_action(token):
state = state_stack[-1] state = state_stack[-1]
@@ -81,7 +83,7 @@ class _Parser:
for token in stream: for token in stream:
while True: while True:
action, arg = get_action(token) action, arg = get_action(token)
assert arg != self.end_state
assert arg != end_state


if action is Shift: if action is Shift:
state_stack.append(arg) state_stack.append(arg)
@@ -94,11 +96,9 @@ class _Parser:
token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
while True: while True:
_action, arg = get_action(token) _action, arg = get_action(token)
if _action is Shift:
assert arg == self.end_state
val ,= value_stack
return val
else:
reduce(arg)
assert(_action is Reduce)
reduce(arg)
if state_stack[-1] == end_state:
return value_stack[-1]


###} ###}
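
Note: the switch from a single `start_state`/`end_state` to `start_states`/`end_states` dictionaries is what allows one parser to expose several start symbols. At the API level this corresponds to `start=[...]` plus a `start` argument to `parse()`, mirroring `test_multi_start` further down (`parser='lalr'` is just one of the configurations that test runs):

```python
from lark import Lark

parser = Lark('''
    a: "x" "a"?
    b: "x" "b"?
''', parser='lalr', start=['a', 'b'])

print(parser.parse('xa', 'a'))   # Tree('a', [])
print(parser.parse('xb', 'b'))   # Tree('b', [])
```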

+ 4
- 3
lark/parsers/xearley.py View File

@@ -24,8 +24,8 @@ from .earley_forest import SymbolNode




class Parser(BaseParser): class Parser(BaseParser):
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False):
BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity)
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False):
BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug)
self.ignore = [Terminal(t) for t in ignore] self.ignore = [Terminal(t) for t in ignore]
self.complete_lex = complete_lex self.complete_lex = complete_lex


@@ -146,4 +146,5 @@ class Parser(BaseParser):
self.predict_and_complete(i, to_scan, columns, transitives) self.predict_and_complete(i, to_scan, columns, transitives)


## Column is now the final column in the parse. ## Column is now the final column in the parse.
assert i == len(columns)-1
assert i == len(columns)-1
return to_scan

+ 47
- 12
lark/reconstruct.py View File

@@ -19,13 +19,15 @@ def is_iter_empty(i):
except StopIteration: except StopIteration:
return True return True



class WriteTokensTransformer(Transformer_InPlace): class WriteTokensTransformer(Transformer_InPlace):
def __init__(self, tokens):
"Inserts discarded tokens into their correct place, according to the rules of grammar"

def __init__(self, tokens, term_subs):
self.tokens = tokens self.tokens = tokens
self.term_subs = term_subs


def __default__(self, data, children, meta): def __default__(self, data, children, meta):
# if not isinstance(t, MatchTree):
# return t
if not getattr(meta, 'match_tree', False): if not getattr(meta, 'match_tree', False):
return Tree(data, children) return Tree(data, children)


@@ -33,10 +35,15 @@ class WriteTokensTransformer(Transformer_InPlace):
to_write = [] to_write = []
for sym in meta.orig_expansion: for sym in meta.orig_expansion:
if is_discarded_terminal(sym): if is_discarded_terminal(sym):
t = self.tokens[sym.name]
if not isinstance(t.pattern, PatternStr):
raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)
to_write.append(t.pattern.value)
try:
v = self.term_subs[sym.name](sym)
except KeyError:
t = self.tokens[sym.name]
if not isinstance(t.pattern, PatternStr):
raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)

v = t.pattern.value
to_write.append(v)
else: else:
x = next(iter_args) x = next(iter_args)
if isinstance(x, list): if isinstance(x, list):
@@ -66,19 +73,39 @@ class MakeMatchTree:
t.meta.orig_expansion = self.expansion t.meta.orig_expansion = self.expansion
return t return t


def best_from_group(seq, group_key, cmp_key):
d = {}
for item in seq:
key = group_key(item)
if key in d:
v1 = cmp_key(item)
v2 = cmp_key(d[key])
if v2 > v1:
d[key] = item
else:
d[key] = item
return list(d.values())

class Reconstructor: class Reconstructor:
def __init__(self, parser):
def __init__(self, parser, term_subs={}):
# XXX TODO calling compile twice returns different results! # XXX TODO calling compile twice returns different results!
tokens, rules, _grammar_extra = parser.grammar.compile()
assert parser.options.maybe_placeholders == False
tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)


self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens})
self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs)
self.rules = list(self._build_recons_rules(rules)) self.rules = list(self._build_recons_rules(rules))
self.rules.reverse()

# Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))

self.rules.sort(key=lambda r: len(r.expansion))
callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias? callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias?
self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start), self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start),
self._match, resolve_ambiguity=True) self._match, resolve_ambiguity=True)


def _build_recons_rules(self, rules): def _build_recons_rules(self, rules):
expand1s = {r.origin for r in rules if r.options and r.options.expand1}
expand1s = {r.origin for r in rules if r.options.expand1}


aliases = defaultdict(list) aliases = defaultdict(list)
for r in rules: for r in rules:
@@ -126,4 +153,12 @@ class Reconstructor:
yield item yield item


def reconstruct(self, tree): def reconstruct(self, tree):
return ''.join(self._reconstruct(tree))
x = self._reconstruct(tree)
y = []
prev_item = ''
for item in x:
if prev_item and item and prev_item[-1].isalnum() and item[0].isalnum():
y.append(' ')
y.append(item)
prev_item = item
return ''.join(y)
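
Note: two behavioural points fall out of this diff: `Reconstructor` now requires the source parser to be built with `maybe_placeholders=False` (see the assert above), and `reconstruct()` inserts a space wherever two adjacent output items are both alphanumeric. A small usage sketch with a made-up grammar (exact output spacing depends on the grammar's ignored terminals):

```python
from lark import Lark
from lark.reconstruct import Reconstructor

grammar = '''
    start: item*
    item: NAME "=" NAME ";"
    NAME: /[a-z]+/
    %ignore " "
'''

parser = Lark(grammar, parser='lalr', maybe_placeholders=False)
tree = parser.parse("foo = bar; baz = qux;")
# Roughly "foo=bar;baz=qux;" -- ignored whitespace is not restored
print(Reconstructor(parser).reconstruct(tree))
```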

+ 2
- 2
lark/tools/nearley.py View File

@@ -18,7 +18,7 @@ nearley_grammar = r"""


expansion: expr+ js expansion: expr+ js


?expr: item [":" /[+*?]/]
?expr: item (":" /[+*?]/)?


?item: rule|string|regexp|null ?item: rule|string|regexp|null
| "(" expansions ")" | "(" expansions ")"
@@ -167,7 +167,7 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
emit(" __default__ = lambda self, n, c, m: c if c else None") emit(" __default__ = lambda self, n, c, m: c if c else None")


emit() emit()
emit('parser = Lark(grammar, start="n_%s")' % start)
emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start)
emit('def parse(text):') emit('def parse(text):')
emit(' return TransformNearley().transform(parser.parse(text))') emit(' return TransformNearley().transform(parser.parse(text))')
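
Note: the grammar tweak from `[":" /[+*?]/]` to `(":" /[+*?]/)?` matters because of `maybe_placeholders`: with placeholders enabled, a missing `[...]` item is kept in the tree as `None`, while `(...)?` simply produces nothing. A tiny sketch of the distinction (grammar is made up; output shown as I understand the option):

```python
from lark import Lark

g = '''
    start: "a" [WORD]
    %import common.WORD
'''
print(Lark(g, maybe_placeholders=True).parse("a").children)    # [None]  -- placeholder kept
print(Lark(g, maybe_placeholders=False).parse("a").children)   # []      -- nothing kept
```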




+ 39
- 0
lark/tools/serialize.py View File

@@ -0,0 +1,39 @@
import codecs
import sys
import json

from lark import Lark
from lark.grammar import RuleOptions, Rule
from lark.lexer import TerminalDef

import argparse

argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #description='''Lark Serialization Tool -- Stores Lark's internal state & LALR analysis as a convenient JSON file''')

argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file')
argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)')
argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")', nargs='+')
argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")')


def serialize(infile, outfile, lexer, start):
lark_inst = Lark(infile, parser="lalr", lexer=lexer, start=start) # TODO contextual

data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
outfile.write('{\n')
outfile.write(' "data": %s,\n' % json.dumps(data))
outfile.write(' "memo": %s\n' % json.dumps(memo))
outfile.write('}\n')


def main():
if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv:
print("Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file")
print("")
argparser.print_help()
else:
args = argparser.parse_args()
serialize(args.grammar_file, args.out, args.lexer, args.start)

if __name__ == '__main__':
main()
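
Note: the tool is intended to be invoked as `python -m lark.tools.serialize` (see the argparse `prog`). A rough programmatic equivalent, with hypothetical file names:

```python
from lark.tools.serialize import serialize

# Roughly what `python -m lark.tools.serialize my_grammar.lark -o my_grammar.json` does
with open('my_grammar.lark') as infile, open('my_grammar.json', 'w') as outfile:
    serialize(infile, outfile, lexer='standard', start='start')
```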

+ 3
- 0
lark/tools/standalone.py View File

@@ -34,6 +34,9 @@
# See <http://www.gnu.org/licenses/>. # See <http://www.gnu.org/licenses/>.
# #
# #

import os
from io import open
###} ###}


import pprint import pprint


+ 31
- 28
lark/tree.py View File

@@ -56,30 +56,6 @@ class Tree(object):


def __hash__(self): def __hash__(self):
return hash((self.data, tuple(self.children))) return hash((self.data, tuple(self.children)))
###}

def expand_kids_by_index(self, *indices):
"Expand (inline) children at the given indices"
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
kid = self.children[i]
self.children[i:i+1] = kid.children

def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
return filter(pred, self.iter_subtrees())

def find_data(self, data):
"Find all nodes where tree.data == data"
return self.find_pred(lambda t: t.data == data)

def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c


def iter_subtrees(self): def iter_subtrees(self):
# TODO: Re-write as a more efficient version # TODO: Re-write as a more efficient version
@@ -102,6 +78,31 @@ class Tree(object):
yield x yield x
seen.add(id(x)) seen.add(id(x))


def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
return filter(pred, self.iter_subtrees())

def find_data(self, data):
"Find all nodes where tree.data == data"
return self.find_pred(lambda t: t.data == data)

###}

def expand_kids_by_index(self, *indices):
"Expand (inline) children at the given indices"
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
kid = self.children[i]
self.children[i:i+1] = kid.children

def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c

def iter_subtrees_topdown(self): def iter_subtrees_topdown(self):
stack = [self] stack = [self]
while stack: while stack:
@@ -141,17 +142,19 @@ class SlottedTree(Tree):
__slots__ = 'data', 'children', 'rule', '_meta' __slots__ = 'data', 'children', 'rule', '_meta'




def pydot__tree_to_png(tree, filename, rankdir="LR"):
def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs):
"""Creates a colorful image that represents the tree (data+children, without meta) """Creates a colorful image that represents the tree (data+children, without meta)


Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to
directed graphs drawn from top to bottom, from left to right, from bottom to directed graphs drawn from top to bottom, from left to right, from bottom to
top, and from right to left, respectively. See:
https://www.graphviz.org/doc/info/attrs.html#k:rankdir
top, and from right to left, respectively.

`kwargs` can be any graph attribute (e. g. `dpi=200`). For a list of
possible attributes, see https://www.graphviz.org/doc/info/attrs.html.
""" """


import pydot import pydot
graph = pydot.Dot(graph_type='digraph', rankdir=rankdir)
graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs)


i = [0] i = [0]
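
Note: `pydot__tree_to_png` now forwards arbitrary keyword arguments as Graphviz graph attributes. A quick sketch (requires the optional `pydot` dependency; the tree here is constructed by hand):

```python
from lark.tree import Tree, pydot__tree_to_png

t = Tree('start', [Tree('a', ['x']), Tree('b', ['y'])])
# `dpi` is the attribute mentioned in the docstring; any graph attribute passes through the same way.
pydot__tree_to_png(t, 'tree.png', rankdir="TB", dpi=200)
```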




+ 28
- 2
lark/utils.py View File

@@ -1,4 +1,5 @@
import sys import sys
from ast import literal_eval
from collections import deque from collections import deque


class fzset(frozenset): class fzset(frozenset):
@@ -160,7 +161,7 @@ def smart_decorator(f, create_decorator):


elif isinstance(f, partial): elif isinstance(f, partial):
# wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
return create_decorator(f.__func__, True)
return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))


else: else:
return create_decorator(f.__func__.__call__, True) return create_decorator(f.__func__.__call__, True)
@@ -172,7 +173,7 @@ import sre_parse
import sre_constants import sre_constants
def get_regexp_width(regexp): def get_regexp_width(regexp):
try: try:
return sre_parse.parse(regexp).getwidth()
return [int(x) for x in sre_parse.parse(regexp).getwidth()]
except sre_constants.error: except sre_constants.error:
raise ValueError(regexp) raise ValueError(regexp)


@@ -239,3 +240,28 @@ class Enumerator(Serialize):
assert len(r) == len(self.enums) assert len(r) == len(self.enums)
return r return r



def eval_escaping(s):
w = ''
i = iter(s)
for n in i:
w += n
if n == '\\':
try:
n2 = next(i)
except StopIteration:
raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)
if n2 == '\\':
w += '\\\\'
elif n2 not in 'uxnftr':
w += '\\'
w += n2
w = w.replace('\\"', '"').replace("'", "\\'")

to_eval = "u'''%s'''" % w
try:
s = literal_eval(to_eval)
except SyntaxError as e:
raise ValueError(s, e)

return s
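
Note: the new `eval_escaping` helper centralizes how escape sequences in grammar literals are interpreted. For illustration (the input string is made up):

```python
from lark.utils import eval_escaping

s = eval_escaping('Hello\\nWorld')
print(s)   # Hello
           # World   -- the '\n' escape became a real newline
```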

+ 113
- 43
lark/visitors.py View File

@@ -3,6 +3,7 @@ from functools import wraps
from .utils import smart_decorator from .utils import smart_decorator
from .tree import Tree from .tree import Tree
from .exceptions import VisitError, GrammarError from .exceptions import VisitError, GrammarError
from .lexer import Token


###{standalone ###{standalone
from inspect import getmembers, getmro from inspect import getmembers, getmro
@@ -12,7 +13,31 @@ class Discard(Exception):


# Transformers # Transformers


class Transformer:
class _Decoratable:
@classmethod
def _apply_decorator(cls, decorator, **kwargs):
mro = getmro(cls)
assert mro[0] is cls
libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
for name, value in getmembers(cls):

# Make sure the function isn't inherited (unless it's overwritten)
if name.startswith('_') or (name in libmembers and name not in cls.__dict__):
continue
if not callable(cls.__dict__[name]):
continue

# Skip if v_args already applied (at the function level)
if hasattr(cls.__dict__[name], 'vargs_applied'):
continue

static = isinstance(cls.__dict__[name], (staticmethod, classmethod))
setattr(cls, name, decorator(value, static=static, **kwargs))
return cls



class Transformer(_Decoratable):
"""Visits the tree recursively, starting with the leaves and finally the root (bottom-up) """Visits the tree recursively, starting with the leaves and finally the root (bottom-up)


Calls its methods (provided by user via inheritance) according to tree.data Calls its methods (provided by user via inheritance) according to tree.data
@@ -21,6 +46,10 @@ class Transformer:
Can be used to implement map or reduce. Can be used to implement map or reduce.
""" """


__visit_tokens__ = True # For backwards compatibility
def __init__(self, visit_tokens=True):
self.__visit_tokens__ = visit_tokens

def _call_userfunc(self, tree, new_children=None): def _call_userfunc(self, tree, new_children=None):
# Assumes tree is already transformed # Assumes tree is already transformed
children = new_children if new_children is not None else tree.children children = new_children if new_children is not None else tree.children
@@ -30,25 +59,39 @@ class Transformer:
return self.__default__(tree.data, children, tree.meta) return self.__default__(tree.data, children, tree.meta)
else: else:
try: try:
if getattr(f, 'meta', False):
return f(children, tree.meta)
elif getattr(f, 'inline', False):
return f(*children)
elif getattr(f, 'whole_tree', False):
if new_children is not None:
raise NotImplementedError("Doesn't work with the base Transformer class")
return f(tree)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
return f.visit_wrapper(f, tree.data, children, tree.meta)
else: else:
return f(children) return f(children)
except (GrammarError, Discard): except (GrammarError, Discard):
raise raise
except Exception as e: except Exception as e:
raise VisitError(tree, e)
raise VisitError(tree.data, tree, e)

def _call_userfunc_token(self, token):
try:
f = getattr(self, token.type)
except AttributeError:
return self.__default_token__(token)
else:
try:
return f(token)
except (GrammarError, Discard):
raise
except Exception as e:
raise VisitError(token.type, token, e)



def _transform_children(self, children): def _transform_children(self, children):
for c in children: for c in children:
try: try:
yield self._transform_tree(c) if isinstance(c, Tree) else c
if isinstance(c, Tree):
yield self._transform_tree(c)
elif self.__visit_tokens__ and isinstance(c, Token):
yield self._call_userfunc_token(c)
else:
yield c
except Discard: except Discard:
pass pass


@@ -66,26 +109,10 @@ class Transformer:
"Default operation on tree (for override)" "Default operation on tree (for override)"
return Tree(data, children, meta) return Tree(data, children, meta)


@classmethod
def _apply_decorator(cls, decorator, **kwargs):
mro = getmro(cls)
assert mro[0] is cls
libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
for name, value in getmembers(cls):

# Make sure the function isn't inherited (unless it's overwritten)
if name.startswith('_') or (name in libmembers and name not in cls.__dict__):
continue
if not callable(cls.__dict__[name]):
continue

# Skip if v_args already applied (at the function level)
if hasattr(cls.__dict__[name], 'vargs_applied'):
continue
def __default_token__(self, token):
"Default operation on token (for override)"
return token


static = isinstance(cls.__dict__[name], (staticmethod, classmethod))
setattr(cls, name, decorator(value, static=static, **kwargs))
return cls




class InlineTransformer(Transformer): # XXX Deprecated class InlineTransformer(Transformer): # XXX Deprecated
@@ -157,6 +184,11 @@ class Visitor(VisitorBase):
self._call_userfunc(subtree) self._call_userfunc(subtree)
return tree return tree


def visit_topdown(self,tree):
for subtree in tree.iter_subtrees_topdown():
self._call_userfunc(subtree)
return tree

class Visitor_Recursive(VisitorBase): class Visitor_Recursive(VisitorBase):
"""Bottom-up visitor, recursive """Bottom-up visitor, recursive


@@ -169,8 +201,16 @@ class Visitor_Recursive(VisitorBase):
if isinstance(child, Tree): if isinstance(child, Tree):
self.visit(child) self.visit(child)


f = getattr(self, tree.data, self.__default__)
f(tree)
self._call_userfunc(tree)
return tree

def visit_topdown(self,tree):
self._call_userfunc(tree)

for child in tree.children:
if isinstance(child, Tree):
self.visit_topdown(child)

return tree return tree




@@ -184,7 +224,7 @@ def visit_children_decor(func):
return inner return inner




class Interpreter:
class Interpreter(_Decoratable):
"""Top-down visitor, recursive """Top-down visitor, recursive


Visits the tree, starting with the root and finally the leaves (top-down) Visits the tree, starting with the root and finally the leaves (top-down)
@@ -193,8 +233,14 @@ class Interpreter:
Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
The user has to explicitly call visit_children, or use the @visit_children_decor The user has to explicitly call visit_children, or use the @visit_children_decor
""" """

def visit(self, tree): def visit(self, tree):
return getattr(self, tree.data)(tree)
f = getattr(self, tree.data)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
return f.visit_wrapper(f, tree.data, tree.children, tree.meta)
else:
return f(tree)


def visit_children(self, tree): def visit_children(self, tree):
return [self.visit(child) if isinstance(child, Tree) else child return [self.visit(child) if isinstance(child, Tree) else child
@@ -240,8 +286,7 @@ def inline_args(obj): # XXX Deprecated






def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False):
assert [whole_tree, meta, inline].count(True) <= 1
def _visitor_args_func_dec(func, visit_wrapper=None, static=False):
def create_decorator(_f, with_self): def create_decorator(_f, with_self):
if with_self: if with_self:
def f(self, *args, **kwargs): def f(self, *args, **kwargs):
@@ -256,17 +301,42 @@ def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, sta
else: else:
f = smart_decorator(func, create_decorator) f = smart_decorator(func, create_decorator)
f.vargs_applied = True f.vargs_applied = True
f.inline = inline
f.meta = meta
f.whole_tree = whole_tree
f.visit_wrapper = visit_wrapper
return f return f


def v_args(inline=False, meta=False, tree=False):

def _vargs_inline(f, data, children, meta):
return f(*children)
def _vargs_meta_inline(f, data, children, meta):
return f(meta, *children)
def _vargs_meta(f, data, children, meta):
return f(children, meta) # TODO swap these for consistency? Backwards incompatible!
def _vargs_tree(f, data, children, meta):
return f(Tree(data, children, meta))

def v_args(inline=False, meta=False, tree=False, wrapper=None):
"A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
if [tree, meta, inline].count(True) > 1:
raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. These cannot be combined.")
if tree and (meta or inline):
raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.")

func = None
if meta:
if inline:
func = _vargs_meta_inline
else:
func = _vargs_meta
elif inline:
func = _vargs_inline
elif tree:
func = _vargs_tree

if wrapper is not None:
if func is not None:
raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.")
func = wrapper

def _visitor_args_dec(obj): def _visitor_args_dec(obj):
return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree)
return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func)
return _visitor_args_dec return _visitor_args_dec
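
Note: taken together, the visitor changes mean `Transformer` now visits tokens by default (methods named after terminals are called on tokens; opt out with `visit_tokens=False`), and `v_args` can combine `meta` with `inline` or accept a custom `wrapper`. A small combined sketch (grammar and class are made up):

```python
from lark import Lark
from lark.visitors import Transformer, v_args

class Calc(Transformer):
    def NUMBER(self, tok):          # called on tokens, since visit_tokens defaults to True
        return int(tok)

    @v_args(inline=True)
    def add(self, a, b):
        return a + b

parser = Lark('''
    start: add
    add: NUMBER "+" NUMBER
    %import common.NUMBER
    %ignore " "
''', parser='lalr')

print(Calc().transform(parser.parse("1 + 2")).children)   # [3]
```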






+ 1
- 0
mkdocs.yml View File

@@ -9,5 +9,6 @@ pages:
- How To Develop (Guide): how_to_develop.md - How To Develop (Guide): how_to_develop.md
- Grammar Reference: grammar.md - Grammar Reference: grammar.md
- Tree Construction Reference: tree_construction.md - Tree Construction Reference: tree_construction.md
- Visitors and Transformers: visitors.md
- Classes Reference: classes.md - Classes Reference: classes.md
- Recipes: recipes.md - Recipes: recipes.md

+ 10
- 0
readthedocs.yml View File

@@ -0,0 +1,10 @@
version: 2

mkdocs:
configuration: mkdocs.yml
fail_on_warning: false

formats: all

python:
version: 3.5

+ 2
- 1
tests/__main__.py View File

@@ -10,7 +10,7 @@ from .test_reconstructor import TestReconstructor
try: try:
from .test_nearley.test_nearley import TestNearley from .test_nearley.test_nearley import TestNearley
except ImportError: except ImportError:
pass
logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)")


# from .test_selectors import TestSelectors # from .test_selectors import TestSelectors
# from .test_grammars import TestPythonG, TestConfigG # from .test_grammars import TestPythonG, TestConfigG
@@ -21,6 +21,7 @@ from .test_parser import (
TestCykStandard, TestCykStandard,
TestLalrContextual, TestLalrContextual,
TestEarleyDynamic, TestEarleyDynamic,
TestLalrCustom,


# TestFullEarleyStandard, # TestFullEarleyStandard,
TestFullEarleyDynamic, TestFullEarleyDynamic,


+ 1
- 0
tests/grammars/test_unicode.lark View File

@@ -0,0 +1 @@
UNICODE : /[a-zØ-öø-ÿ]/

+ 4
- 1
tests/test_nearley/test_nearley.py View File

@@ -15,9 +15,12 @@ NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley')
BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin')


if not os.path.exists(NEARLEY_PATH): if not os.path.exists(NEARLEY_PATH):
print("Skipping Nearley tests!")
logging.warn("Nearley not installed. Skipping Nearley tests!")
raise ImportError("Skipping Nearley tests!") raise ImportError("Skipping Nearley tests!")


import js2py # Ensures that js2py exists, to avoid failing tests


class TestNearley(unittest.TestCase): class TestNearley(unittest.TestCase):
def test_css(self): def test_css(self):
fn = os.path.join(NEARLEY_PATH, 'examples/csscolor.ne') fn = os.path.join(NEARLEY_PATH, 'examples/csscolor.ne')


+ 239
- 6
tests/test_parser.py View File

@@ -5,6 +5,7 @@ import unittest
import logging import logging
import os import os
import sys import sys
from copy import deepcopy
try: try:
from cStringIO import StringIO as cStringIO from cStringIO import StringIO as cStringIO
except ImportError: except ImportError:
@@ -20,9 +21,9 @@ logging.basicConfig(level=logging.INFO)
from lark.lark import Lark from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from lark.tree import Tree from lark.tree import Tree
from lark.visitors import Transformer
from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule from lark.grammar import Rule
from lark.lexer import TerminalDef
from lark.lexer import TerminalDef, Lexer, TraditionalLexer


__path__ = os.path.dirname(__file__) __path__ = os.path.dirname(__file__)
def _read(n, *args): def _read(n, *args):
@@ -62,6 +63,14 @@ class TestParsers(unittest.TestCase):
r = g.parse('a') r = g.parse('a')
self.assertEqual( r.children[0].meta.line, 1 ) self.assertEqual( r.children[0].meta.line, 1 )


g = Lark("""start: x
x: a
a: "a"
""", propagate_positions=True)

r = g.parse('a')
self.assertEqual( r.children[0].meta.line, 1 )

def test_expand1(self): def test_expand1(self):


g = Lark("""start: a g = Lark("""start: a
@@ -94,6 +103,98 @@ class TestParsers(unittest.TestCase):
r = g.parse('xx') r = g.parse('xx')
self.assertEqual( r.children[0].data, "c" ) self.assertEqual( r.children[0].data, "c" )


def test_comment_in_rule_definition(self):
g = Lark("""start: a
a: "a"
// A comment
// Another comment
| "b"
// Still more

c: "unrelated"
""")
r = g.parse('b')
self.assertEqual( r.children[0].data, "a" )

def test_visit_tokens(self):
class T(Transformer):
def a(self, children):
return children[0] + "!"
def A(self, tok):
return tok.update(value=tok.upper())

# Test regular
g = """start: a
a : A
A: "x"
"""
p = Lark(g, parser='lalr')
r = T(False).transform(p.parse("x"))
self.assertEqual( r.children, ["x!"] )
r = T().transform(p.parse("x"))
self.assertEqual( r.children, ["X!"] )

# Test internal transformer
p = Lark(g, parser='lalr', transformer=T())
r = p.parse("x")
self.assertEqual( r.children, ["X!"] )

def test_vargs_meta(self):

@v_args(meta=True)
class T1(Transformer):
def a(self, children, meta):
assert not children
return meta.line

def start(self, children, meta):
return children

@v_args(meta=True, inline=True)
class T2(Transformer):
def a(self, meta):
return meta.line

def start(self, meta, *res):
return list(res)

for T in (T1, T2):
for internal in [False, True]:
try:
g = Lark(r"""start: a+
a : "x" _NL?
_NL: /\n/+
""", parser='lalr', transformer=T() if internal else None, propagate_positions=True)
except NotImplementedError:
assert internal
continue

res = g.parse("xx\nx\nxxx\n\n\nxx")
assert not internal
res = T().transform(res)

self.assertEqual(res, [1, 1, 2, 3, 3, 3, 6, 6])

def test_vargs_tree(self):
tree = Lark('''
start: a a a
!a: "A"
''').parse('AAA')
tree_copy = deepcopy(tree)

@v_args(tree=True)
class T(Transformer):
def a(self, tree):
return 1
def start(self, tree):
return tree.children

res = T().transform(tree)
self.assertEqual(res, [1, 1, 1])
self.assertEqual(tree, tree_copy)



def test_embedded_transformer(self): def test_embedded_transformer(self):
class T(Transformer): class T(Transformer):
def a(self, children): def a(self, children):
@@ -150,6 +251,51 @@ class TestParsers(unittest.TestCase):
r = g.parse("xx") r = g.parse("xx")
self.assertEqual( r.children, ["<c>"] ) self.assertEqual( r.children, ["<c>"] )


def test_embedded_transformer_inplace(self):
@v_args(tree=True)
class T1(Transformer_InPlace):
def a(self, tree):
assert isinstance(tree, Tree), tree
tree.children.append("tested")
return tree

def b(self, tree):
return Tree(tree.data, tree.children + ['tested2'])

@v_args(tree=True)
class T2(Transformer):
def a(self, tree):
assert isinstance(tree, Tree), tree
tree.children.append("tested")
return tree

def b(self, tree):
return Tree(tree.data, tree.children + ['tested2'])

class T3(Transformer):
@v_args(tree=True)
def a(self, tree):
assert isinstance(tree, Tree)
tree.children.append("tested")
return tree

@v_args(tree=True)
def b(self, tree):
return Tree(tree.data, tree.children + ['tested2'])

for t in [T1(), T2(), T3()]:
for internal in [False, True]:
g = Lark("""start: a b
a : "x"
b : "y"
""", parser='lalr', transformer=t if internal else None)
r = g.parse("xy")
if not internal:
r = t.transform(r)

a, b = r.children
self.assertEqual(a.children, ["tested"])
self.assertEqual(b.children, ["tested2"])


def test_alias(self): def test_alias(self):
Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """) Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
@@ -386,12 +532,22 @@ def _make_full_earley_test(LEXER):
_TestFullEarley.__name__ = _NAME _TestFullEarley.__name__ = _NAME
globals()[_NAME] = _TestFullEarley globals()[_NAME] = _TestFullEarley


class CustomLexer(Lexer):
"""
Purpose of this custom lexer is to test the integration,
so it uses the TraditionalLexer as its implementation, without custom lexing behaviour.
"""
def __init__(self, lexer_conf):
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)


def _make_parser_test(LEXER, PARSER): def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs): def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs): def _Lark_open(gfilename, **kwargs):
return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
class _TestParser(unittest.TestCase): class _TestParser(unittest.TestCase):
def test_basic1(self): def test_basic1(self):
g = _Lark("""start: a+ b a* "b" a* g = _Lark("""start: a+ b a* "b" a*
@@ -890,7 +1046,7 @@ def _make_parser_test(LEXER, PARSER):


@unittest.skipIf(PARSER == 'cyk', "No empty rules") @unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self): def test_twice_empty(self):
g = """!start: [["A"]]
g = """!start: ("A"?)?
""" """
l = _Lark(g) l = _Lark(g)
tree = l.parse('A') tree = l.parse('A')
@@ -984,6 +1140,32 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(res.children, ['ab']) self.assertEqual(res.children, ['ab'])




grammar = """
start: A B | AB
A: "a"
B.-20: "b"
AB.-10: "ab"
"""
l = _Lark(grammar)
res = l.parse("ab")
self.assertEqual(res.children, ['a', 'b'])


grammar = """
start: A B | AB
A.-99999999999999999999999: "a"
B: "b"
AB: "ab"
"""
l = _Lark(grammar)
res = l.parse("ab")

self.assertEqual(res.children, ['ab'])







def test_import(self): def test_import(self):
grammar = """ grammar = """
@@ -1021,6 +1203,12 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(x.children, ['12', 'lions']) self.assertEqual(x.children, ['12', 'lions'])




def test_relative_import_unicode(self):
l = _Lark_open('test_relative_import_unicode.lark', rel_to=__file__)
x = l.parse(u'Ø')
self.assertEqual(x.children, [u'Ø'])


def test_relative_import_rename(self): def test_relative_import_rename(self):
l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__) l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__)
x = l.parse('12 lions') x = l.parse('12 lions')
@@ -1448,7 +1636,20 @@ def _make_parser_test(LEXER, PARSER):


parser.parse(r'"That" "And a \"b"') parser.parse(r'"That" "And a \"b"')


@unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")

def test_meddling_unused(self):
"Unless 'unused' is removed, LALR analysis will fail on reduce-reduce collision"

grammar = """
start: EKS* x
x: EKS
unused: x*
EKS: "x"
"""
parser = _Lark(grammar)


@unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
def test_serialize(self): def test_serialize(self):
grammar = """ grammar = """
start: _ANY b "C" start: _ANY b "C"
@@ -1465,6 +1666,37 @@ def _make_parser_test(LEXER, PARSER):
parser3 = Lark.deserialize(d, namespace, m) parser3 = Lark.deserialize(d, namespace, m)
self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) ) self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) )


def test_multi_start(self):
parser = _Lark('''
a: "x" "a"?
b: "x" "b"?
''', start=['a', 'b'])

self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))

def test_lexer_detect_newline_tokens(self):
# Detect newlines in regular tokens
g = _Lark(r"""start: "go" tail*
!tail : SA "@" | SB "@" | SC "@" | SD "@"
SA : "a" /\n/
SB : /b./s
SC : "c" /[^a-z]/
SD : "d" /\s/
""")
a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
self.assertEqual(a.line, 2)
self.assertEqual(b.line, 3)
self.assertEqual(c.line, 4)
self.assertEqual(d.line, 5)

# Detect newlines in ignored tokens
for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
g = _Lark('''!start: "a" "a"
%ignore {}'''.format(re))
a, b = g.parse('a\na').children
self.assertEqual(a.line, 1)
self.assertEqual(b.line, 2)




_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
@@ -1479,6 +1711,7 @@ _TO_TEST = [
('dynamic_complete', 'earley'), ('dynamic_complete', 'earley'),
('standard', 'lalr'), ('standard', 'lalr'),
('contextual', 'lalr'), ('contextual', 'lalr'),
('custom', 'lalr'),
# (None, 'earley'), # (None, 'earley'),
] ]




+ 2
- 2
tests/test_reconstructor.py View File

@@ -16,7 +16,7 @@ def _remove_ws(s):
class TestReconstructor(TestCase): class TestReconstructor(TestCase):


def assert_reconstruct(self, grammar, code): def assert_reconstruct(self, grammar, code):
parser = Lark(grammar, parser='lalr')
parser = Lark(grammar, parser='lalr', maybe_placeholders=False)
tree = parser.parse(code) tree = parser.parse(code)
new = Reconstructor(parser).reconstruct(tree) new = Reconstructor(parser).reconstruct(tree)
self.assertEqual(_remove_ws(code), _remove_ws(new)) self.assertEqual(_remove_ws(code), _remove_ws(new))
@@ -105,7 +105,7 @@ class TestReconstructor(TestCase):
%ignore WS %ignore WS
""" """


json_parser = Lark(json_grammar, parser='lalr')
json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
tree = json_parser.parse(test_json) tree = json_parser.parse(test_json)


new_json = Reconstructor(json_parser).reconstruct(tree) new_json = Reconstructor(json_parser).reconstruct(tree)


+ 3
- 0
tests/test_relative_import_unicode.lark View File

@@ -0,0 +1,3 @@
start: UNICODE

%import .grammars.test_unicode.UNICODE

+ 8
- 15
tests/test_tools.py View File

@@ -1,11 +1,9 @@
from __future__ import absolute_import from __future__ import absolute_import


import sys import sys
import unittest
from unittest import TestCase
from unittest import TestCase, main


from lark.tree import Tree from lark.tree import Tree

from lark.tools import standalone from lark.tools import standalone


try: try:
@@ -49,6 +47,8 @@ class TestStandalone(TestCase):
l = _Lark() l = _Lark()
x = l.parse('12 elephants') x = l.parse('12 elephants')
self.assertEqual(x.children, ['12', 'elephants']) self.assertEqual(x.children, ['12', 'elephants'])
x = l.parse('16 candles')
self.assertEqual(x.children, ['16', 'candles'])


def test_contextual(self): def test_contextual(self):
grammar = """ grammar = """
@@ -92,26 +92,19 @@ class TestStandalone(TestCase):
_NEWLINE: /\n/ _NEWLINE: /\n/
""" """


# from lark import Lark
# l = Lark(grammar, parser='lalr', lexer='contextual', postlex=MyIndenter())
# x = l.parse('(\n)\n')
# print('@@', x)


context = self._create_standalone(grammar) context = self._create_standalone(grammar)
_Lark = context['Lark_StandAlone'] _Lark = context['Lark_StandAlone']


# l = _Lark(postlex=MyIndenter())
# x = l.parse('()\n')
# print(x)
l = _Lark(postlex=MyIndenter())
x = l.parse('()\n')
self.assertEqual(x, Tree('start', []))
l = _Lark(postlex=MyIndenter()) l = _Lark(postlex=MyIndenter())
x = l.parse('(\n)\n') x = l.parse('(\n)\n')
print(x)

self.assertEqual(x, Tree('start', []))






if __name__ == '__main__': if __name__ == '__main__':
unittest.main()
main()





+ 55
- 1
tests/test_trees.py View File

@@ -4,9 +4,10 @@ import unittest
from unittest import TestCase from unittest import TestCase
import copy import copy
import pickle import pickle
import functools


from lark.tree import Tree from lark.tree import Tree
from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard
from lark.visitors import Visitor, Visitor_Recursive, Transformer, Interpreter, visit_children_decor, v_args, Discard




class TestTrees(TestCase): class TestTrees(TestCase):
@@ -33,6 +34,43 @@ class TestTrees(TestCase):
nodes = list(self.tree1.iter_subtrees_topdown()) nodes = list(self.tree1.iter_subtrees_topdown())
self.assertEqual(nodes, expected) self.assertEqual(nodes, expected)


def test_visitor(self):
class Visitor1(Visitor):
def __init__(self):
self.nodes=[]

def __default__(self,tree):
self.nodes.append(tree)
class Visitor1_Recursive(Visitor_Recursive):
def __init__(self):
self.nodes=[]

def __default__(self,tree):
self.nodes.append(tree)

visitor1=Visitor1()
visitor1_recursive=Visitor1_Recursive()

expected_top_down = [Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')]),
Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')]
expected_bottom_up = [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z'),
Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')])]

visitor1.visit(self.tree1)
self.assertEqual(visitor1.nodes, expected_bottom_up)

visitor1_recursive.visit(self.tree1)
self.assertEqual(visitor1_recursive.nodes, expected_bottom_up)

visitor1.nodes=[]
visitor1_recursive.nodes=[]

visitor1.visit_topdown(self.tree1)
self.assertEqual(visitor1.nodes,expected_top_down)

visitor1_recursive.visit_topdown(self.tree1)
self.assertEqual(visitor1_recursive.nodes,expected_top_down)

def test_interp(self): def test_interp(self):
t = Tree('a', [Tree('b', []), Tree('c', []), 'd']) t = Tree('a', [Tree('b', []), Tree('c', []), 'd'])


@@ -146,6 +184,22 @@ class TestTrees(TestCase):
res = T().transform(t) res = T().transform(t)
self.assertEqual(res, 2.9) self.assertEqual(res, 2.9)


def test_partial(self):

tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])])

def test(prefix, s, postfix):
return prefix + s.upper() + postfix

@v_args(inline=True)
class T(Transformer):
a = functools.partial(test, "@", postfix="!")
b = functools.partial(lambda s: s + "!")

res = T().transform(tree)
assert res.children == ["@TEST1!", "test2!"]


def test_discard(self): def test_discard(self):
class MyTransformer(Transformer): class MyTransformer(Transformer):
def a(self, args): def a(self, args):

