
Merge pull request #1 from lark-parser/master

Merge from origin.
Yuan authored 5 years ago, committed by GitHub
parent · commit c95257c906
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
53 changed files with 1986 additions and 763 deletions
  1. .gitignore (+1 / -0)
  2. README.md (+20 / -5)
  3. docs/classes.md (+27 / -117)
  4. docs/grammar.md (+77 / -5)
  5. docs/how_to_develop.md (+2 / -2)
  6. docs/how_to_use.md (+1 / -1)
  7. docs/index.md (+3 / -2)
  8. docs/json_tutorial.md (+10 / -5)
  9. docs/parsers.md (+3 / -3)
  10. docs/recipes.md (+6 / -6)
  11. docs/tree_construction.md (+24 / -0)
  12. docs/visitors.md (+117 / -0)
  13. examples/README.md (+1 / -0)
  14. examples/json_parser.py (+11 / -1)
  15. examples/lark.lark (+6 / -4)
  16. examples/python3.lark (+5 / -5)
  17. examples/python_bytecode.py (+77 / -0)
  18. examples/reconstruct_json.py (+2 / -8)
  19. examples/standalone/create_standalone.sh (+1 / -0)
  20. examples/standalone/json_parser.py (+418 / -197)
  21. lark/__init__.py (+1 / -1)
  22. lark/common.py (+1 / -0)
  23. lark/exceptions.py (+14 / -4)
  24. lark/grammar.py (+4 / -2)
  25. lark/lark.py (+41 / -40)
  26. lark/lexer.py (+84 / -50)
  27. lark/load_grammar.py (+66 / -57)
  28. lark/parse_tree_builder.py (+33 / -9)
  29. lark/parser_frontends.py (+41 / -18)
  30. lark/parsers/cyk.py (+8 / -6)
  31. lark/parsers/earley.py (+25 / -14)
  32. lark/parsers/earley_forest.py (+3 / -3)
  33. lark/parsers/grammar_analysis.py (+46 / -14)
  34. lark/parsers/lalr_analysis.py (+199 / -52)
  35. lark/parsers/lalr_parser.py (+16 / -16)
  36. lark/parsers/xearley.py (+4 / -3)
  37. lark/reconstruct.py (+47 / -12)
  38. lark/tools/nearley.py (+2 / -2)
  39. lark/tools/serialize.py (+39 / -0)
  40. lark/tools/standalone.py (+3 / -0)
  41. lark/tree.py (+31 / -28)
  42. lark/utils.py (+28 / -2)
  43. lark/visitors.py (+113 / -43)
  44. mkdocs.yml (+1 / -0)
  45. readthedocs.yml (+10 / -0)
  46. tests/__main__.py (+2 / -1)
  47. tests/grammars/test_unicode.lark (+1 / -0)
  48. tests/test_nearley/test_nearley.py (+4 / -1)
  49. tests/test_parser.py (+239 / -6)
  50. tests/test_reconstructor.py (+2 / -2)
  51. tests/test_relative_import_unicode.lark (+3 / -0)
  52. tests/test_tools.py (+8 / -15)
  53. tests/test_trees.py (+55 / -1)

.gitignore (+1 / -0)

@@ -4,6 +4,7 @@
/lark_parser.egg-info/**
tags
.vscode
.idea
.ropeproject
.cache
/dist


README.md (+20 / -5)

@@ -34,13 +34,16 @@ Lark has no dependencies.

[![Build Status](https://travis-ci.org/lark-parser/lark.svg?branch=master)](https://travis-ci.org/lark-parser/lark)

### Syntax Highlighting (new)
### Syntax Highlighting

Lark now provides syntax highlighting for its grammar files (\*.lark):
Lark provides syntax highlighting for its grammar files (\*.lark):

- [Sublime Text & TextMate](https://github.com/lark-parser/lark_syntax)
- [vscode](https://github.com/lark-parser/vscode-lark)

### Clones

- [Lerche (Julia)](https://github.com/jamesrhester/Lerche.jl) - an unofficial clone, written entirely in Julia.

### Hello World

@@ -72,7 +75,7 @@ Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like ba

![fruitflies.png](examples/fruitflies.png)

See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
See more [examples here](https://github.com/lark-parser/lark/tree/master/examples)



@@ -95,7 +98,7 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
- Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark)
- And much more!

See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/Features)
See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)


### Comparison to other libraries
@@ -132,9 +135,21 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail

### Projects using Lark

- [storyscript](https://github.com/storyscript/storyscript) - The programming language for Application Storytelling
- [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion. Lark is used to parse the GraphQL schemas definitions.
- [Hypothesis](https://github.com/HypothesisWorks/hypothesis) - Library for property-based testing
- [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration
- [synapse](https://github.com/vertexproject/synapse) - an intelligence analysis platform
- [Datacube-core](https://github.com/opendatacube/datacube-core) - Open Data Cube analyses continental scale Earth Observation data through time
- [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks
- [Torchani](https://github.com/aiqm/torchani) - Accurate Neural Network Potential on PyTorch
- [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands
- [Fabric-SDK-Py](https://github.com/hyperledger/fabric-sdk-py) - Hyperledger fabric SDK with Python 3.x
- [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings
- [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language
- [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer
- [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion (Lark is used to parse the GraphQL schemas definitions)
- [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory


Using Lark? Send me a message and I'll add your project!



docs/classes.md (+27 / -117)

@@ -1,42 +1,42 @@
# Classes - Reference
# Classes Reference

This page details the important classes in Lark.

----

## Lark
## lark.Lark

The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor.

### Methods

#### \_\_init\_\_(self, grammar, **options)

The Lark class accepts a grammar string or file object, and keyword options:

* start - The symbol in the grammar that begins the parse (Default: `"start"`)
* **start** - A list of the rules in the grammar that begin the parse (Default: `["start"]`)

* parser - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`)
* **parser** - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`)

* lexer - Overrides default lexer.
* **lexer** - Overrides default lexer, depending on parser.

* transformer - Applies the transformer instead of building a parse tree (only allowed with parser="lalr")
* **transformer** - Applies the provided transformer instead of building a parse tree (only allowed with parser="lalr")

* postlex - Lexer post-processing (Default: None. only works when lexer is "standard" or "contextual")
* **postlex** - Lexer post-processing (Default: `None`. only works when lexer is "standard" or "contextual")

* ambiguity (only relevant for earley and cyk)
* **ambiguity** (only relevant for earley and cyk)

* "explicit" - Return all derivations inside an "_ambig" data node.

* "resolve" - Let the parser choose the best derivation (greedy for tokens, non-greedy for rules. Default)

* debug - Display warnings (such as Shift-Reduce warnings for LALR)
* **debug** - Display warnings (such as Shift-Reduce warnings for LALR)

* **keep_all_tokens** - Don't throw away any terminals from the tree (Default=`False`)

* keep_all_tokens - Don't throw away any terminals from the tree (Default=False)
* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`False`)

* propagate_positions - Propagate line/column count to tree nodes (default=False)
* **maybe_placeholders** - When True, the `[]` operator returns `None` when not matched. When `False`, `[]` behaves like the `?` operator, and returns no value at all, which may be a little faster (default=`False`)

* lexer_callbacks - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information.
* **lexer_callbacks** - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information.
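
For instance, several of these options put together (a minimal sketch, not taken from the library's docs; the grammar is invented and assumes lark 0.8):

```python
from lark import Lark

parser = Lark("""
    start: item+
    item: WORD
    %import common.WORD
    %ignore " "
""",
    parser="lalr",             # pick the LALR(1) engine
    start="start",             # a string, or a list of possible start rules
    propagate_positions=True)  # make line/column info available on tree nodes

tree = parser.parse("hello world")
```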

#### parse(self, text)

@@ -50,13 +50,10 @@ If a transformer is supplied to `__init__`, returns whatever is the result of th

The main tree class

### Properties

* `data` - The name of the rule or alias
* `children` - List of matched sub-rules and terminals
* `meta` - Line & Column numbers, if using `propagate_positions`

### Methods
* `meta` - Line & Column numbers (if `propagate_positions` is enabled)
* meta attributes: `line`, `column`, `start_pos`, `end_line`, `end_column`, `end_pos`
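
As a quick illustration of these properties (a sketch; the grammar and input are invented):

```python
from lark import Lark

parser = Lark("""
    start: word+
    word: WORD
    %import common.WORD
    %ignore " "
""", propagate_positions=True)

tree = parser.parse("hello world")
second = tree.children[1]                     # the `word` subtree for "world"
print(second.data)                            # word
print(second.meta.line, second.meta.column)   # filled in because propagate_positions=True
```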

#### \_\_init\_\_(self, data, children)

@@ -92,102 +89,6 @@ Trees can be hashed and compared.

----

## Transformers & Visitors

Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns.

They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each methods accepts the children as an argument. That can be modified using the `v-args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument.

See: https://github.com/lark-parser/lark/blob/master/lark/visitors.py

### Visitors

Visitors visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up, starting with the leaves and ending at the root of the tree.

**Example**
```python
class IncreaseAllNumbers(Visitor):
    def number(self, tree):
        assert tree.data == "number"
        tree.children[0] += 1

IncreaseAllNumbers().visit(parse_tree)
```

There are two classes that implement the visitor interface:

* Visitor - Visit every node (without recursion)

* Visitor_Recursive - Visit every node using recursion. Slightly faster.

### Transformers

Transformers visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree.

Transformers can be used to implement map & reduce patterns.

Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable).

Transformers can be chained into a new transformer by using multiplication.

**Example:**
```python
from lark import Tree, Transformer

class EvalExpressions(Transformer):
    def expr(self, args):
        return eval(args[0])

t = Tree('a', [Tree('expr', ['1+2'])])
print(EvalExpressions().transform( t ))

# Prints: Tree(a, [3])
```


Here are the classes that implement the transformer interface:

- Transformer - Recursively transforms the tree. This is the one you probably want.
- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances
- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances

### v_args

`v_args` is a decorator.

By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior.

When used on a transformer/visitor class definition, it applies to all the callback methods inside it.

`v_args` accepts one of three flags:

- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists).
- `meta` - Provides two arguments: `children` and `meta` (instead of just the first)
- `tree` - Provides the entire tree as the argument, instead of the children.

Examples:

```python
@v_args(inline=True)
class SolveArith(Transformer):
    def add(self, left, right):
        return left + right


class ReverseNotation(Transformer_InPlace):
    @v_args(tree=True)
    def tree_node(self, tree):
        tree.children = tree.children[::-1]
```

### Discard

When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent.

## Token

When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes:
@@ -198,18 +99,27 @@ When using a lexer, the resulting tokens in the trees will be of the Token class
* `column` - The column of the token in the text (starting with 1)
* `end_line` - The line where the token ends
* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5.
* `end_pos` - the index where the token ends (basically pos_in_stream + len(token))
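
A small sketch of reading these attributes off parsed tokens (the grammar is invented):

```python
from lark import Lark

parser = Lark("""
    start: WORD+
    %import common.WORD
    %ignore " "
""", parser="lalr")

for tok in parser.parse("hello world").children:
    # Token subclasses str, so it prints and compares like a plain string
    print(tok.type, str(tok), tok.line, tok.column, tok.end_column)
```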

## Transformer
## Visitor
## Interpreter

See the [visitors page](visitors.md)


## UnexpectedInput

## UnexpectedToken

## UnexpectedException

- `UnexpectedInput`
- `UnexpectedToken` - The parser received an unexpected token
- `UnexpectedCharacters` - The lexer encountered an unexpected string

After catching one of these exceptions, you may call the following helper methods to create a nicer error message:

### Methods

#### get_context(text, span)

Returns a pretty string pinpointing the error in the text, with `span` amount of context characters around it.
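
Putting the pieces together, error handling might look roughly like this (a sketch; the grammar and input are invented):

```python
from lark import Lark
from lark.exceptions import UnexpectedInput

parser = Lark('start: "a" "b" "c"', parser='lalr')
text = "axc"

try:
    parser.parse(text)
except UnexpectedInput as e:
    # get_context() pinpoints the offending spot in the original text
    print(e.get_context(text, span=10))
    print("line", e.line, "column", e.column)
```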


docs/grammar.md (+77 / -5)

@@ -1,5 +1,13 @@
# Grammar Reference

Table of contents:

1. [Definitions](#defs)
1. [Terminals](#terms)
1. [Rules](#rules)
1. [Directives](#dirs)

<a name="defs"></a>
## Definitions

**A grammar** is a list of rules and terminals, that together define a language.
@@ -25,6 +33,7 @@ Lark begins the parse with the rule 'start', unless specified otherwise in the o
Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner).


<a name="terms"></a>
## Terminals

Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals.
@@ -45,6 +54,16 @@ Literals can be one of:
* `/re with flags/imulx`
* Literal range: `"a".."z"`, `"1".."9"`, etc.

Terminals also support grammar operators, such as `|`, `+`, `*` and `?`.

Terminals are a linear construct, and therefore may not contain themselves (recursion isn't allowed).

### Priority

Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing).

Priority can be either positive or negative. If not specified for a terminal, it's assumed to be 1 (i.e. the default).

#### Notes for when using a lexer:

When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in an order according to the following criteria:
@@ -59,11 +78,58 @@ When using a lexer (standard or contextual), it is the grammar-author's responsi
IF: "if"
INTEGER : /[0-9]+/
INTEGER2 : ("0".."9")+ //# Same as INTEGER
DECIMAL.2: INTEGER "." INTEGER //# Will be matched before INTEGER
DECIMAL.2: INTEGER? "." INTEGER //# Will be matched before INTEGER
WHITESPACE: (" " | /\t/ )+
SQL_SELECT: "select"i
```

### Regular expressions & Ambiguity

Each terminal is eventually compiled to a regular expression. All the operators and references inside it are mapped to their respective expressions.

For example, in the following grammar, `A1` and `A2` are equivalent:
```perl
A1: "a" | "b"
A2: /a|b/
```

This means that inside terminals, Lark cannot detect or resolve ambiguity, even when using Earley.

For example, for this grammar:
```perl
start : (A | B)+
A : "a" | "ab"
B : "b"
```
We get this behavior:

```bash
>>> p.parse("ab")
Tree(start, [Token(A, 'a'), Token(B, 'b')])
```

This is happening because Python's regex engine always returns the first matching option.

If you find yourself in this situation, the recommended solution is to use rules instead.

Example:

```python
>>> p = Lark("""start: (a | b)+
... !a: "a" | "ab"
... !b: "b"
... """, ambiguity="explicit")
>>> print(p.parse("ab").pretty())
_ambig
  start
    a   ab
  start
    a   a
    b   b
```


<a name="rules"></a>
## Rules

**Syntax:**
@@ -85,24 +151,30 @@ Each item is one of:
* `TERMINAL`
* `"string literal"` or `/regexp literal/`
* `(item item ..)` - Group items
* `[item item ..]` - Maybe. Same as `(item item ..)?`
* `[item item ..]` - Maybe. Same as `(item item ..)?`, but generates `None` if there is no match
* `item?` - Zero or one instances of item ("maybe")
* `item*` - Zero or more instances of item
* `item+` - One or more instances of item
* `item ~ n` - Exactly *n* instances of item
* `item ~ n..m` - Between *n* to *m* instances of item
* `item ~ n..m` - Between *n* to *m* instances of item (not recommended for wide ranges, due to performance issues)

**Examples:**
```perl
hello_world: "hello" "world"
mul: [mul "*"] number //# Left-recursion is allowed!
mul: (mul "*")? number //# Left-recursion is allowed and encouraged!
expr: expr operator expr
| value //# Multi-line, belongs to expr

four_words: word ~ 4
```

### Priority

Rules can be assigned priority only when using Earley (future versions may support LALR as well).

Priority can be either positive or negative. If not specified for a rule, it's assumed to be 1 (i.e. the default).
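
An illustrative snippet of the dot syntax for rule priorities (the rule names are invented, and which derivation wins also depends on Earley's disambiguation rules):

```perl
start: greeting | statement

greeting.2: WORD        // preferred when both alternatives can match
statement: WORD+

%import common.WORD
%ignore " "
```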

<a name="dirs"></a>
## Directives

### %ignore
@@ -111,7 +183,7 @@ All occurrences of the terminal will be ignored, and won't be part of the parse.

Using the `%ignore` directive results in a cleaner grammar.

It's especially important for the LALR(1) algorithm, because adding whitespace (or comments, or other extranous elements) explicitly in the grammar, harms its predictive abilities, which are based on a lookahead of 1.
It's especially important for the LALR(1) algorithm, because adding whitespace (or comments, or other extraneous elements) explicitly in the grammar, harms its predictive abilities, which are based on a lookahead of 1.

**Syntax:**
```html


docs/how_to_develop.md (+2 / -2)

@@ -7,7 +7,7 @@ There are many ways you can help the project:
* Write new grammars for Lark's library
* Write a blog post introducing Lark to your audience
* Port Lark to another language
* Help me with code developemnt
* Help me with code development

If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process.

@@ -60,4 +60,4 @@ Another way to run the tests is using setup.py:

```bash
python setup.py test
```
```

docs/how_to_use.md (+1 / -1)

@@ -10,7 +10,7 @@ This is the recommended process for working with Lark:

3. Try your grammar in Lark against each input sample. Make sure the resulting parse-trees make sense.

4. Use Lark's grammar features to [[shape the tree|Tree Construction]]: Get rid of superfluous rules by inlining them, and use aliases when specific cases need clarification.
4. Use Lark's grammar features to [shape the tree](tree_construction.md): Get rid of superfluous rules by inlining them, and use aliases when specific cases need clarification.

- You can perform steps 1-4 repeatedly, gradually growing your grammar to include more sentences.



docs/index.md (+3 / -2)

@@ -35,8 +35,8 @@ $ pip install lark-parser
* [Examples](https://github.com/lark-parser/lark/tree/master/examples)
* Tutorials
* [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter
* [How to write a JSON parser](json_tutorial.md)
* External
* [How to write a JSON parser](json_tutorial.md) - Teaches you how to use Lark
* Unofficial
* [Program Synthesis is Possible](https://www.cs.cornell.edu/~asampson/blog/minisynth.html) - Creates a DSL for Z3
* Guides
* [How to use Lark](how_to_use.md)
@@ -44,6 +44,7 @@ $ pip install lark-parser
* Reference
* [Grammar](grammar.md)
* [Tree Construction](tree_construction.md)
* [Visitors & Transformers](visitors.md)
* [Classes](classes.md)
* [Cheatsheet (PDF)](lark_cheatsheet.pdf)
* Discussion


docs/json_tutorial.md (+10 / -5)

@@ -230,7 +230,8 @@ from lark import Transformer
class MyTransformer(Transformer):
def list(self, items):
return list(items)
def pair(self, (k,v)):
def pair(self, key_value):
k, v = key_value
return k, v
def dict(self, items):
return dict(items)
@@ -251,9 +252,11 @@ Also, our definitions of list and dict are a bit verbose. We can do better:
from lark import Transformer

class TreeToJson(Transformer):
def string(self, (s,)):
def string(self, s):
(s,) = s
return s[1:-1]
def number(self, (n,)):
def number(self, n):
(n,) = n
return float(n)

list = list
@@ -315,9 +318,11 @@ json_grammar = r"""
"""

class TreeToJson(Transformer):
def string(self, (s,)):
def string(self, s):
(s,) = s
return s[1:-1]
def number(self, (n,)):
def number(self, n):
(n,) = n
return float(n)

list = list


docs/parsers.md (+3 / -3)

@@ -5,9 +5,9 @@ Lark implements the following parsing algorithms: Earley, LALR(1), and CYK

An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar at O(n^3), and O(n^2) when the grammar is unambiguous. It can parse most LR grammars at O(n). Most programming languages are LR, and can be parsed in linear time.

Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitely using `lexer='dynamic'`.
Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`.

It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`

**SPPF & Ambiguity resolution**

@@ -21,7 +21,7 @@ Lark provides the following options to combat ambiguity:

1) Lark will choose the best derivation for you (default). Users can choose between different disambiguation strategies, and can prioritize (or demote) individual rules over others, using the rule-priority syntax.

2) Users may choose to recieve the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs.
2) Users may choose to receive the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs.

3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. Future versions of Lark intend to improve and simplify this interface.



docs/recipes.md (+6 / -6)

@@ -19,18 +19,18 @@ It only works with the standard and contextual lexers.
### Example 1: Replace string values with ints for INT tokens

```python
from lark import Lark, Token
from lark import Lark, Transformer

def tok_to_int(tok):
    "Convert the value of `tok` from string to int, while maintaining line number & column."
    # tok.type == 'INT'
    return Token.new_borrow_pos(tok.type, int(tok), tok)
class T(Transformer):
    def INT(self, tok):
        "Convert the value of `tok` from string to int, while maintaining line number & column."
        return tok.update(value=int(tok))

parser = Lark("""
start: INT*
%import common.INT
%ignore " "
""", parser="lalr", lexer_callbacks = {'INT': tok_to_int})
""", parser="lalr", transformer=T())

print(parser.parse('3 14 159'))
```


docs/tree_construction.md (+24 / -0)

@@ -7,6 +7,12 @@ For example, the rule `node: child1 child2` will create a tree node with two chi

Using `item+` or `item*` will result in a list of items, equivalent to writing `item item item ..`.

Using `item?` will return the item if it matched, or nothing.

If `maybe_placeholders=False` (the default), then `[]` behaves like `()?`.

If `maybe_placeholders=True`, then using `[item]` will return the item if it matched, or the value `None`, if it didn't.
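
A rough sketch of the difference (the grammar is invented; the commented trees reflect the behavior described above):

```python
from lark import Lark

grammar = '''
start: "a" [bee]
bee: "b"
'''

# With placeholders, an unmatched [] slot becomes an explicit None child
print(Lark(grammar, maybe_placeholders=True).parse("a"))
# expected: Tree(start, [None])

# Without placeholders, [] behaves like ()? and the slot is simply absent
print(Lark(grammar, maybe_placeholders=False).parse("a"))
# expected: Tree(start, [])
```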

### Terminals

Terminals are always values in the tree, never branches.
@@ -23,6 +29,24 @@ Lark filters out certain types of terminals by default, considering them punctua
- Unnamed regular expressions (like `/[0-9]/`)
- Named terminals whose name starts with a letter (like `DIGIT`)

Note: Terminals composed of literals and other terminals always include the entire match without filtering any part.

**Example:**
```
start: PNAME pname

PNAME: "(" NAME ")"
pname: "(" NAME ")"

NAME: /\w+/
%ignore /\s+/
```
Lark will parse "(Hello) (World)" as:

    start
      (Hello)
      pname   World

Rules prefixed with `!` will retain all their literals regardless.




docs/visitors.md (+117 / -0)

@@ -0,0 +1,117 @@
## Transformers & Visitors

Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns.

They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each method accepts the children as an argument. That can be modified using the `v_args` decorator, which allows you to inline the arguments (akin to `*args`), or to add the tree's `meta` property as an argument.

See: <a href="https://github.com/lark-parser/lark/blob/master/lark/visitors.py">visitors.py</a>

### Visitors

Visitors visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up, starting with the leaves and ending at the root of the tree.

**Example**
```python
class IncreaseAllNumbers(Visitor):
    def number(self, tree):
        assert tree.data == "number"
        tree.children[0] += 1

IncreaseAllNumbers().visit(parse_tree)
```

There are two classes that implement the visitor interface:

* Visitor - Visit every node (without recursion)

* Visitor_Recursive - Visit every node using recursion. Slightly faster.

### Transformers

Transformers visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree.

Transformers can be used to implement map & reduce patterns.

Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable).

Transformers can be chained into a new transformer by using multiplication.

`Transformer` can do anything `Visitor` can do, but because it reconstructs the tree, it is slightly less efficient.


**Example:**
```python
from lark import Tree, Transformer

class EvalExpressions(Transformer):
    def expr(self, args):
        return eval(args[0])

t = Tree('a', [Tree('expr', ['1+2'])])
print(EvalExpressions().transform( t ))

# Prints: Tree(a, [3])
```
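
The multiplication-based chaining mentioned above can be sketched like this (both transformer classes are invented for illustration):

```python
from lark import Tree, Transformer

class DoubleNumbers(Transformer):
    def number(self, children):
        return Tree('number', [children[0] * 2])

class SumNumbers(Transformer):
    def start(self, children):
        return sum(c.children[0] for c in children)

tree = Tree('start', [Tree('number', [1]), Tree('number', [2])])

# T1 * T2 builds a chain: DoubleNumbers runs first, and its output is fed to SumNumbers
chained = DoubleNumbers() * SumNumbers()
print(chained.transform(tree))   # expected: 6
```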

All these classes implement the transformer interface:

- Transformer - Recursively transforms the tree. This is the one you probably want.
- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances
- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances

### visit_tokens

By default, transformers only visit rules. `visit_tokens=True` will tell Transformer to visit tokens as well. This is a slightly slower alternative to `lexer_callbacks`, but it's easier to maintain and works for all algorithms (even when there isn't a lexer).

Example:

```python
class T(Transformer):
    INT = int
    NUMBER = float
    def NAME(self, name):
        return lookup_dict.get(name, name)


T(visit_tokens=True).transform(tree)
```


### v_args

`v_args` is a decorator.

By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior.

When used on a transformer/visitor class definition, it applies to all the callback methods inside it.

`v_args` accepts one of three flags:

- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists).
- `meta` - Provides two arguments: `children` and `meta` (instead of just the first)
- `tree` - Provides the entire tree as the argument, instead of the children.

Examples:

```python
@v_args(inline=True)
class SolveArith(Transformer):
    def add(self, left, right):
        return left + right


class ReverseNotation(Transformer_InPlace):
    @v_args(tree=True)
    def tree_node(self, tree):
        tree.children = tree.children[::-1]
```

### Discard

When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent.
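
For example (a sketch; the rule name `comment` is invented):

```python
from lark.visitors import Transformer, Discard

class StripComments(Transformer):
    def comment(self, children):
        # Discarding removes this subtree from its parent's children entirely
        raise Discard

# usage: StripComments().transform(tree) returns the tree without any `comment` nodes
```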



examples/README.md (+1 / -0)

@@ -27,6 +27,7 @@ For example, the following will parse all the Python files in the standard libra

- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser
- [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!)
- [python\_bytecode.py](python_bytecode.py) - A toy example showing how to compile Python directly to bytecode
- [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language
- [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language
- [custom\_lexer.py](custom_lexer.py) - Demonstrates using a custom lexer to parse a non-textual stream of data


examples/json_parser.py (+11 / -1)

@@ -49,11 +49,21 @@ class TreeToJson(Transformer):
false = lambda self, _: False


### Create the JSON parser with Lark, using the Earley algorithm
# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
# def parse(x):
# return TreeToJson().transform(json_parser.parse(x))

json_parser = Lark(json_grammar, parser='lalr', lexer='standard', transformer=TreeToJson())
### Create the JSON parser with Lark, using the LALR algorithm
json_parser = Lark(json_grammar, parser='lalr',
                   # Using the standard lexer isn't required, and isn't usually recommended.
                   # But, it's good enough for JSON, and it's slightly faster.
                   lexer='standard',
                   # Disabling propagate_positions and placeholders slightly improves speed
                   propagate_positions=False,
                   maybe_placeholders=False,
                   # Using an internal transformer is faster and more memory efficient
                   transformer=TreeToJson())
parse = json_parser.parse




examples/lark.lark (+6 / -4)

@@ -10,10 +10,12 @@ token: TOKEN priority? ":" expansions _NL
priority: "." NUMBER

statement: "%ignore" expansions _NL -> ignore
| "%import" import_args ["->" name] _NL -> import
| "%import" import_path ["->" name] _NL -> import
| "%import" import_path name_list _NL -> multi_import
| "%declare" name+ -> declare

import_args: "."? name ("." name)*
!import_path: "."? name ("." name)*
name_list: "(" name ("," name)* ")"

?expansions: alias (_VBAR alias)*

@@ -33,7 +35,7 @@ name: RULE
| TOKEN

_VBAR: _NL? "|"
OP: /[+*][?]?|[?](?![a-z])/
OP: /[+*]|[?](?![a-z])/
RULE: /!?[_?]?[a-z][_a-z0-9]*/
TOKEN: /_?[A-Z][_A-Z0-9]*/
STRING: _STRING "i"?
@@ -44,7 +46,7 @@ _NL: /(\r?\n)+\s*/
%import common.INT -> NUMBER
%import common.WS_INLINE

COMMENT: "//" /[^\n]/*
COMMENT: /\s*/ "//" /[^\n]/*

%ignore WS_INLINE
%ignore COMMENT

examples/python3.lark (+5 / -5)

@@ -81,7 +81,7 @@ with_item: test ["as" expr]
except_clause: "except" [test ["as" NAME]]
suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT

?test: or_test ["if" or_test "else" test] | lambdef
?test: or_test ("if" or_test "else" test)? | lambdef
?test_nocond: or_test | lambdef_nocond
lambdef: "lambda" [varargslist] ":" test
lambdef_nocond: "lambda" [varargslist] ":" test_nocond
@@ -107,7 +107,7 @@ star_expr: "*" expr
// sake of a __future__ import described in PEP 401 (which really works :-)
!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"

?power: await_expr ["**" factor]
?power: await_expr ("**" factor)?
?await_expr: AWAIT? atom_expr
AWAIT: "await"

@@ -137,7 +137,7 @@ dictorsetmaker: ( ((test ":" test | "**" expr) (comp_for | ("," (test ":" test |

classdef: "class" NAME ["(" [arguments] ")"] ":" suite

arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])?
| starargs
| kwargs
| test comp_for
@@ -145,7 +145,7 @@ arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs]
kwargs: "**" test

?argvalue: test ["=" test]
?argvalue: test ("=" test)?



@@ -178,7 +178,7 @@ HEX_NUMBER.2: /0x[\da-f]*/i
OCT_NUMBER.2: /0o[0-7]*/i
BIN_NUMBER.2 : /0b[0-1]*/i
FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
IMAG_NUMBER.2: /\d+j|${FLOAT_NUMBER}j/i
IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i

%ignore /[\t \f]+/ // WS
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT


examples/python_bytecode.py (+77 / -0)

@@ -0,0 +1,77 @@
#
# This is a toy example that compiles Python directly to bytecode, without generating an AST.
# It currently only works for very very simple Python code.
#
# It requires the 'bytecode' library. You can get it using
#
# $ pip install bytecode
#

from lark import Lark, Transformer, v_args
from lark.indenter import Indenter

from bytecode import Instr, Bytecode

class PythonIndenter(Indenter):
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8


@v_args(inline=True)
class Compile(Transformer):
    def number(self, n):
        return [Instr('LOAD_CONST', int(n))]
    def string(self, s):
        return [Instr('LOAD_CONST', s[1:-1])]
    def var(self, n):
        return [Instr('LOAD_NAME', n)]

    def arith_expr(self, a, op, b):
        # TODO support chain arithmetic
        assert op == '+'
        return a + b + [Instr('BINARY_ADD')]

    def arguments(self, args):
        return args

    def funccall(self, name, args):
        return name + args + [Instr('CALL_FUNCTION', 1)]

    @v_args(inline=False)
    def file_input(self, stmts):
        return sum(stmts, []) + [Instr("RETURN_VALUE")]

    def expr_stmt(self, lval, rval):
        # TODO more complicated than that
        name ,= lval
        assert name.name == 'LOAD_NAME' # XXX avoid with another layer of abstraction
        return rval + [Instr("STORE_NAME", name.arg)]

    def __default__(self, *args):
        assert False, args


python_parser3 = Lark.open('python3.lark', rel_to=__file__, start='file_input',
                           parser='lalr', postlex=PythonIndenter(),
                           transformer=Compile(), propagate_positions=False)

def compile_python(s):
    insts = python_parser3.parse(s+"\n")
    return Bytecode(insts).to_code()

code = compile_python("""
a = 3
b = 5
print("Hello World!")
print(a+(b+2))
print((a+b)+2)
""")
exec(code)
# -- Output --
# Hello World!
# 10
# 10

examples/reconstruct_json.py (+2 / -8)

@@ -25,15 +25,9 @@ test_json = '''

def test_earley():

json_parser = Lark(json_grammar)
json_parser = Lark(json_grammar, maybe_placeholders=False)
tree = json_parser.parse(test_json)

# print ('@@', tree.pretty())
# for x in tree.find_data('true'):
# x.data = 'false'
# # x.children[0].value = '"HAHA"'


new_json = Reconstructor(json_parser).reconstruct(tree)
print (new_json)
print (json.loads(new_json) == json.loads(test_json))
@@ -41,7 +35,7 @@ def test_earley():

def test_lalr():

json_parser = Lark(json_grammar, parser='lalr')
json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
tree = json_parser.parse(test_json)

new_json = Reconstructor(json_parser).reconstruct(tree)


examples/standalone/create_standalone.sh (+1 / -0)

@@ -1 +1,2 @@
#!/bin/sh
PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py

examples/standalone/json_parser.py (+418 / -197)
File diff suppressed because it is too large


lark/__init__.py (+1 / -1)

@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
from .lexer import Token
from .lark import Lark

__version__ = "0.7.0"
__version__ = "0.8.1"

lark/common.py (+1 / -0)

@@ -20,6 +20,7 @@ class LexerConf(Serialize):

class ParserConf:
def __init__(self, rules, callbacks, start):
assert isinstance(start, list)
self.rules = rules
self.callbacks = callbacks
self.start = start


lark/exceptions.py (+14 / -4)

@@ -13,6 +13,14 @@ class ParseError(LarkError):
class LexError(LarkError):
pass

class UnexpectedEOF(ParseError):
def __init__(self, expected):
self.expected = expected

message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
super(UnexpectedEOF, self).__init__(message)


class UnexpectedInput(LarkError):
pos_in_stream = None

@@ -52,7 +60,7 @@ class UnexpectedInput(LarkError):


class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

self.line = line
@@ -65,6 +73,8 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
message += '\n\n' + self.get_context(seq)
if allowed:
message += '\nExpecting: %s\n' % allowed
if token_history:
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)

super(UnexpectedCharacters, self).__init__(message)

@@ -87,10 +97,10 @@ class UnexpectedToken(ParseError, UnexpectedInput):
super(UnexpectedToken, self).__init__(message)

class VisitError(LarkError):
def __init__(self, tree, orig_exc):
self.tree = tree
def __init__(self, rule, obj, orig_exc):
self.obj = obj
self.orig_exc = orig_exc

message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc)
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)
###}

lark/grammar.py (+4 / -2)

@@ -3,6 +3,8 @@ from .utils import Serialize
###{standalone

class Symbol(Serialize):
__slots__ = ('name',)

is_term = NotImplemented

def __init__(self, name):
@@ -79,7 +81,7 @@ class Rule(Serialize):
self.expansion = expansion
self.alias = alias
self.order = order
self.options = options
self.options = options or RuleOptions()
self._hash = hash((self.origin, tuple(self.expansion)))

def _deserialize(self):
@@ -101,4 +103,4 @@ class Rule(Serialize):



###}
###}

lark/lark.py (+41 / -40)

@@ -1,8 +1,6 @@
from __future__ import absolute_import

import os
import time
from collections import defaultdict
from io import open

from .utils import STRING_TYPE, Serialize, SerializeMemoizer
@@ -43,8 +41,7 @@ class LarkOptions(Serialize):
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
cache_grammar - Cache the Lark grammar (Default: False)
postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
start - The start symbol (Default: start)
profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False)
start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start")
priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
@@ -63,12 +60,12 @@ class LarkOptions(Serialize):
'lexer': 'auto',
'transformer': None,
'start': 'start',
'profile': False,
'priority': 'auto',
'ambiguity': 'auto',
'propagate_positions': False,
'lexer_callbacks': {},
'maybe_placeholders': False,
'edit_terminals': None,
}

def __init__(self, options_dict):
@@ -85,6 +82,9 @@ class LarkOptions(Serialize):

options[name] = value

if isinstance(options['start'], STRING_TYPE):
options['start'] = [options['start']]

self.__dict__['options'] = options

assert self.parser in ('earley', 'lalr', 'cyk', None)
@@ -97,7 +97,11 @@ class LarkOptions(Serialize):
raise ValueError("Unknown options: %s" % o.keys())

def __getattr__(self, name):
return self.options[name]
try:
return self.options[name]
except KeyError as e:
raise AttributeError(e)

def __setattr__(self, name, value):
assert name in self.options
self.options[name] = value
@@ -110,30 +114,6 @@ class LarkOptions(Serialize):
return cls(data)


class Profiler:
def __init__(self):
self.total_time = defaultdict(float)
self.cur_section = '__init__'
self.last_enter_time = time.time()

def enter_section(self, name):
cur_time = time.time()
self.total_time[self.cur_section] += cur_time - self.last_enter_time
self.last_enter_time = cur_time
self.cur_section = name

def make_wrapper(self, name, f):
def wrapper(*args, **kwargs):
last_section = self.cur_section
self.enter_section(name)
try:
return f(*args, **kwargs)
finally:
self.enter_section(last_section)

return wrapper


class Lark(Serialize):
def __init__(self, grammar, **options):
"""
@@ -161,9 +141,6 @@ class Lark(Serialize):
if self.options.cache_grammar:
raise NotImplementedError("Not available yet")

assert not self.options.profile, "Feature temporarily disabled"
# self.profiler = Profiler() if self.options.profile else None

if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
self.options.lexer = 'contextual'
@@ -200,22 +177,37 @@ class Lark(Serialize):
self.grammar = load_grammar(grammar, self.source)

# Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile()
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)

if self.options.edit_terminals:
for t in self.terminals:
self.options.edit_terminals(t)

self._terminals_dict = {t.name:t for t in self.terminals}

# If the user asked to invert the priorities, negate them all here.
# This replaces the old 'resolve__antiscore_sum' option.
if self.options.priority == 'invert':
for rule in self.rules:
if rule.options and rule.options.priority is not None:
if rule.options.priority is not None:
rule.options.priority = -rule.options.priority
# Else, if the user asked to disable priorities, strip them from the
# rules. This allows the Earley parsers to skip an extra forest walk
# for improved performance, if you don't need them (or didn't specify any).
elif self.options.priority == None:
for rule in self.rules:
if rule.options and rule.options.priority is not None:
if rule.options.priority is not None:
rule.options.priority = None
self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks)

# TODO Deprecate lexer_callbacks?
lexer_callbacks = dict(self.options.lexer_callbacks)
if self.options.transformer:
t = self.options.transformer
for term in self.terminals:
if hasattr(t, term.name):
lexer_callbacks[term.name] = getattr(t, term.name)

self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks)

if self.options.parser:
self.parser = self._build_parser()
@@ -287,8 +279,17 @@ class Lark(Serialize):
return self.options.postlex.process(stream)
return stream

def parse(self, text):
"Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
return self.parser.parse(text)
def get_terminal(self, name):
"Get information about a terminal"
return self._terminals_dict[name]

def parse(self, text, start=None):
"""Parse the given text, according to the options provided.

The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option).

Returns a tree, unless specified otherwise.
"""
return self.parser.parse(text, start=start)

###}
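
The multi-start support added above can be exercised roughly like this (a sketch; the grammar and rule names are invented, assuming lark 0.8):

```python
from lark import Lark

parser = Lark("""
    greet: "hello" NUMBER
    count: NUMBER+
    %import common.NUMBER
    %ignore " "
""", parser="lalr", start=["greet", "count"])

# When several start rules were declared, the one to use is picked at parse time
print(parser.parse("hello 42", start="greet"))
print(parser.parse("1 2 3", start="count"))
```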

lark/lexer.py (+84 / -50)

@@ -3,12 +3,11 @@
import re

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone

class Pattern(Serialize):
__serialize_fields__ = 'value', 'flags'

def __init__(self, value, flags=()):
self.value = value
@@ -41,6 +40,10 @@ class Pattern(Serialize):


class PatternStr(Pattern):
__serialize_fields__ = 'value', 'flags'

type = "str"

def to_regexp(self):
return self._get_flags(re.escape(self.value))

@@ -50,15 +53,25 @@ class PatternStr(Pattern):
max_width = min_width

class PatternRE(Pattern):
__serialize_fields__ = 'value', 'flags', '_width'

type = "re"

def to_regexp(self):
return self._get_flags(self.value)

_width = None
def _get_width(self):
if self._width is None:
self._width = get_regexp_width(self.to_regexp())
return self._width

@property
def min_width(self):
return get_regexp_width(self.to_regexp())[0]
return self._get_width()[0]
@property
def max_width(self):
return get_regexp_width(self.to_regexp())[1]
return self._get_width()[1]


class TerminalDef(Serialize):
@@ -77,9 +90,9 @@ class TerminalDef(Serialize):


class Token(Str):
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None):
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
try:
self = super(Token, cls).__new__(cls, value)
except UnicodeDecodeError:
@@ -93,11 +106,19 @@ class Token(Str):
self.column = column
self.end_line = end_line
self.end_column = end_column
self.end_pos = end_pos
return self

def update(self, type_=None, value=None):
return Token.new_borrow_pos(
type_ if type_ is not None else self.type,
value if value is not None else self.value,
self
)

@classmethod
def new_borrow_pos(cls, type_, value, borrow_t):
return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column)
return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)

def __reduce__(self):
return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
@@ -149,38 +170,38 @@ class _Lex:
newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types)
line_ctr = LineCounter()
last_token = None

while line_ctr.char_pos < len(stream):
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, line_ctr.char_pos)
if not m:
continue

t = None
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
yield t
else:
if type_ in lexer.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t)
res = lexer.match(stream, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

line_ctr.feed(value, type_ in newline_types)
if t:
t.end_line = line_ctr.line
t.end_column = line_ctr.column
value, type_ = res

break
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
line_ctr.feed(value, type_ in newline_types)
t.end_line = line_ctr.line
t.end_column = line_ctr.column
t.end_pos = line_ctr.char_pos
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
yield t
last_token = t
else:
allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state)
if type_ in lexer.callback:
t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t2)
line_ctr.feed(value, type_ in newline_types)




class UnlessCallback:
@@ -253,23 +274,21 @@ def build_mres(terminals, match_whole=False):
return _build_mres(terminals, len(terminals), match_whole)

def _regexp_has_newline(r):
"""Expressions that may indicate newlines in a regexp:
r"""Expressions that may indicate newlines in a regexp:
- newlines (\n)
- escaped newline (\\n)
- anything but ([^...])
- any-char (.) when the flag (?s) exists
- spaces (\s)
"""
return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)

class Lexer(object):
"""Lexer interface

Method Signatures:
lex(self, stream) -> Iterator[Token]

set_parser_state(self, state) # Optional
"""
set_parser_state = NotImplemented
lex = NotImplemented


@@ -284,7 +303,7 @@ class TraditionalLexer(Lexer):
for t in terminals:
try:
re.compile(t.pattern.to_regexp())
except:
except re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

if t.pattern.min_width == 0:
@@ -314,6 +333,11 @@ class TraditionalLexer(Lexer):

self.mres = build_mres(terminals)

def match(self, stream, pos):
for mre, type_from_index in self.mres:
m = mre.match(stream, pos)
if m:
return m.group(0), type_from_index[m.lastindex]

def lex(self, stream):
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
@@ -322,6 +346,7 @@ class TraditionalLexer(Lexer):


class ContextualLexer(Lexer):

def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
tokens_by_name = {}
for t in terminals:
@@ -344,16 +369,25 @@ class ContextualLexer(Lexer):

self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)

self.set_parser_state(None) # Needs to be set on the outside

def set_parser_state(self, state):
self.parser_state = state

def lex(self, stream):
l = _Lex(self.lexers[self.parser_state], self.parser_state)
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
l.lexer = self.lexers[self.parser_state]
l.state = self.parser_state
def lex(self, stream, get_parser_state):
parser_state = get_parser_state()
l = _Lex(self.lexers[parser_state], parser_state)
try:
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
parser_state = get_parser_state()
l.lexer = self.lexers[parser_state]
l.state = parser_state # For debug only, no need to worry about multithreading
except UnexpectedCharacters as e:
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
# but not in the current context.
# This tests the input against the global context, to provide a nicer error.
root_match = self.root_lexer.match(stream, e.pos_in_stream)
if not root_match:
raise

value, type_ = root_match
t = Token(type_, value, e.pos_in_stream, e.line, e.column)
raise UnexpectedToken(t, e.allowed, state=e.state)

###}
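
The `Token.update` helper added above keeps the position attributes while swapping in a new value; roughly (a sketch, values invented):

```python
from lark import Token

tok = Token('INT', '42', pos_in_stream=0, line=1, column=1)

# update() borrows the original position info and only replaces type/value
new_tok = tok.update(value=int(tok))
print(new_tok.type, new_tok.value, new_tok.line, new_tok.column)
```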

lark/load_grammar.py (+66 / -57)

@@ -2,17 +2,17 @@

import os.path
import sys
from ast import literal_eval
from copy import copy, deepcopy
from io import open

from .utils import bfs
from .utils import bfs, eval_escaping
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list
from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken

from .tree import Tree, SlottedTree as ST
@@ -73,11 +73,12 @@ TERMINALS = {
'_RPAR': r'\)',
'_LBRA': r'\[',
'_RBRA': r'\]',
'OP': '[+*][?]?|[?](?![a-z])',
'OP': '[+*]|[?](?![a-z])',
'_COLON': ':',
'_COMMA': ',',
'_OR': r'\|',
'_DOT': r'\.',
'_DOT': r'\.(?!\.)',
'_DOTDOT': r'\.\.',
'TILDE': '~',
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*',
@@ -85,12 +86,12 @@ TERMINALS = {
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'//[^\n]*',
'COMMENT': r'\s*//[^\n]*',
'_TO': '->',
'_IGNORE': r'%ignore',
'_DECLARE': r'%declare',
'_IMPORT': r'%import',
'NUMBER': r'\d+',
'NUMBER': r'[+-]?\d+',
}

RULES = {
@@ -112,7 +113,7 @@ RULES = {
'?expr': ['atom',
'atom OP',
'atom TILDE NUMBER',
'atom TILDE NUMBER _DOT _DOT NUMBER',
'atom TILDE NUMBER _DOTDOT NUMBER',
],

'?atom': ['_LPAR expansions _RPAR',
@@ -130,7 +131,7 @@ RULES = {
'?name': ['RULE', 'TERMINAL'],

'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOT _DOT STRING'],
'range': ['STRING _DOTDOT STRING'],

'term': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
@@ -196,7 +197,7 @@ class EBNF_to_BNF(Transformer_InPlace):
mn = mx = int(args[0])
else:
mn, mx = map(int, args)
if mx < mn:
if mx < mn or mn < 0:
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
assert False, op
@@ -205,7 +206,7 @@ class EBNF_to_BNF(Transformer_InPlace):
keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens

def will_not_get_removed(sym):
if isinstance(sym, NonTerminal):
if isinstance(sym, NonTerminal):
return not sym.name.startswith('_')
if isinstance(sym, Terminal):
return keep_all_tokens or not sym.filter_out
@@ -345,28 +346,6 @@ def _rfind(s, choices):



def _fix_escaping(s):
w = ''
i = iter(s)
for n in i:
w += n
if n == '\\':
n2 = next(i)
if n2 == '\\':
w += '\\\\'
elif n2 not in 'uxnftr':
w += '\\'
w += n2
w = w.replace('\\"', '"').replace("'", "\\'")

to_eval = "u'''%s'''" % w
try:
s = literal_eval(to_eval)
except SyntaxError as e:
raise ValueError(s, e)

return s


def _literal_to_pattern(literal):
v = literal.value
@@ -379,7 +358,7 @@ def _literal_to_pattern(literal):
assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1]

s = _fix_escaping(x)
s = eval_escaping(x)

if literal.type == 'STRING':
s = s.replace('\\\\', '\\')
@@ -397,7 +376,7 @@ class PrepareLiterals(Transformer_InPlace):
assert start.type == end.type == 'STRING'
start = start.value[1:-1]
end = end.value[1:-1]
assert len(_fix_escaping(start)) == len(_fix_escaping(end)) == 1, (start, end, len(_fix_escaping(start)), len(_fix_escaping(end)))
assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1, (start, end, len(eval_escaping(start)), len(eval_escaping(end)))
regexp = '[%s-%s]' % (start, end)
return ST('pattern', [PatternRE(regexp)])

@@ -451,9 +430,9 @@ class PrepareSymbols(Transformer_InPlace):
if isinstance(v, Tree):
return v
elif v.type == 'RULE':
return NonTerminal(v.value)
return NonTerminal(Str(v.value))
elif v.type == 'TERMINAL':
return Terminal(v.value, filter_out=v.startswith('_'))
return Terminal(Str(v.value), filter_out=v.startswith('_'))
assert False

def _choice_of_rules(rules):
@@ -465,7 +444,7 @@ class Grammar:
self.rule_defs = rule_defs
self.ignore = ignore

def compile(self):
def compile(self, start):
# We change the trees in-place (to support huge grammars)
# So deepcopy allows calling compile more than once.
term_defs = deepcopy(list(self.term_defs))
@@ -476,7 +455,7 @@ class Grammar:
# ===================

# Convert terminal-trees to strings/regexps
transformer = PrepareLiterals() * TerminalTreeToPattern()
for name, (term_tree, priority) in term_defs:
if term_tree is None: # Terminal added through %declare
continue
@@ -484,7 +463,8 @@ class Grammar:
if len(expansions) == 1 and not expansions[0].children:
raise GrammarError("Terminals cannot be empty (%s)" % name)

terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
transformer = PrepareLiterals() * TerminalTreeToPattern()
terminals = [TerminalDef(name, transformer.transform( term_tree ), priority)
for name, (term_tree, priority) in term_defs if term_tree]

# =================
@@ -498,7 +478,8 @@ class Grammar:
ebnf_to_bnf = EBNF_to_BNF()
rules = []
for name, rule_tree, options in rule_defs:
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None
ebnf_to_bnf.prefix = name
tree = transformer.transform(rule_tree)
res = ebnf_to_bnf.transform(tree)
rules.append((name, res, options))
@@ -511,18 +492,18 @@ class Grammar:

simplify_rule = SimplifyRule_Visitor()
compiled_rules = []
for i, rule_content in enumerate(rules):
for rule_content in rules:
name, tree, options = rule_content
simplify_rule.visit(tree)
expansions = rule_tree_to_text.transform(tree)

for expansion, alias in expansions:
for i, (expansion, alias) in enumerate(expansions):
if alias and name.startswith('_'):
raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

empty_indices = [x==_EMPTY for i, x in enumerate(expansion)]
empty_indices = [x==_EMPTY for x in expansion]
if any(empty_indices):
exp_options = copy(options) if options else RuleOptions()
exp_options = copy(options) or RuleOptions()
exp_options.empty_indices = empty_indices
expansion = [x for x in expansion if x!=_EMPTY]
else:
@@ -538,7 +519,8 @@ class Grammar:
for dups in duplicates.values():
if len(dups) > 1:
if dups[0].expansion:
raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)"
% ''.join('\n * %s' % i for i in dups))

# Empty rule; assert all other attributes are equal
assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)
@@ -546,6 +528,19 @@ class Grammar:
# Remove duplicates
compiled_rules = list(set(compiled_rules))


# Filter out unused rules
while True:
c = len(compiled_rules)
used_rules = {s for r in compiled_rules
for s in r.expansion
if isinstance(s, NonTerminal)
and s != r.origin}
used_rules |= {NonTerminal(s) for s in start}
compiled_rules = [r for r in compiled_rules if r.origin in used_rules]
if len(compiled_rules) == c:
break

# Filter out unused terminals
used_terms = {t.name for r in compiled_rules
for t in r.expansion
@@ -563,13 +558,13 @@ def import_grammar(grammar_path, base_paths=[]):
for import_path in import_paths:
with suppress(IOError):
joined_path = os.path.join(import_path, grammar_path)
with open(joined_path) as f:
with open(joined_path, encoding='utf8') as f:
text = f.read()
grammar = load_grammar(text, joined_path)
_imported_grammars[grammar_path] = grammar
break
else:
open(grammar_path)
open(grammar_path, encoding='utf8')
assert False

return _imported_grammars[grammar_path]
@@ -592,7 +587,9 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
_, tree, _ = imported_rules[symbol]
except KeyError:
raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL'))

return _find_used_symbols(tree)


def get_namespace_name(name):
try:
@@ -620,11 +617,10 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):


def resolve_term_references(term_defs):
# TODO Cycles detection
# TODO Solve with transitive closure (maybe)

token_dict = {k:t for k, (t,_p) in term_defs}
assert len(token_dict) == len(term_defs), "Same name defined twice?"
term_dict = {k:t for k, (t,_p) in term_defs}
assert len(term_dict) == len(term_defs), "Same name defined twice?"

while True:
changed = False
@@ -637,11 +633,21 @@ def resolve_term_references(term_defs):
if item.type == 'RULE':
raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
if item.type == 'TERMINAL':
exp.children[0] = token_dict[item]
term_value = term_dict[item]
assert term_value is not None
exp.children[0] = term_value
changed = True
if not changed:
break

for name, term in term_dict.items():
if term: # Not just declared
for child in term.children:
ids = [id(x) for x in child.iter_subtrees()]
if id(term) in ids:
raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)


def options_from_rule(name, *x):
if len(x) > 1:
priority, expansions = x
@@ -669,6 +675,11 @@ class PrepareGrammar(Transformer_InPlace):
return name


def _find_used_symbols(tree):
assert tree.data == 'expansions'
return {t for x in tree.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

class GrammarLoader:
def __init__(self):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
@@ -678,7 +689,7 @@ class GrammarLoader:
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, 'start')
parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

self.canonize_tree = CanonizeTree()
@@ -830,9 +841,7 @@ class GrammarLoader:
rule_names.add(name)

for name, expansions, _o in rules:
used_symbols = {t for x in expansions.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
for sym in used_symbols:
for sym in _find_used_symbols(expansions):
if sym.type == 'TERMINAL':
if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))


+ 33 - 9  lark/parse_tree_builder.py

@@ -2,6 +2,8 @@ from .exceptions import GrammarError
from .lexer import Token
from .tree import Tree
from .visitors import InlineTransformer # XXX Deprecated
from .visitors import Transformer_InPlace
from .visitors import _vargs_meta, _vargs_meta_inline

###{standalone
from functools import partial, wraps
@@ -27,7 +29,7 @@ class PropagatePositions:

if isinstance(res, Tree):
for c in children:
if isinstance(c, Tree) and c.children and not c.meta.empty:
if isinstance(c, Tree) and not c.meta.empty:
res.meta.line = c.meta.line
res.meta.column = c.meta.column
res.meta.start_pos = c.meta.start_pos
@@ -41,7 +43,7 @@ class PropagatePositions:
break

for c in reversed(children):
if isinstance(c, Tree) and c.children and not c.meta.empty:
if isinstance(c, Tree) and not c.meta.empty:
res.meta.end_line = c.meta.end_line
res.meta.end_column = c.meta.end_column
res.meta.end_pos = c.meta.end_pos
@@ -50,7 +52,7 @@ class PropagatePositions:
elif isinstance(c, Token):
res.meta.end_line = c.end_line
res.meta.end_column = c.end_column
res.meta.end_pos = c.pos_in_stream + len(c.value)
res.meta.end_pos = c.end_pos
res.meta.empty = False
break

@@ -193,6 +195,23 @@ def ptb_inline_args(func):
return func(*children)
return f

def inplace_transformer(func):
@wraps(func)
def f(children):
# function name in a Transformer is a rule name.
tree = Tree(func.__name__, children)
return func(tree)
return f

def apply_visit_wrapper(func, name, wrapper):
if wrapper is _vargs_meta or wrapper is _vargs_meta_inline:
raise NotImplementedError("Meta args not supported for internal transformer")
@wraps(func)
def f(children):
return wrapper(func, name, children, None)
return f


class ParseTreeBuilder:
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False):
self.tree_class = tree_class
@@ -206,12 +225,12 @@ class ParseTreeBuilder:
def _init_builders(self, rules):
for rule in rules:
options = rule.options
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
expand_single_child = options.expand1 if options else False
keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens
expand_single_child = options.expand1

wrapper_chain = list(filter(None, [
(expand_single_child and not rule.alias) and ExpandSingleChild,
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None),
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None),
self.propagate_positions and PropagatePositions,
self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
]))
@@ -227,10 +246,15 @@ class ParseTreeBuilder:
user_callback_name = rule.alias or rule.origin.name
try:
f = getattr(transformer, user_callback_name)
assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer"
# XXX InlineTransformer is deprecated!
if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer):
f = ptb_inline_args(f)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
f = apply_visit_wrapper(f, user_callback_name, wrapper)
else:
if isinstance(transformer, InlineTransformer):
f = ptb_inline_args(f)
elif isinstance(transformer, Transformer_InPlace):
f = inplace_transformer(f)
except AttributeError:
f = partial(self.tree_class, user_callback_name)
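
For context, a sketch of the feature these callback wrappers serve: a Transformer passed directly to Lark runs while the parser reduces, instead of as a separate pass. The grammar and class below are illustrative only; the wrappers above let this work whether the Transformer is plain, in-place, or decorated with v_args.

from lark import Lark, Transformer

class IntoInts(Transformer):
    def number(self, children):
        return int(children[0])

# With transformer=..., rule callbacks run as the LALR parser reduces,
# so the resulting tree already contains ints instead of 'number' subtrees.
parser = Lark(r'''
    start: number+
    number: /[0-9]+/
    %import common.WS
    %ignore WS
''', parser='lalr', transformer=IntoInts())

assert parser.parse("1 2 3").children == [1, 2, 3]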



+ 41 - 18  lark/parser_frontends.py

@@ -44,18 +44,28 @@ def get_frontend(parser, lexer):
raise ValueError('Unknown parser: %s' % parser)


class _ParserFrontend(Serialize):
def _parse(self, input, start, *args):
if start is None:
start = self.start
if len(start) > 1:
raise ValueError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
return self.parser.parse(input, start, *args)


class WithLexer(Serialize):
class WithLexer(_ParserFrontend):
lexer = None
parser = None
lexer_conf = None
start = None

__serialize_fields__ = 'parser', 'lexer_conf'
__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf,

def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex

@classmethod
@@ -65,18 +75,17 @@ class WithLexer(Serialize):
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
inst.init_lexer()
return inst
def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize(memo)

def lex(self, text):
stream = self.lexer.lex(text)
def lex(self, *args):
stream = self.lexer.lex(*args)
return self.postlex.process(stream) if self.postlex else stream

def parse(self, text):
def parse(self, text, start=None):
token_stream = self.lex(text)
sps = self.lexer.set_parser_state
return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
return self._parse(token_stream, start)

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
@@ -104,14 +113,24 @@ class LALR_ContextualLexer(LALR_WithLexer):
ignore=self.lexer_conf.ignore,
always_accept=always_accept,
user_callbacks=self.lexer_conf.callbacks)


def parse(self, text, start=None):
parser_state = [None]
def set_parser_state(s):
parser_state[0] = s

token_stream = self.lex(text, lambda: parser_state[0])
return self._parse(token_stream, start, set_parser_state)
###}

class LALR_CustomLexer(LALR_WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
pass # TODO
self.lexer = lexer_cls(lexer_conf)
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)

def init_lexer(self):
self.lexer = lexer_cls(self.lexer_conf)

def tokenize_text(text):
line = 1
@@ -128,22 +147,26 @@ class Earley(WithLexer):
self.init_traditional_lexer()

resolve_ambiguity = options.ambiguity == 'resolve'
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
debug = options.debug if options else False
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug)

def match(self, term, token):
return term.name == token.type


class XEarley:
class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start

self._prepare_match(lexer_conf)
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
self.parser = xearley.Parser(parser_conf,
self.match,
ignore=lexer_conf.ignore,
resolve_ambiguity=resolve_ambiguity,
debug=debug,
**kw
)

@@ -166,8 +189,8 @@ class XEarley:

self.regexps[t.name] = re.compile(regexp)

def parse(self, text):
return self.parser.parse(text)
def parse(self, text, start):
return self._parse(text, start)

class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):
@@ -182,13 +205,13 @@ class CYK(WithLexer):
self.init_traditional_lexer()

self._analysis = GrammarAnalyzer(parser_conf)
self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
self.parser = cyk.Parser(parser_conf.rules)

self.callbacks = parser_conf.callbacks

def parse(self, text):
def parse(self, text, start):
tokens = list(self.lex(text))
parse = self._parser.parse(tokens)
parse = self._parse(tokens, start)
parse = self._transform(parse)
return parse
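
A hedged sketch of the multi-start API this frontend refactor enables (the grammar is made up for illustration):

from lark import Lark

parser = Lark(r'''
    greeting: "hello" NAME
    farewell: "bye" NAME
    NAME: /[a-z]+/
    %import common.WS
    %ignore WS
''', parser='lalr', start=['greeting', 'farewell'])

# With more than one start rule, parse() must name the one to use;
# otherwise _ParserFrontend._parse raises the ValueError shown above.
parser.parse("hello world", start='greeting')
parser.parse("bye friend", start='farewell')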



+ 8 - 6  lark/parsers/cyk.py

@@ -84,12 +84,11 @@ class RuleNode(object):
class Parser(object):
"""Parser wrapper."""

def __init__(self, rules, start):
def __init__(self, rules):
super(Parser, self).__init__()
self.orig_rules = {rule: rule for rule in rules}
rules = [self._to_rule(rule) for rule in rules]
self.grammar = to_cnf(Grammar(rules))
self.start = NT(start)

def _to_rule(self, lark_rule):
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
@@ -97,16 +96,19 @@ class Parser(object):
assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
return Rule(
lark_rule.origin, lark_rule.expansion,
weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0,
weight=lark_rule.options.priority if lark_rule.options.priority else 0,
alias=lark_rule)

def parse(self, tokenized): # pylint: disable=invalid-name
def parse(self, tokenized, start): # pylint: disable=invalid-name
"""Parses input, which is a list of tokens."""
assert start
start = NT(start)

table, trees = _parse(tokenized, self.grammar)
# Check if the parse succeeded.
if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]):
if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]):
raise ParseError('Parsing failed.')
parse = trees[(0, len(tokenized) - 1)][self.start]
parse = trees[(0, len(tokenized) - 1)][start]
return self._to_tree(revert_cnf(parse))

def _to_tree(self, rule_node):


+ 25 - 14  lark/parsers/earley.py

@@ -10,20 +10,22 @@ is better documented here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""

import logging
from collections import deque

from ..visitors import Transformer_InPlace, v_args
from ..exceptions import ParseError, UnexpectedToken
from ..exceptions import UnexpectedEOF, UnexpectedToken
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor

class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True):
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False):
analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.debug = debug

self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
@@ -43,13 +45,9 @@ class Parser:
# the priorities will be stripped from all rules before they reach us, allowing us to
# skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
# on any rules.
if self.forest_sum_visitor is None and rule.options and rule.options.priority is not None:
self.forest_sum_visitor = ForestSumVisitor()
if self.forest_sum_visitor is None and rule.options.priority is not None:
self.forest_sum_visitor = ForestSumVisitor

if resolve_ambiguity:
self.forest_tree_visitor = ForestToTreeVisitor(self.callbacks, self.forest_sum_visitor)
else:
self.forest_tree_visitor = ForestToAmbiguousTreeVisitor(self.callbacks, self.forest_sum_visitor)
self.term_matcher = term_matcher


@@ -272,9 +270,11 @@ class Parser:

## Column is now the final column in the parse.
assert i == len(columns)-1
return to_scan

def parse(self, stream, start_symbol=None):
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
def parse(self, stream, start):
assert start, start
start_symbol = NonTerminal(start)

columns = [set()]
to_scan = set() # The scan buffer. 'Q' in E.Scott's paper.
@@ -289,22 +289,33 @@ class Parser:
else:
columns[0].add(item)

self._parse(stream, columns, to_scan, start_symbol)
to_scan = self._parse(stream, columns, to_scan, start_symbol)

# If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
if self.debug:
from .earley_forest import ForestToPyDotVisitor
try:
debug_walker = ForestToPyDotVisitor()
except ImportError:
logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
else:
debug_walker.visit(solutions[0], "sppf.png")


if not solutions:
expected_tokens = [t.expect for t in to_scan]
# raise ParseError('Incomplete parse: Could not find a solution to input')
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
raise UnexpectedEOF(expected_tokens)
elif len(solutions) > 1:
assert False, 'Earley should not generate multiple start symbol items!'

# Perform our SPPF -> AST conversion using the right ForestVisitor.
return self.forest_tree_visitor.visit(solutions[0])
forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else ForestToAmbiguousTreeVisitor
forest_tree_visitor = forest_tree_visitor_cls(self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor())

return forest_tree_visitor.visit(solutions[0])


class ApplyCallbacks(Transformer_InPlace):


+ 3 - 3  lark/parsers/earley_forest.py

@@ -122,7 +122,7 @@ class PackedNode(ForestNode):
ambiguously. Hence, we use the sort order to identify
the order in which ambiguous children should be considered.
"""
return self.is_empty, -self.priority, -self.rule.order
return self.is_empty, -self.priority, self.rule.order

def __iter__(self):
return iter([self.left, self.right])
@@ -195,7 +195,7 @@ class ForestVisitor(object):
continue

if id(next_node) in visiting:
raise ParseError("Infinite recursion in grammar!")
raise ParseError("Infinite recursion in grammar, in rule '%s'!" % next_node.s.name)

input_stack.append(next_node)
continue
@@ -250,7 +250,7 @@ class ForestSumVisitor(ForestVisitor):
return iter(node.children)

def visit_packed_node_out(self, node):
priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options and node.rule.options.priority else 0
priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
priority += getattr(node.right, 'priority', 0)
priority += getattr(node.left, 'priority', 0)
node.priority = priority


+ 46 - 14  lark/parsers/grammar_analysis.py

@@ -1,4 +1,4 @@
from collections import Counter
from collections import Counter, defaultdict

from ..utils import bfs, fzset, classify
from ..exceptions import GrammarError
@@ -37,8 +37,22 @@ class RulePtr(object):
return hash((self.rule, self.index))


# state generation ensures no duplicate LR0ItemSets
class LR0ItemSet(object):
__slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')

def __init__(self, kernel, closure):
self.kernel = fzset(kernel)
self.closure = fzset(closure)
self.transitions = {}
self.lookaheads = defaultdict(set)

def __repr__(self):
return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))


def update_set(set1, set2):
if not set2:
if not set2 or set1 > set2:
return False

copy = set(set1)
@@ -85,6 +99,8 @@ def calculate_sets(rules):
if set(rule.expansion[:i]) <= NULLABLE:
if update_set(FIRST[rule.origin], FIRST[sym]):
changed = True
else:
break

# Calculate FOLLOW
changed = True
@@ -109,7 +125,10 @@ class GrammarAnalyzer(object):
def __init__(self, parser_conf, debug=False):
self.debug = debug

rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])]
root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
for start in parser_conf.start}

rules = parser_conf.rules + list(root_rules.values())
self.rules_by_origin = classify(rules, lambda r: r.origin)

if len(rules) != len(set(rules)):
@@ -121,17 +140,37 @@ class GrammarAnalyzer(object):
if not (sym.is_term or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation

self.start_state = self.expand_rule(NonTerminal('$root'))
self.start_states = {start: self.expand_rule(root_rule.origin)
for start, root_rule in root_rules.items()}

self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
for start, root_rule in root_rules.items()}

lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
for start in parser_conf.start}

lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
assert(len(lr0_rules) == len(set(lr0_rules)))

self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)

# cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
for start, root_rule in lr0_root_rules.items()}

self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)

def expand_rule(self, rule):
def expand_rule(self, source_rule, rules_by_origin=None):
"Returns all init_ptrs accessible by rule (recursive)"

if rules_by_origin is None:
rules_by_origin = self.rules_by_origin

init_ptrs = set()
def _expand_rule(rule):
assert not rule.is_term, rule

for r in self.rules_by_origin[rule]:
for r in rules_by_origin[rule]:
init_ptr = RulePtr(r, 0)
init_ptrs.add(init_ptr)

@@ -140,14 +179,7 @@ class GrammarAnalyzer(object):
if not new_r.is_term:
yield new_r

for _ in bfs([rule], _expand_rule):
for _ in bfs([source_rule], _expand_rule):
pass

return fzset(init_ptrs)

def _first(self, r):
if r.is_term:
return {r}
else:
return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}


+ 199 - 52  lark/parsers/lalr_analysis.py

@@ -7,12 +7,12 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
# Email : erezshin@gmail.com

import logging
from collections import defaultdict
from collections import defaultdict, deque

from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
from ..grammar import Rule

###{standalone
@@ -28,11 +28,12 @@ class Action:
Shift = Action('Shift')
Reduce = Action('Reduce')


class ParseTable:
def __init__(self, states, start_state, end_state):
def __init__(self, states, start_states, end_states):
self.states = states
self.start_state = start_state
self.end_state = end_state
self.start_states = start_states
self.end_states = end_states

def serialize(self, memo):
tokens = Enumerator()
@@ -47,8 +48,8 @@ class ParseTable:
return {
'tokens': tokens.reversed(),
'states': states,
'start_state': self.start_state,
'end_state': self.end_state,
'start_states': self.start_states,
'end_states': self.end_states,
}

@classmethod
@@ -59,7 +60,7 @@ class ParseTable:
for token, (action, arg) in actions.items()}
for state, actions in data['states'].items()
}
return cls(states, data['start_state'], data['end_state'])
return cls(states, data['start_states'], data['end_states'])


class IntParseTable(ParseTable):
@@ -76,66 +77,212 @@ class IntParseTable(ParseTable):
int_states[ state_to_idx[s] ] = la


start_state = state_to_idx[parse_table.start_state]
end_state = state_to_idx[parse_table.end_state]
return cls(int_states, start_state, end_state)
start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
return cls(int_states, start_states, end_states)

###}


# digraph and traverse, see The Theory and Practice of Compiler Writing

# computes F(x) = G(x) union (union { G(y) | x R y })
# X: nodes
# R: relation (function mapping node -> list of nodes that satisfy the relation)
# G: set valued function
def digraph(X, R, G):
F = {}
S = []
N = {}
for x in X:
N[x] = 0
for x in X:
# this is always true for the first iteration, but N[x] may be updated in traverse below
if N[x] == 0:
traverse(x, S, N, X, R, G, F)
return F

# x: single node
# S: stack
# N: weights
# X: nodes
# R: relation (see above)
# G: set valued function
# F: set valued function we are computing (map of input -> output)
def traverse(x, S, N, X, R, G, F):
S.append(x)
d = len(S)
N[x] = d
F[x] = G[x]
for y in R[x]:
if N[y] == 0:
traverse(y, S, N, X, R, G, F)
n_x = N[x]
assert(n_x > 0)
n_y = N[y]
assert(n_y != 0)
if (n_y > 0) and (n_y < n_x):
N[x] = n_y
F[x].update(F[y])
if N[x] == d:
f_x = F[x]
while True:
z = S.pop()
N[z] = -1
F[z] = f_x
if z == x:
break
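
A toy run of the digraph/traverse helpers above, only to illustrate the fixed point they compute (nodes and sets are arbitrary):

# 'a' and 'b' are related to each other, so both end up with G('a') | G('b');
# 'c' relates to nothing and keeps its own set. Note that the sets in G are
# updated in place, which is how the LALR analysis below consumes them.
X = ['a', 'b', 'c']
R = {'a': ['b'], 'b': ['a'], 'c': []}
G = {'a': {1}, 'b': {2}, 'c': {3}}

F = digraph(X, R, G)
assert F['a'] == F['b'] == {1, 2}
assert F['c'] == {3}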


class LALR_Analyzer(GrammarAnalyzer):
def __init__(self, parser_conf, debug=False):
GrammarAnalyzer.__init__(self, parser_conf, debug)
self.nonterminal_transitions = []
self.directly_reads = defaultdict(set)
self.reads = defaultdict(set)
self.includes = defaultdict(set)
self.lookback = defaultdict(set)


def compute_lookahead(self):
self.end_states = []
def compute_lr0_states(self):
self.lr0_states = set()
# map of kernels to LR0ItemSets
cache = {}

self.states = {}
def step(state):
lookahead = defaultdict(list)
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
for rp in sat:
for term in self.FOLLOW.get(rp.rule.origin, ()):
lookahead[term].append((Reduce, rp.rule))
_, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)

d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items():
rps = {rp.advance(sym) for rp in rps}
kernel = fzset({rp.advance(sym) for rp in rps})
new_state = cache.get(kernel, None)
if new_state is None:
closure = set(kernel)
for rp in kernel:
if not rp.is_satisfied and not rp.next.is_term:
closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
new_state = LR0ItemSet(kernel, closure)
cache[kernel] = new_state

state.transitions[sym] = new_state
yield new_state

for rp in set(rps):
if not rp.is_satisfied and not rp.next.is_term:
rps |= self.expand_rule(rp.next)
self.lr0_states.add(state)

new_state = fzset(rps)
lookahead[sym].append((Shift, new_state))
if sym == Terminal('$END'):
self.end_states.append( new_state )
yield new_state
for _ in bfs(self.lr0_start_states.values(), step):
pass

for k, v in lookahead.items():
if len(v) > 1:
def compute_reads_relations(self):
# handle start state
for root in self.lr0_start_states.values():
assert(len(root.kernel) == 1)
for rp in root.kernel:
assert(rp.index == 0)
self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])

for state in self.lr0_states:
seen = set()
for rp in state.closure:
if rp.is_satisfied:
continue
s = rp.next
# if s is not a nonterminal
if s not in self.lr0_rules_by_origin:
continue
if s in seen:
continue
seen.add(s)
nt = (state, s)
self.nonterminal_transitions.append(nt)
dr = self.directly_reads[nt]
r = self.reads[nt]
next_state = state.transitions[s]
for rp2 in next_state.closure:
if rp2.is_satisfied:
continue
s2 = rp2.next
# if s2 is a terminal
if s2 not in self.lr0_rules_by_origin:
dr.add(s2)
if s2 in self.NULLABLE:
r.add((next_state, s2))

def compute_includes_lookback(self):
for nt in self.nonterminal_transitions:
state, nonterminal = nt
includes = []
lookback = self.lookback[nt]
for rp in state.closure:
if rp.rule.origin != nonterminal:
continue
# traverse the states for rp(.rule)
state2 = state
for i in range(rp.index, len(rp.rule.expansion)):
s = rp.rule.expansion[i]
nt2 = (state2, s)
state2 = state2.transitions[s]
if nt2 not in self.reads:
continue
for j in range(i + 1, len(rp.rule.expansion)):
if not rp.rule.expansion[j] in self.NULLABLE:
break
else:
includes.append(nt2)
# state2 is at the final state for rp.rule
if rp.index == 0:
for rp2 in state2.closure:
if (rp2.rule == rp.rule) and rp2.is_satisfied:
lookback.add((state2, rp2.rule))
for nt2 in includes:
self.includes[nt2].add(nt)

def compute_lookaheads(self):
read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads)
follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets)

for nt, lookbacks in self.lookback.items():
for state, rule in lookbacks:
for s in follow_sets[nt]:
state.lookaheads[s].add(rule)

def compute_lalr1_states(self):
m = {}
for state in self.lr0_states:
actions = {}
for la, next_state in state.transitions.items():
actions[la] = (Shift, next_state.closure)
for la, rules in state.lookaheads.items():
if len(rules) > 1:
raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ])))
if la in actions:
if self.debug:
logging.warn("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name)
for act, arg in v:
logging.warn(' * %s: %s', act, arg)
for x in v:
# XXX resolving shift/reduce into shift, like PLY
# Give a proper warning
if x[0] is Shift:
lookahead[k] = [x]

for k, v in lookahead.items():
if not len(v) == 1:
raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v])))

self.states[state] = {k.name:v[0] for k, v in lookahead.items()}

for _ in bfs([self.start_state], step):
pass
logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logging.warning(' * %s', list(rules)[0])
else:
actions[la] = (Reduce, list(rules)[0])
m[state] = { k.name: v for k, v in actions.items() }

self.end_state ,= self.end_states
states = { k.closure: v for k, v in m.items() }

self._parse_table = ParseTable(self.states, self.start_state, self.end_state)
# compute end states
end_states = {}
for state in states:
for rp in state:
for start in self.lr0_start_states:
if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
assert(not start in end_states)
end_states[start] = state

_parse_table = ParseTable(states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states)

if self.debug:
self.parse_table = self._parse_table
self.parse_table = _parse_table
else:
self.parse_table = IntParseTable.from_ParseTable(self._parse_table)

self.parse_table = IntParseTable.from_ParseTable(_parse_table)

def compute_lalr(self):
self.compute_lr0_states()
self.compute_reads_relations()
self.compute_includes_lookback()
self.compute_lookaheads()
self.compute_lalr1_states()

+ 16 - 16  lark/parsers/lalr_parser.py

@@ -6,16 +6,15 @@ from ..exceptions import UnexpectedToken
from ..lexer import Token
from ..utils import Enumerator, Serialize

from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable
from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable


###{standalone
class LALR_Parser(object):
def __init__(self, parser_conf, debug=False):
assert all(r.options is None or r.options.priority is None
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
analysis = LALR_Analyzer(parser_conf, debug=debug)
analysis.compute_lookahead()
analysis.compute_lalr()
callbacks = parser_conf.callbacks

self._parse_table = analysis.parse_table
@@ -39,19 +38,22 @@ class LALR_Parser(object):
class _Parser:
def __init__(self, parse_table, callbacks):
self.states = parse_table.states
self.start_state = parse_table.start_state
self.end_state = parse_table.end_state
self.start_states = parse_table.start_states
self.end_states = parse_table.end_states
self.callbacks = callbacks

def parse(self, seq, set_state=None):
def parse(self, seq, start, set_state=None):
token = None
stream = iter(seq)
states = self.states

state_stack = [self.start_state]
start_state = self.start_states[start]
end_state = self.end_states[start]

state_stack = [start_state]
value_stack = []

if set_state: set_state(self.start_state)
if set_state: set_state(start_state)

def get_action(token):
state = state_stack[-1]
@@ -81,7 +83,7 @@ class _Parser:
for token in stream:
while True:
action, arg = get_action(token)
assert arg != self.end_state
assert arg != end_state

if action is Shift:
state_stack.append(arg)
@@ -94,11 +96,9 @@ class _Parser:
token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
while True:
_action, arg = get_action(token)
if _action is Shift:
assert arg == self.end_state
val ,= value_stack
return val
else:
reduce(arg)
assert(_action is Reduce)
reduce(arg)
if state_stack[-1] == end_state:
return value_stack[-1]

###}

+ 4 - 3  lark/parsers/xearley.py

@@ -24,8 +24,8 @@ from .earley_forest import SymbolNode


class Parser(BaseParser):
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False):
BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity)
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False):
BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug)
self.ignore = [Terminal(t) for t in ignore]
self.complete_lex = complete_lex

@@ -146,4 +146,5 @@ class Parser(BaseParser):
self.predict_and_complete(i, to_scan, columns, transitives)

## Column is now the final column in the parse.
assert i == len(columns)-1
return to_scan

+ 47 - 12  lark/reconstruct.py

@@ -19,13 +19,15 @@ def is_iter_empty(i):
except StopIteration:
return True


class WriteTokensTransformer(Transformer_InPlace):
def __init__(self, tokens):
"Inserts discarded tokens into their correct place, according to the rules of grammar"

def __init__(self, tokens, term_subs):
self.tokens = tokens
self.term_subs = term_subs

def __default__(self, data, children, meta):
# if not isinstance(t, MatchTree):
# return t
if not getattr(meta, 'match_tree', False):
return Tree(data, children)

@@ -33,10 +35,15 @@ class WriteTokensTransformer(Transformer_InPlace):
to_write = []
for sym in meta.orig_expansion:
if is_discarded_terminal(sym):
t = self.tokens[sym.name]
if not isinstance(t.pattern, PatternStr):
raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)
to_write.append(t.pattern.value)
try:
v = self.term_subs[sym.name](sym)
except KeyError:
t = self.tokens[sym.name]
if not isinstance(t.pattern, PatternStr):
raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)

v = t.pattern.value
to_write.append(v)
else:
x = next(iter_args)
if isinstance(x, list):
@@ -66,19 +73,39 @@ class MakeMatchTree:
t.meta.orig_expansion = self.expansion
return t

def best_from_group(seq, group_key, cmp_key):
d = {}
for item in seq:
key = group_key(item)
if key in d:
v1 = cmp_key(item)
v2 = cmp_key(d[key])
if v2 > v1:
d[key] = item
else:
d[key] = item
return list(d.values())

class Reconstructor:
def __init__(self, parser):
def __init__(self, parser, term_subs={}):
# XXX TODO calling compile twice returns different results!
tokens, rules, _grammar_extra = parser.grammar.compile()
assert parser.options.maybe_placeholders == False
tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)

self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens})
self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs)
self.rules = list(self._build_recons_rules(rules))
self.rules.reverse()

# Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))

self.rules.sort(key=lambda r: len(r.expansion))
callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias?
self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start),
self._match, resolve_ambiguity=True)

def _build_recons_rules(self, rules):
expand1s = {r.origin for r in rules if r.options and r.options.expand1}
expand1s = {r.origin for r in rules if r.options.expand1}

aliases = defaultdict(list)
for r in rules:
@@ -126,4 +153,12 @@ class Reconstructor:
yield item

def reconstruct(self, tree):
return ''.join(self._reconstruct(tree))
x = self._reconstruct(tree)
y = []
prev_item = ''
for item in x:
if prev_item and item and prev_item[-1].isalnum() and item[0].isalnum():
y.append(' ')
y.append(item)
prev_item = item
return ''.join(y)
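
A small usage sketch of the Reconstructor after these changes (grammar and input are illustrative; note the maybe_placeholders == False requirement asserted above, and that term_subs, if given, maps a discarded terminal name to a callable producing its text):

from lark import Lark
from lark.reconstruct import Reconstructor

parser = Lark(r'''
    start: word ("," word)*
    word: /\w+/
    %import common.WS
    %ignore WS
''', maybe_placeholders=False)

tree = parser.parse("hello , world,  again")
# Ignored whitespace is lost, the discarded "," terminals are re-inserted by
# WriteTokensTransformer, and a space is added only between adjacent alphanumerics.
print(Reconstructor(parser).reconstruct(tree))   # roughly: hello,world,again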

+ 2 - 2  lark/tools/nearley.py

@@ -18,7 +18,7 @@ nearley_grammar = r"""

expansion: expr+ js

?expr: item [":" /[+*?]/]
?expr: item (":" /[+*?]/)?

?item: rule|string|regexp|null
| "(" expansions ")"
@@ -167,7 +167,7 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
emit(" __default__ = lambda self, n, c, m: c if c else None")

emit()
emit('parser = Lark(grammar, start="n_%s")' % start)
emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start)
emit('def parse(text):')
emit(' return TransformNearley().transform(parser.parse(text))')



+ 39 - 0  lark/tools/serialize.py

@@ -0,0 +1,39 @@
import codecs
import sys
import json

from lark import Lark
from lark.grammar import RuleOptions, Rule
from lark.lexer import TerminalDef

import argparse

argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #description='''Lark Serialization Tool -- Stores Lark's internal state & LALR analysis as a convenient JSON file''')

argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file')
argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)')
argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")', nargs='+')
argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")')


def serialize(infile, outfile, lexer, start):
lark_inst = Lark(infile, parser="lalr", lexer=lexer, start=start) # TODO contextual

data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
outfile.write('{\n')
outfile.write(' "data": %s,\n' % json.dumps(data))
outfile.write(' "memo": %s\n' % json.dumps(memo))
outfile.write('}\n')


def main():
if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv:
print("Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file")
print("")
argparser.print_help()
else:
args = argparser.parse_args()
serialize(args.grammar_file, args.out, args.lexer, args.start)

if __name__ == '__main__':
main()
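
An assumed invocation of the new tool, plus a rough Python equivalent of what serialize() does (file names are hypothetical):

# Shell: python -m lark.tools.serialize my_grammar.lark -o my_grammar.json
from lark import Lark
from lark.grammar import Rule
from lark.lexer import TerminalDef

lark_inst = Lark(open('my_grammar.lark'), parser='lalr', start='start')
data, memo = lark_inst.memo_serialize([TerminalDef, Rule])   # the same call the tool writes out as JSON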

+ 3 - 0  lark/tools/standalone.py

@@ -34,6 +34,9 @@
# See <http://www.gnu.org/licenses/>.
#
#

import os
from io import open
###}

import pprint


+ 31 - 28  lark/tree.py

@@ -56,30 +56,6 @@ class Tree(object):

def __hash__(self):
return hash((self.data, tuple(self.children)))
###}

def expand_kids_by_index(self, *indices):
"Expand (inline) children at the given indices"
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
kid = self.children[i]
self.children[i:i+1] = kid.children

def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
return filter(pred, self.iter_subtrees())

def find_data(self, data):
"Find all nodes where tree.data == data"
return self.find_pred(lambda t: t.data == data)

def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c

def iter_subtrees(self):
# TODO: Re-write as a more efficient version
@@ -102,6 +78,31 @@ class Tree(object):
yield x
seen.add(id(x))

def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
return filter(pred, self.iter_subtrees())

def find_data(self, data):
"Find all nodes where tree.data == data"
return self.find_pred(lambda t: t.data == data)

###}

def expand_kids_by_index(self, *indices):
"Expand (inline) children at the given indices"
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
kid = self.children[i]
self.children[i:i+1] = kid.children

def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c

def iter_subtrees_topdown(self):
stack = [self]
while stack:
@@ -141,17 +142,19 @@ class SlottedTree(Tree):
__slots__ = 'data', 'children', 'rule', '_meta'


def pydot__tree_to_png(tree, filename, rankdir="LR"):
def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs):
"""Creates a colorful image that represents the tree (data+children, without meta)

Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to
directed graphs drawn from top to bottom, from left to right, from bottom to
top, and from right to left, respectively. See:
https://www.graphviz.org/doc/info/attrs.html#k:rankdir
top, and from right to left, respectively.

`kwargs` can be any graph attribute (e. g. `dpi=200`). For a list of
possible attributes, see https://www.graphviz.org/doc/info/attrs.html.
"""

import pydot
graph = pydot.Dot(graph_type='digraph', rankdir=rankdir)
graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs)

i = [0]
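
A hedged usage sketch of the new **kwargs pass-through (requires pydot; dpi is just one example Graphviz attribute):

from lark import Lark
from lark.tree import pydot__tree_to_png

tree = Lark('!start: "a" "b" "c"').parse("abc")
# rankdir behaves as before; dpi=200 is forwarded to pydot.Dot via **kwargs.
pydot__tree_to_png(tree, "tree.png", rankdir="TB", dpi=200)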



+ 28 - 2  lark/utils.py

@@ -1,4 +1,5 @@
import sys
from ast import literal_eval
from collections import deque

class fzset(frozenset):
@@ -160,7 +161,7 @@ def smart_decorator(f, create_decorator):

elif isinstance(f, partial):
# wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
return create_decorator(f.__func__, True)
return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))

else:
return create_decorator(f.__func__.__call__, True)
@@ -172,7 +173,7 @@ import sre_parse
import sre_constants
def get_regexp_width(regexp):
try:
return sre_parse.parse(regexp).getwidth()
return [int(x) for x in sre_parse.parse(regexp).getwidth()]
except sre_constants.error:
raise ValueError(regexp)

@@ -239,3 +240,28 @@ class Enumerator(Serialize):
assert len(r) == len(self.enums)
return r


def eval_escaping(s):
w = ''
i = iter(s)
for n in i:
w += n
if n == '\\':
try:
n2 = next(i)
except StopIteration:
raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)
if n2 == '\\':
w += '\\\\'
elif n2 not in 'uxnftr':
w += '\\'
w += n2
w = w.replace('\\"', '"').replace("'", "\\'")

to_eval = "u'''%s'''" % w
try:
s = literal_eval(to_eval)
except SyntaxError as e:
raise ValueError(s, e)

return s
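
Two quick illustrations of eval_escaping's behaviour (inputs chosen for the example):

from lark.utils import eval_escaping

# Escapes Python understands (here \n) are evaluated...
assert eval_escaping(r'a\nb') == 'a\nb'
# ...while unknown escapes such as \d keep their backslash, so regexp bodies survive.
assert eval_escaping(r'a\d+') == r'a\d+'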

+ 113 - 43  lark/visitors.py

@@ -3,6 +3,7 @@ from functools import wraps
from .utils import smart_decorator
from .tree import Tree
from .exceptions import VisitError, GrammarError
from .lexer import Token

###{standalone
from inspect import getmembers, getmro
@@ -12,7 +13,31 @@ class Discard(Exception):

# Transformers

class Transformer:
class _Decoratable:
@classmethod
def _apply_decorator(cls, decorator, **kwargs):
mro = getmro(cls)
assert mro[0] is cls
libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
for name, value in getmembers(cls):

# Make sure the function isn't inherited (unless it's overwritten)
if name.startswith('_') or (name in libmembers and name not in cls.__dict__):
continue
if not callable(cls.__dict__[name]):
continue

# Skip if v_args already applied (at the function level)
if hasattr(cls.__dict__[name], 'vargs_applied'):
continue

static = isinstance(cls.__dict__[name], (staticmethod, classmethod))
setattr(cls, name, decorator(value, static=static, **kwargs))
return cls



class Transformer(_Decoratable):
"""Visits the tree recursively, starting with the leaves and finally the root (bottom-up)

Calls its methods (provided by user via inheritance) according to tree.data
@@ -21,6 +46,10 @@ class Transformer:
Can be used to implement map or reduce.
"""

__visit_tokens__ = True # For backwards compatibility
def __init__(self, visit_tokens=True):
self.__visit_tokens__ = visit_tokens

def _call_userfunc(self, tree, new_children=None):
# Assumes tree is already transformed
children = new_children if new_children is not None else tree.children
@@ -30,25 +59,39 @@ class Transformer:
return self.__default__(tree.data, children, tree.meta)
else:
try:
if getattr(f, 'meta', False):
return f(children, tree.meta)
elif getattr(f, 'inline', False):
return f(*children)
elif getattr(f, 'whole_tree', False):
if new_children is not None:
raise NotImplementedError("Doesn't work with the base Transformer class")
return f(tree)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
return f.visit_wrapper(f, tree.data, children, tree.meta)
else:
return f(children)
except (GrammarError, Discard):
raise
except Exception as e:
raise VisitError(tree, e)
raise VisitError(tree.data, tree, e)

def _call_userfunc_token(self, token):
try:
f = getattr(self, token.type)
except AttributeError:
return self.__default_token__(token)
else:
try:
return f(token)
except (GrammarError, Discard):
raise
except Exception as e:
raise VisitError(token.type, token, e)


def _transform_children(self, children):
for c in children:
try:
yield self._transform_tree(c) if isinstance(c, Tree) else c
if isinstance(c, Tree):
yield self._transform_tree(c)
elif self.__visit_tokens__ and isinstance(c, Token):
yield self._call_userfunc_token(c)
else:
yield c
except Discard:
pass

@@ -66,26 +109,10 @@ class Transformer:
"Default operation on tree (for override)"
return Tree(data, children, meta)

@classmethod
def _apply_decorator(cls, decorator, **kwargs):
mro = getmro(cls)
assert mro[0] is cls
libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
for name, value in getmembers(cls):

# Make sure the function isn't inherited (unless it's overwritten)
if name.startswith('_') or (name in libmembers and name not in cls.__dict__):
continue
if not callable(cls.__dict__[name]):
continue

# Skip if v_args already applied (at the function level)
if hasattr(cls.__dict__[name], 'vargs_applied'):
continue
def __default_token__(self, token):
"Default operation on token (for override)"
return token

static = isinstance(cls.__dict__[name], (staticmethod, classmethod))
setattr(cls, name, decorator(value, static=static, **kwargs))
return cls


class InlineTransformer(Transformer): # XXX Deprecated
@@ -157,6 +184,11 @@ class Visitor(VisitorBase):
self._call_userfunc(subtree)
return tree

def visit_topdown(self,tree):
for subtree in tree.iter_subtrees_topdown():
self._call_userfunc(subtree)
return tree

class Visitor_Recursive(VisitorBase):
"""Bottom-up visitor, recursive

@@ -169,8 +201,16 @@ class Visitor_Recursive(VisitorBase):
if isinstance(child, Tree):
self.visit(child)

f = getattr(self, tree.data, self.__default__)
f(tree)
self._call_userfunc(tree)
return tree

def visit_topdown(self,tree):
self._call_userfunc(tree)

for child in tree.children:
if isinstance(child, Tree):
self.visit_topdown(child)

return tree


@@ -184,7 +224,7 @@ def visit_children_decor(func):
return inner


class Interpreter:
class Interpreter(_Decoratable):
"""Top-down visitor, recursive

Visits the tree, starting with the root and finally the leaves (top-down)
@@ -193,8 +233,14 @@ class Interpreter:
Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
The user has to explicitly call visit_children, or use the @visit_children_decor
"""

def visit(self, tree):
return getattr(self, tree.data)(tree)
f = getattr(self, tree.data)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
return f.visit_wrapper(f, tree.data, tree.children, tree.meta)
else:
return f(tree)

def visit_children(self, tree):
return [self.visit(child) if isinstance(child, Tree) else child
@@ -240,8 +286,7 @@ def inline_args(obj): # XXX Deprecated



def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False):
assert [whole_tree, meta, inline].count(True) <= 1
def _visitor_args_func_dec(func, visit_wrapper=None, static=False):
def create_decorator(_f, with_self):
if with_self:
def f(self, *args, **kwargs):
@@ -256,17 +301,42 @@ def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, sta
else:
f = smart_decorator(func, create_decorator)
f.vargs_applied = True
f.inline = inline
f.meta = meta
f.whole_tree = whole_tree
f.visit_wrapper = visit_wrapper
return f

def v_args(inline=False, meta=False, tree=False):

def _vargs_inline(f, data, children, meta):
return f(*children)
def _vargs_meta_inline(f, data, children, meta):
return f(meta, *children)
def _vargs_meta(f, data, children, meta):
return f(children, meta) # TODO swap these for consistency? Backwards incompatible!
def _vargs_tree(f, data, children, meta):
return f(Tree(data, children, meta))

def v_args(inline=False, meta=False, tree=False, wrapper=None):
"A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
if [tree, meta, inline].count(True) > 1:
raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. These cannot be combined.")
if tree and (meta or inline):
raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.")

func = None
if meta:
if inline:
func = _vargs_meta_inline
else:
func = _vargs_meta
elif inline:
func = _vargs_inline
elif tree:
func = _vargs_tree

if wrapper is not None:
if func is not None:
raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.")
func = wrapper

def _visitor_args_dec(obj):
return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree)
return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func)
return _visitor_args_dec
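
A minimal sketch of the decorator forms routed through visit_wrapper above (grammar and transformer are illustrative):

from lark import Lark, Transformer, v_args

@v_args(inline=True)    # children become positional arguments (_vargs_inline above)
class Calc(Transformer):
    def add(self, left, right):
        return int(left) + int(right)

    def start(self, value):
        return value

parser = Lark(r'''
    start: add
    add: NUMBER "+" NUMBER
    NUMBER: /[0-9]+/
    %import common.WS
    %ignore WS
''')

assert Calc().transform(parser.parse("1 + 2")) == 3
# v_args(meta=True), v_args(tree=True) and v_args(wrapper=...) select the other wrappers defined above.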




+ 1 - 0  mkdocs.yml

@@ -9,5 +9,6 @@ pages:
- How To Develop (Guide): how_to_develop.md
- Grammar Reference: grammar.md
- Tree Construction Reference: tree_construction.md
- Visitors and Transformers: visitors.md
- Classes Reference: classes.md
- Recipes: recipes.md

+ 10 - 0  readthedocs.yml

@@ -0,0 +1,10 @@
version: 2

mkdocs:
configuration: mkdocs.yml
fail_on_warning: false

formats: all

python:
version: 3.5

+ 2 - 1  tests/__main__.py

@@ -10,7 +10,7 @@ from .test_reconstructor import TestReconstructor
try:
from .test_nearley.test_nearley import TestNearley
except ImportError:
pass
logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)")

# from .test_selectors import TestSelectors
# from .test_grammars import TestPythonG, TestConfigG
@@ -21,6 +21,7 @@ from .test_parser import (
TestCykStandard,
TestLalrContextual,
TestEarleyDynamic,
TestLalrCustom,

# TestFullEarleyStandard,
TestFullEarleyDynamic,


+ 1 - 0  tests/grammars/test_unicode.lark

@@ -0,0 +1 @@
UNICODE : /[a-zØ-öø-ÿ]/

+ 4 - 1  tests/test_nearley/test_nearley.py

@@ -15,9 +15,12 @@ NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley')
BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin')

if not os.path.exists(NEARLEY_PATH):
print("Skipping Nearley tests!")
logging.warn("Nearley not installed. Skipping Nearley tests!")
raise ImportError("Skipping Nearley tests!")

import js2py # Ensures that js2py exists, to avoid failing tests


class TestNearley(unittest.TestCase):
def test_css(self):
fn = os.path.join(NEARLEY_PATH, 'examples/csscolor.ne')


+ 239 - 6  tests/test_parser.py

@@ -5,6 +5,7 @@ import unittest
import logging
import os
import sys
from copy import deepcopy
try:
from cStringIO import StringIO as cStringIO
except ImportError:
@@ -20,9 +21,9 @@ logging.basicConfig(level=logging.INFO)
from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from lark.tree import Tree
from lark.visitors import Transformer
from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule
from lark.lexer import TerminalDef
from lark.lexer import TerminalDef, Lexer, TraditionalLexer

__path__ = os.path.dirname(__file__)
def _read(n, *args):
@@ -62,6 +63,14 @@ class TestParsers(unittest.TestCase):
r = g.parse('a')
self.assertEqual( r.children[0].meta.line, 1 )

g = Lark("""start: x
x: a
a: "a"
""", propagate_positions=True)

r = g.parse('a')
self.assertEqual( r.children[0].meta.line, 1 )

def test_expand1(self):

g = Lark("""start: a
@@ -94,6 +103,98 @@ class TestParsers(unittest.TestCase):
r = g.parse('xx')
self.assertEqual( r.children[0].data, "c" )

def test_comment_in_rule_definition(self):
g = Lark("""start: a
a: "a"
// A comment
// Another comment
| "b"
// Still more

c: "unrelated"
""")
r = g.parse('b')
self.assertEqual( r.children[0].data, "a" )

def test_visit_tokens(self):
class T(Transformer):
def a(self, children):
return children[0] + "!"
def A(self, tok):
return tok.update(value=tok.upper())

# Test regular
g = """start: a
a : A
A: "x"
"""
p = Lark(g, parser='lalr')
r = T(False).transform(p.parse("x"))
self.assertEqual( r.children, ["x!"] )
r = T().transform(p.parse("x"))
self.assertEqual( r.children, ["X!"] )

# Test internal transformer
p = Lark(g, parser='lalr', transformer=T())
r = p.parse("x")
self.assertEqual( r.children, ["X!"] )

def test_vargs_meta(self):

@v_args(meta=True)
class T1(Transformer):
def a(self, children, meta):
assert not children
return meta.line

def start(self, children, meta):
return children

@v_args(meta=True, inline=True)
class T2(Transformer):
def a(self, meta):
return meta.line

def start(self, meta, *res):
return list(res)

for T in (T1, T2):
for internal in [False, True]:
try:
g = Lark(r"""start: a+
a : "x" _NL?
_NL: /\n/+
""", parser='lalr', transformer=T() if internal else None, propagate_positions=True)
except NotImplementedError:
assert internal
continue

res = g.parse("xx\nx\nxxx\n\n\nxx")
assert not internal
res = T().transform(res)

self.assertEqual(res, [1, 1, 2, 3, 3, 3, 6, 6])

def test_vargs_tree(self):
tree = Lark('''
start: a a a
!a: "A"
''').parse('AAA')
tree_copy = deepcopy(tree)

@v_args(tree=True)
class T(Transformer):
def a(self, tree):
return 1
def start(self, tree):
return tree.children

res = T().transform(tree)
self.assertEqual(res, [1, 1, 1])
self.assertEqual(tree, tree_copy)



def test_embedded_transformer(self):
class T(Transformer):
def a(self, children):
@@ -150,6 +251,51 @@ class TestParsers(unittest.TestCase):
r = g.parse("xx")
self.assertEqual( r.children, ["<c>"] )

def test_embedded_transformer_inplace(self):
@v_args(tree=True)
class T1(Transformer_InPlace):
def a(self, tree):
assert isinstance(tree, Tree), tree
tree.children.append("tested")
return tree

def b(self, tree):
return Tree(tree.data, tree.children + ['tested2'])

@v_args(tree=True)
class T2(Transformer):
def a(self, tree):
assert isinstance(tree, Tree), tree
tree.children.append("tested")
return tree

def b(self, tree):
return Tree(tree.data, tree.children + ['tested2'])

class T3(Transformer):
@v_args(tree=True)
def a(self, tree):
assert isinstance(tree, Tree)
tree.children.append("tested")
return tree

@v_args(tree=True)
def b(self, tree):
return Tree(tree.data, tree.children + ['tested2'])

for t in [T1(), T2(), T3()]:
for internal in [False, True]:
g = Lark("""start: a b
a : "x"
b : "y"
""", parser='lalr', transformer=t if internal else None)
r = g.parse("xy")
if not internal:
r = t.transform(r)

a, b = r.children
self.assertEqual(a.children, ["tested"])
self.assertEqual(b.children, ["tested2"])

def test_alias(self):
Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
@@ -386,12 +532,22 @@ def _make_full_earley_test(LEXER):
_TestFullEarley.__name__ = _NAME
globals()[_NAME] = _TestFullEarley

class CustomLexer(Lexer):
"""
The purpose of this custom lexer is to test the integration,
so it simply delegates to TraditionalLexer without any custom lexing behaviour.
"""
def __init__(self, lexer_conf):
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)

def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs):
return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
class _TestParser(unittest.TestCase):
def test_basic1(self):
g = _Lark("""start: a+ b a* "b" a*
@@ -890,7 +1046,7 @@ def _make_parser_test(LEXER, PARSER):

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self):
g = """!start: [["A"]]
g = """!start: ("A"?)?
"""
l = _Lark(g)
tree = l.parse('A')
@@ -984,6 +1140,32 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(res.children, ['ab'])


grammar = """
start: A B | AB
A: "a"
B.-20: "b"
AB.-10: "ab"
"""
l = _Lark(grammar)
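# The higher-priority A (default 0) should beat the longer but lower-priority AB (-10), splitting the input into 'a' 'b'.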
res = l.parse("ab")
self.assertEqual(res.children, ['a', 'b'])


grammar = """
start: A B | AB
A.-99999999999999999999999: "a"
B: "b"
AB: "ab"
"""
l = _Lark(grammar)
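# With A given a hugely negative priority, AB (default 0) should win; this also checks that arbitrarily large priority values are accepted.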
res = l.parse("ab")

self.assertEqual(res.children, ['ab'])






def test_import(self):
grammar = """
@@ -1021,6 +1203,12 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(x.children, ['12', 'lions'])


def test_relative_import_unicode(self):
l = _Lark_open('test_relative_import_unicode.lark', rel_to=__file__)
x = l.parse(u'Ø')
self.assertEqual(x.children, [u'Ø'])


def test_relative_import_rename(self):
l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__)
x = l.parse('12 lions')
@@ -1448,7 +1636,20 @@ def _make_parser_test(LEXER, PARSER):

parser.parse(r'"That" "And a \"b"')

@unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")

def test_meddling_unused(self):
"Unless 'unused' is removed, LALR analysis will fail with a reduce/reduce collision"

grammar = """
start: EKS* x
x: EKS
unused: x*
EKS: "x"
"""
parser = _Lark(grammar)
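# If 'unused' were not discarded as unreachable from 'start', the x/EKS reductions would collide in the LALR tables.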


@unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
def test_serialize(self):
grammar = """
start: _ANY b "C"
@@ -1465,6 +1666,37 @@ def _make_parser_test(LEXER, PARSER):
parser3 = Lark.deserialize(d, namespace, m)
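# The deserialized parser should produce the same tree as the original.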
self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) )

def test_multi_start(self):
parser = _Lark('''
a: "x" "a"?
b: "x" "b"?
''', start=['a', 'b'])
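# With several start symbols, the one to parse against is selected per parse() call.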

self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))

def test_lexer_detect_newline_tokens(self):
# Detect newlines in regular tokens
g = _Lark(r"""start: "go" tail*
!tail : SA "@" | SB "@" | SC "@" | SD "@"
SA : "a" /\n/
SB : /b./s
SC : "c" /[^a-z]/
SD : "d" /\s/
""")
a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
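# .line must account for newlines matched inside a terminal's own pattern, not only in ignored tokens.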
self.assertEqual(a.line, 2)
self.assertEqual(b.line, 3)
self.assertEqual(c.line, 4)
self.assertEqual(d.line, 5)

# Detect newlines in ignored tokens
for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
g = _Lark('''!start: "a" "a"
%ignore {}'''.format(re))
a, b = g.parse('a\na').children
self.assertEqual(a.line, 1)
self.assertEqual(b.line, 2)


_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
@@ -1479,6 +1711,7 @@ _TO_TEST = [
('dynamic_complete', 'earley'),
('standard', 'lalr'),
('contextual', 'lalr'),
('custom', 'lalr'),
# (None, 'earley'),
]



+ 2
- 2
tests/test_reconstructor.py View File

@@ -16,7 +16,7 @@ def _remove_ws(s):
class TestReconstructor(TestCase):

def assert_reconstruct(self, grammar, code):
parser = Lark(grammar, parser='lalr')
parser = Lark(grammar, parser='lalr', maybe_placeholders=False)
tree = parser.parse(code)
new = Reconstructor(parser).reconstruct(tree)
self.assertEqual(_remove_ws(code), _remove_ws(new))
@@ -105,7 +105,7 @@ class TestReconstructor(TestCase):
%ignore WS
"""

json_parser = Lark(json_grammar, parser='lalr')
json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
tree = json_parser.parse(test_json)

new_json = Reconstructor(json_parser).reconstruct(tree)


+ 3
- 0
tests/test_relative_import_unicode.lark View File

@@ -0,0 +1,3 @@
start: UNICODE

%import .grammars.test_unicode.UNICODE

+ 8
- 15
tests/test_tools.py View File

@@ -1,11 +1,9 @@
from __future__ import absolute_import

import sys
import unittest
from unittest import TestCase
from unittest import TestCase, main

from lark.tree import Tree

from lark.tools import standalone

try:
@@ -49,6 +47,8 @@ class TestStandalone(TestCase):
l = _Lark()
x = l.parse('12 elephants')
self.assertEqual(x.children, ['12', 'elephants'])
x = l.parse('16 candles')
self.assertEqual(x.children, ['16', 'candles'])

def test_contextual(self):
grammar = """
@@ -92,26 +92,19 @@ class TestStandalone(TestCase):
_NEWLINE: /\n/
"""

# from lark import Lark
# l = Lark(grammar, parser='lalr', lexer='contextual', postlex=MyIndenter())
# x = l.parse('(\n)\n')
# print('@@', x)


context = self._create_standalone(grammar)
_Lark = context['Lark_StandAlone']

# l = _Lark(postlex=MyIndenter())
# x = l.parse('()\n')
# print(x)
l = _Lark(postlex=MyIndenter())
x = l.parse('()\n')
self.assertEqual(x, Tree('start', []))
l = _Lark(postlex=MyIndenter())
x = l.parse('(\n)\n')
self.assertEqual(x, Tree('start', []))



if __name__ == '__main__':
unittest.main()
main()



+ 55
- 1
tests/test_trees.py View File

@@ -4,9 +4,10 @@ import unittest
from unittest import TestCase
import copy
import pickle
import functools

from lark.tree import Tree
from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard
from lark.visitors import Visitor, Visitor_Recursive, Transformer, Interpreter, visit_children_decor, v_args, Discard


class TestTrees(TestCase):
@@ -33,6 +34,43 @@ class TestTrees(TestCase):
nodes = list(self.tree1.iter_subtrees_topdown())
self.assertEqual(nodes, expected)

def test_visitor(self):
class Visitor1(Visitor):
def __init__(self):
self.nodes = []

def __default__(self, tree):
self.nodes.append(tree)

class Visitor1_Recursive(Visitor_Recursive):
def __init__(self):
self.nodes = []

def __default__(self, tree):
self.nodes.append(tree)

visitor1 = Visitor1()
visitor1_recursive = Visitor1_Recursive()

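# Both Visitor flavours should report nodes bottom-up from visit() and parents-first from visit_topdown().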
expected_top_down = [Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')]),
Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')]
expected_bottom_up = [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z'),
Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')])]

visitor1.visit(self.tree1)
self.assertEqual(visitor1.nodes, expected_bottom_up)

visitor1_recursive.visit(self.tree1)
self.assertEqual(visitor1_recursive.nodes, expected_bottom_up)

visitor1.nodes = []
visitor1_recursive.nodes = []

visitor1.visit_topdown(self.tree1)
self.assertEqual(visitor1.nodes, expected_top_down)

visitor1_recursive.visit_topdown(self.tree1)
self.assertEqual(visitor1_recursive.nodes, expected_top_down)

def test_interp(self):
t = Tree('a', [Tree('b', []), Tree('c', []), 'd'])

@@ -146,6 +184,22 @@ class TestTrees(TestCase):
res = T().transform(t)
self.assertEqual(res, 2.9)

def test_partial(self):

tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])])

def test(prefix, s, postfix):
return prefix + s.upper() + postfix

@v_args(inline=True)
class T(Transformer):
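# functools.partial objects are expected to work as rule callbacks just like plain methods.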
a = functools.partial(test, "@", postfix="!")
b = functools.partial(lambda s: s + "!")

res = T().transform(tree)
assert res.children == ["@TEST1!", "test2!"]


def test_discard(self):
class MyTransformer(Transformer):
def a(self, args):

