
Merge branch 'evalable_repr' into master

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
TG-Techie, 4 years ago, committed by GitHub
commit 9cebf44091
100 changed files with 7844 additions and 1428 deletions
  1. +1 -1  .github/workflows/tests.yml
  2. +2 -0  .gitignore
  3. +14 -32  README.md
  4. +20 -0  docs/Makefile
  5. +0 -0  docs/_static/comparison_memory.png
  6. +0 -0  docs/_static/comparison_runtime.png
  7. +0 -0  docs/_static/lark_cheatsheet.pdf
  8. +0 -185  docs/classes.md
  9. +70 -0  docs/classes.rst
  10. +184 -0  docs/conf.py
  11. +9 -3  docs/features.md
  12. +41 -12  docs/grammar.md
  13. +7 -6  docs/how_to_use.md
  14. +101 -0  docs/ide/app.html
  15. +105 -0  docs/ide/app.js
  16. +83 -0  docs/ide/app/app.py
  17. +3152 -0  docs/ide/app/core.py
  18. +150 -0  docs/ide/app/examples.py
  19. +475 -0  docs/ide/app/ext.py
  20. +9 -0  docs/ide/app/files.json
  21. +6 -0  docs/ide/app/html5.py
  22. +186 -0  docs/ide/app/ignite.py
  23. +101 -0  docs/ide/app/utils.py
  24. BIN  docs/ide/is-loading.gif
  25. BIN  docs/ide/lark-logo.png
  26. +0 -53  docs/index.md
  27. +113 -0  docs/index.rst
  28. +1 -2  docs/json_tutorial.md
  29. +36 -0  docs/make.bat
  30. +47 -0  docs/nearley.md
  31. +5 -5  docs/parsers.md
  32. +2 -2  docs/philosophy.md
  33. +1 -1  docs/recipes.md
  34. +2 -0  docs/requirements.txt
  35. +3 -3  docs/tree_construction.md
  36. +0 -125  docs/visitors.md
  37. +102 -0  docs/visitors.rst
  38. +0 -34  examples/README.md
  39. +21 -0  examples/README.rst
  40. +2 -0  examples/advanced/README.rst
  41. +64 -0  examples/advanced/_json_parser.py
  42. +44 -0  examples/advanced/conf_earley.py
  43. +40 -0  examples/advanced/conf_lalr.py
  44. +9 -8  examples/advanced/custom_lexer.py
  45. +37 -0  examples/advanced/error_puppet.py
  46. +8 -5  examples/advanced/error_reporting_lalr.py
  47. +0 -0  examples/advanced/python2.lark
  48. +0 -0  examples/advanced/python3.lark
  49. +12 -8  examples/advanced/python_bytecode.py
  50. +15 -15  examples/advanced/python_parser.py
  51. +11 -7  examples/advanced/qscintilla_json.py
  52. +11 -7  examples/advanced/reconstruct_json.py
  53. +0 -0  examples/advanced/template_lark.lark
  54. +29 -0  examples/advanced/templates.py
  55. +7 -3  examples/calc.py
  56. +0 -42  examples/conf_earley.py
  57. +0 -38  examples/conf_lalr.py
  58. +12 -3  examples/fruitflies.py
  59. +12 -9  examples/indented_tree.py
  60. +6 -6  examples/json_parser.py
  61. +19 -9  examples/lark_grammar.py
  62. +636 -354  examples/standalone/json_parser.py
  63. +6 -1  examples/turtle_dsl.py
  64. +20 -10  lark-stubs/exceptions.pyi
  65. +7 -1  lark-stubs/lark.pyi
  66. +4 -1  lark-stubs/lexer.pyi
  67. +0 -0  lark-stubs/parsers/__init__.pyi
  68. +22 -0  lark-stubs/parsers/lalr_puppet.pyi
  69. +1 -1  lark-stubs/reconstruct.pyi
  70. +2 -1  lark/__init__.py
  71. +6 -6  lark/common.py
  72. +95 -26  lark/exceptions.py
  73. +165 -87  lark/lark.py
  74. +73 -33  lark/lexer.py
  75. +76 -36  lark/load_grammar.py
  76. +81 -0  lark/parse_tree_builder.py
  77. +32 -15  lark/parser_frontends.py
  78. +4 -4  lark/parsers/earley.py
  79. +81 -4  lark/parsers/earley_forest.py
  80. +1 -1  lark/parsers/grammar_analysis.py
  81. +4 -5  lark/parsers/lalr_analysis.py
  82. +16 -13  lark/parsers/lalr_parser.py
  83. +128 -0  lark/parsers/lalr_puppet.py
  84. +33 -93  lark/reconstruct.py
  85. +19 -13  lark/tools/nearley.py
  86. +40 -2  lark/tools/standalone.py
  87. +40 -5  lark/tree.py
  88. +177 -0  lark/tree_matcher.py
  89. +81 -49  lark/utils.py
  90. +75 -14  lark/visitors.py
  91. +0 -14  mkdocs.yml
  92. +7 -5  readthedocs.yml
  93. +10 -2  setup.py
  94. +1 -0  test-requirements.txt
  95. +6 -2  tests/__main__.py
  96. +16 -0  tests/test_cache.py
  97. +31 -0  tests/test_grammar.py
  98. +65 -0  tests/test_logger.py
  99. +4 -3  tests/test_nearley/test_nearley.py
  100. +392 -3  tests/test_parser.py

+1 -1  .github/workflows/tests.yml

@@ -22,7 +22,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r nearley-requirements.txt
pip install -r test-requirements.txt
- name: Run tests
run: |
python -m tests

+2 -0  .gitignore

@@ -10,3 +10,5 @@ tags
.mypy_cache
/dist
/build
docs/_build
docs/examples

+14 -32  README.md

@@ -1,18 +1,18 @@
# Lark - a modern parsing library for Python
# Lark - a parsing toolkit for Python

Lark is a parser built with a focus on ergonomics, performance and resilience.
Lark is a parsing toolkit for Python, built with a focus on ergonomics, performance and modularity.

Lark can parse all context-free languages. That means it is capable of parsing almost any programming language out there, and to some degree most natural languages too.
Lark can parse all context-free languages. To put it simply, it means that it is capable of parsing almost any programming language out there, and to some degree most natural languages too.

**Who is it for?**

- **Beginners**: Lark is very friendly for experimentation. It can parse any grammar you throw at it, no matter how complicated or ambiguous, and do so efficiently. It also constructs an annotated parse-tree for you, using only the grammar, and it gives you convenient and flexible tools to process that parse-tree.
- **Beginners**: Lark is very friendly for experimentation. It can parse any grammar you throw at it, no matter how complicated or ambiguous, and do so efficiently. It also constructs an annotated parse-tree for you, using only the grammar and an input, and it gives you convenient and flexible tools to process that parse-tree.

- **Experts**: Lark implements both Earley(SPPF) and LALR(1), and several different lexers, so you can trade-off power and speed, according to your requirements. It also provides a variety of sophisticated features and utilities.

**What can it do?**

- Parse all context-free grammars, and handle any ambiguity
- Parse all context-free grammars, and handle any ambiguity gracefully
- Build an annotated parse-tree automagically, no construction code required.
- Provide first-rate performance in terms of both Big-O complexity and measured run-time (considering that this is Python ;)
- Run on every Python interpreter (it's pure-python)
@@ -25,14 +25,15 @@ Most importantly, Lark will save you time and prevent you from getting parsing h
### Quick links

- [Documentation @readthedocs](https://lark-parser.readthedocs.io/)
- [Cheatsheet (PDF)](/docs/lark_cheatsheet.pdf)
- [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf)
- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html)
- [Tutorial](/docs/json_tutorial.md) for writing a JSON parser.
- Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/)
- [Gitter chat](https://gitter.im/lark-parser/Lobby)

### Install Lark

$ pip install lark-parser
$ pip install lark-parser --upgrade

Lark has no dependencies.

@@ -76,12 +77,11 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt

### Fruit flies like bananas

Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like bananas":
Lark is great at handling ambiguity. Here is the result of parsing the phrase "fruit flies like bananas":

![fruitflies.png](examples/fruitflies.png)

See more [examples here](https://github.com/lark-parser/lark/tree/master/examples)

[Read the code here](https://github.com/lark-parser/lark/tree/master/examples/fruitflies.py), and [more examples here](https://github.com/lark-parser/lark/tree/master/examples)


## List of main features
@@ -99,7 +99,7 @@ See more [examples here](https://github.com/lark-parser/lark/tree/master/example
- **Python 2 & 3** compatible
- Automatic line & column tracking
- Standard library of terminals (strings, numbers, names, etc.)
- Import grammars from Nearley.js
- Import grammars from Nearley.js ([read more](/docs/nearley.md))
- Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark)
- MyPy support using type stubs
- And much more!
@@ -113,9 +113,9 @@ See the full list of [features here](https://lark-parser.readthedocs.io/en/lates

Lark is the fastest and lightest (lower is better)

![Run-time Comparison](docs/comparison_runtime.png)
![Run-time Comparison](docs/_static/comparison_runtime.png)

![Memory Usage Comparison](docs/comparison_memory.png)
![Memory Usage Comparison](docs/_static/comparison_memory.png)


Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more details on how the comparison was made.
@@ -155,28 +155,10 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail
- [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language
- [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer
- [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory
- [gersemi](https://github.com/BlankSpruce/gersemi) - A CMake code formatter

Using Lark? Send me a message and I'll add your project!

### How to use Nearley grammars in Lark

Lark comes with a tool to convert grammars from [Nearley](https://github.com/Hardmath123/nearley), a popular Earley library for Javascript. It uses [Js2Py](https://github.com/PiotrDabkowski/Js2Py) to convert and run the Javascript postprocessing code segments.

Here's an example:
```bash
git clone https://github.com/Hardmath123/nearley
python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley > ncalc.py
```

You can use the output as a regular python module:

```python
>>> import ncalc
>>> ncalc.parse('sin(pi/4) ^ e')
0.38981434460254655
```


## License

Lark uses the [MIT license](LICENSE).


+20 -0  docs/Makefile

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = Lark
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/comparison_memory.png → docs/_static/comparison_memory.png


docs/comparison_runtime.png → docs/_static/comparison_runtime.png


docs/lark_cheatsheet.pdf → docs/_static/lark_cheatsheet.pdf


+0 -185  docs/classes.md

@@ -1,185 +0,0 @@
# Classes Reference

This page details the important classes in Lark.

----

## lark.Lark

The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor.

#### \_\_init\_\_(self, grammar_string, **options)

Creates an instance of Lark with the given grammar

#### open(cls, grammar_filename, rel_to=None, **options)

Creates an instance of Lark with the grammar given by its filename

If rel_to is provided, the function will find the grammar filename in relation to it.

Example:

```python
>>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
Lark(...)
```

#### parse(self, text)

Return a complete parse tree for the text (of type Tree)

If a transformer is supplied to `__init__`, returns whatever is the result of the transformation.
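
For example, a minimal sketch (it reuses the `hello.lark` grammar that ships with the examples; any grammar would do):

```python
from lark import Lark

parser = Lark(r"""
    start: WORD "," WORD "!"

    %import common.WORD
    %ignore " "
""")

tree = parser.parse("Hello, World!")
print(tree.pretty())
```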


#### save(self, f) / load(cls, f)

Useful for caching and multiprocessing.

`save` saves the instance into the given file object

`load` loads an instance from the given file object

####


### Lark Options
#### General options

**start** - The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")

**debug** - Display debug information, such as warnings (default: False)

**transformer** - Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)

**propagate_positions** - Propagates (line, column, end_line, end_column) attributes into all tree branches.

**maybe_placeholders** -
- When True, the `[]` operator returns `None` when not matched.
- When `False`, `[]` behaves like the `?` operator, and returns no value at all.
- (default=`False`. Recommended to set to `True`)

**g_regex_flags** - Flags that are applied to all terminals (both regex and strings)

**keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False)

**cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.
- When `False`, does nothing (default)
- When `True`, caches to a temporary file in the local directory
- When given a string, caches to the path pointed by the string
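
To illustrate a few of these options together, here is a hedged sketch (the grammar and inputs are invented):

```python
from lark import Lark

parser = Lark(r"""
    start: "a" [NAME]

    %import common.CNAME -> NAME
    %ignore " "
""", parser="lalr", maybe_placeholders=True)

print(parser.parse("a").children)      # [None] - the unmatched [NAME] leaves a placeholder
print(parser.parse("a foo").children)  # [Token('NAME', 'foo')]
```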

#### Algorithm

**parser** - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
(there is also a "cyk" option for legacy)

**lexer** - Decides whether or not to use a lexer stage

- "auto" (default): Choose for me based on the parser
- "standard": Use a standard lexer
- "contextual": Stronger lexer (only works with parser="lalr")
- "dynamic": Flexible and powerful (only with parser="earley")
- "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. (only with parser="earley")

**ambiguity** - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
- "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules)
- "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).

#### Domain Specific

- **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
- **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
- **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
- **edit_terminals** - A callback

----

## Tree

The main tree class

* `data` - The name of the rule or alias
* `children` - List of matched sub-rules and terminals
* `meta` - Line & Column numbers (if `propagate_positions` is enabled)
* meta attributes: `line`, `column`, `start_pos`, `end_line`, `end_column`, `end_pos`

#### \_\_init\_\_(self, data, children)

Creates a new tree, and stores "data" and "children" in attributes of the same name.

#### pretty(self, indent_str=' ')

Returns an indented string representation of the tree. Great for debugging.

#### find_pred(self, pred)

Returns all nodes of the tree that evaluate pred(node) as true.

#### find_data(self, data)

Returns all nodes of the tree whose data equals the given data.

#### iter_subtrees(self)

Depth-first iteration.

Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG).

#### iter_subtrees_topdown(self)

Breadth-first iteration.

Iterates over all the subtrees, returning nodes in order like pretty() does.

#### \_\_eq\_\_, \_\_hash\_\_

Trees can be hashed and compared.
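
Putting a few of these methods together, as an illustrative sketch (the grammar and input are invented):

```python
from lark import Lark

parser = Lark(r"""
    start: pair+
    pair: NAME ":" NAME

    %import common.CNAME -> NAME
    %import common.WS
    %ignore WS
""")

tree = parser.parse("a: b  c: d")
print(tree.pretty())

# find_data() yields every subtree whose rule (data) is "pair"
for pair in tree.find_data("pair"):
    key, value = pair.children
    print(key, "->", value)
```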

----

## Token

When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes:

* `type` - Name of the token (as specified in grammar).
* `pos_in_stream` - the index of the token in the text
* `line` - The line of the token in the text (starting with 1)
* `column` - The column of the token in the text (starting with 1)
* `end_line` - The line where the token ends
* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5.
* `end_pos` - the index where the token ends (basically pos_in_stream + len(token))
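
A small sketch showing the attributes listed above, using a throwaway grammar:

```python
from lark import Lark

parser = Lark(r"""
    start: WORD+

    %import common.WORD
    %ignore " "
""", parser="lalr")

# With this grammar, the children of the root are Token instances
for tok in parser.parse("fruit flies").children:
    print(tok.type, str(tok), tok.line, tok.column, tok.end_column)
```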

## Transformer
## Visitor
## Interpreter

See the [visitors page](visitors.md)


## UnexpectedInput

## UnexpectedToken

## UnexpectedException

- `UnexpectedInput`
- `UnexpectedToken` - The parser received an unexpected token
- `UnexpectedCharacters` - The lexer encountered an unexpected string

After catching one of these exceptions, you may call the following helper methods to create a nicer error message:

#### get_context(text, span)

Returns a pretty string pinpointing the error in the text, with `span` amount of context characters around it.

(The parser doesn't hold a copy of the text it has to parse, so you have to provide it again)
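
A hedged sketch of how this might look (the grammar and the bad input are invented for illustration):

```python
from lark import Lark
from lark.exceptions import UnexpectedInput

parser = Lark('start: "a"+', parser="lalr")

text = "aaa!aaa"
try:
    parser.parse(text)
except UnexpectedInput as u:
    # Pass the original text back in, with 5 characters of context around the error
    print(u.get_context(text, span=5))
```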

#### match_examples(parse_fn, examples)

Allows you to detect what's wrong in the input text by matching against example errors.

Accepts the parse function (usually `lark_instance.parse`) and a dictionary of `{'example_string': value}`.

The function will iterate the dictionary until it finds a matching error, and return the corresponding value.

For an example usage, see: [examples/error_reporting_lalr.py](https://github.com/lark-parser/lark/blob/master/examples/error_reporting_lalr.py)

+70 -0  docs/classes.rst

@@ -0,0 +1,70 @@
API Reference
=============

Lark
----

.. autoclass:: lark.Lark
:members: open, parse, save, load


Using Unicode character classes with ``regex``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Python's builtin ``re`` module has a few persistent known bugs and also won't parse
advanced regex features such as character classes.
With ``pip install lark-parser[regex]``, the ``regex`` module will be
installed alongside lark and can act as a drop-in replacement to ``re``.

Any instance of Lark instantiated with ``regex=True`` will use the ``regex`` module instead of ``re``.

For example, we can use character classes to match PEP-3131 compliant Python identifiers:

::

from lark import Lark
>>> g = Lark(r"""
?start: NAME
NAME: ID_START ID_CONTINUE*
ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
""", regex=True)

>>> g.parse('வணக்கம்')
'வணக்கம்'


Tree
----

.. autoclass:: lark.Tree
:members: pretty, find_pred, find_data, iter_subtrees,
iter_subtrees_topdown

Token
-----

.. autoclass:: lark.Token

Transformer, Visitor & Interpreter
----------------------------------

See :doc:`visitors`.

UnexpectedInput
---------------

.. autoclass:: lark.exceptions.UnexpectedInput
:members: get_context, match_examples

.. autoclass:: lark.exceptions.UnexpectedToken

.. autoclass:: lark.exceptions.UnexpectedCharacters

.. _parserpuppet:

ParserPuppet
------------

.. autoclass:: lark.parsers.lalr_puppet.ParserPuppet
:members: choices, feed_token, copy, pretty, resume_parse

+184 -0  docs/conf.py

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Lark documentation build configuration file, created by
# sphinx-quickstart on Sun Aug 16 13:09:41 2020.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
autodoc_member_order = 'bysource'


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon',
'sphinx.ext.coverage',
'recommonmark',
'sphinx_gallery.gen_gallery'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = {
'.rst': 'restructuredtext',
'.md': 'markdown'
}


# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'Lark'
copyright = '2020, Erez Shinan'
author = 'Erez Shinan'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = ''
# The full version, including alpha/beta/rc tags.
release = ''

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
html_sidebars = {
'**': [
'relations.html', # needs 'show_related': True theme option to display
'searchbox.html',
]
}


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'Larkdoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',

# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',

# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',

# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'Lark.tex', 'Lark Documentation',
'Erez Shinan', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'lark', 'Lark Documentation',
[author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'Lark', 'Lark Documentation',
author, 'Lark', 'One line description of project.',
'Miscellaneous'),
]

# -- Sphinx gallery config -------------------------------------------

sphinx_gallery_conf = {
'examples_dirs': ['../examples'],
'gallery_dirs': ['examples'],
}

+9 -3  docs/features.md

@@ -1,4 +1,6 @@
# Main Features
# Features

## Main Features
- Earley parser, capable of parsing any context-free grammar
- Implements SPPF, for efficient parsing and storing of ambiguous grammars.
- LALR(1) parser, limited in power of expression, but very efficient in space and performance (O(n)).
@@ -6,6 +8,7 @@
- EBNF-inspired grammar, with extra features (See: [Grammar Reference](grammar.md))
- Builds a parse-tree (AST) automagically based on the grammar
- Stand-alone parser generator - create a small independent parser to embed in your project.
- Flexible error handling by using a "puppet parser" mechanism (LALR only)
- Automatic line & column tracking (for both tokens and matched rules)
- Automatic terminal collision resolution
- Standard library of terminals (strings, numbers, names, etc.)
@@ -17,11 +20,14 @@

[Read more about the parsers](parsers.md)

# Extra features
## Extra features

- Import rules and tokens from other Lark grammars, for code reuse and modularity.
- Import grammars from Nearley.js
- Support for external regex module ([see here](classes.md#using-unicode-character-classes-with-regex))
- Import grammars from Nearley.js ([read more](nearley.md))
- CYK parser
- Visualize your parse trees as dot or png files ([see_example](https://github.com/lark-parser/lark/blob/master/examples/fruitflies.py))


### Experimental features
- Automatic reconstruction of input from parse-tree (see examples)


+41 -12  docs/grammar.md

@@ -1,13 +1,5 @@
# Grammar Reference

Table of contents:

1. [Definitions](#defs)
1. [Terminals](#terms)
1. [Rules](#rules)
1. [Directives](#dirs)

<a name="defs"></a>
## Definitions

A **grammar** is a list of rules and terminals, that together define a language.
@@ -20,7 +12,7 @@ Each rule is a list of terminals and rules, whose location and nesting define th

A **parsing algorithm** is an algorithm that takes a grammar definition and a sequence of symbols (members of the alphabet), and matches the entirety of the sequence by searching for a structure that is allowed by the grammar.

## General Syntax and notes
### General Syntax and notes

Grammars in Lark are based on [EBNF](https://en.wikipedia.org/wiki/Extended_Backus–Naur_form) syntax, with several enhancements.

@@ -58,7 +50,6 @@ Lark begins the parse with the rule 'start', unless specified otherwise in the o
Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner).


<a name="terms"></a>
## Terminals

Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals.
@@ -83,12 +74,50 @@ Terminals also support grammar operators, such as `|`, `+`, `*` and `?`.

Terminals are a linear construct, and therefore may not contain themselves (recursion isn't allowed).

### Templates

Templates are expanded when preprocessing the grammar.

Definition syntax:

```ebnf
my_template{param1, param2, ...}: <EBNF EXPRESSION>
```

Use syntax:

```ebnf
some_rule: my_template{arg1, arg2, ...}
```

Example:
```ebnf
_separated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...'

num_list: "[" _separated{NUMBER, ","} "]" // Will match "[1, 2, 3]" etc.
```

### Priority

Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing).

Priority can be either positive or negative. If not specified for a terminal, it defaults to 1.

Highest priority terminals are always matched first.
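
For example (an illustrative snippet, not taken from the sections above), a priority is attached to the terminal name with a dot:

```ebnf
SELECT.2: "select"   // tried before NAME, so "select" lexes as SELECT
NAME: /[a-z]+/
```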

### Regexp Flags

You can use flags on regexps and strings. For example:

```perl
SELECT: "select"i //# Will ignore case, and match SELECT or Select, etc.
MULTILINE_TEXT: /.+/s
```

Supported flags are one of: `imslu`. See Python's regex documentation for more details on each one.

Regexps/strings of different flags can only be concatenated in Python 3.6+

#### Notes for when using a lexer:

When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence:
@@ -154,7 +183,6 @@ _ambig
```


<a name="rules"></a>
## Rules

**Syntax:**
@@ -176,7 +204,7 @@ Each item is one of:
* `TERMINAL`
* `"string literal"` or `/regexp literal/`
* `(item item ..)` - Group items
* `[item item ..]` - Maybe. Same as `(item item ..)?`, but generates `None` if there is no match
* `[item item ..]` - Maybe. Same as `(item item ..)?`, but when `maybe_placeholders=True`, generates `None` if there is no match.
* `item?` - Zero or one instances of item ("maybe")
* `item*` - Zero or more instances of item
* `item+` - One or more instances of item
@@ -256,3 +284,4 @@ Note that `%ignore` directives cannot be imported. Imported rules will abide by
### %declare

Declare a terminal without defining it. Useful for plugins.


+7 -6  docs/how_to_use.md

@@ -22,20 +22,21 @@ Of course, some specific use-cases may deviate from this process. Feel free to s

Browse the [Examples](https://github.com/lark-parser/lark/tree/master/examples) to find a template that suits your purposes.

Read the tutorials to get a better understanding of how everything works. (links in the [main page](/))
Read the tutorials to get a better understanding of how everything works. (links in the [main page](/index))

Use the [Cheatsheet (PDF)](lark_cheatsheet.pdf) for quick reference.
Use the [Cheatsheet (PDF)](/_static/lark_cheatsheet.pdf) for quick reference.

Use the reference pages for more in-depth explanations. (links in the [main page](/))
Use the reference pages for more in-depth explanations. (links in the [main page](/index))

## LALR usage

By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure `logging` framework beforehand. For example:
By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `logger` beforehand. For example:

```python
from lark import Lark
import logging
logging.basicConfig(level=logging.DEBUG)
from lark import Lark, logger

logger.setLevel(logging.DEBUG)

collision_grammar = '''
start: as as


+101 -0  docs/ide/app.html

@@ -0,0 +1,101 @@
<!doctype html>
<html>
<head>
<meta charset="UTF-8">

<!-- flip comment below to use local pyodide -->
<script src="https://pyodide-cdn2.iodide.io/v0.15.0/full/pyodide.js"></script>
<!-- <script src="./pyodide/pyodide.js"></script> -->

<link rel="stylesheet" href="https://unpkg.com/purecss@1.0.1/build/base-min.css">
<link href="https://fonts.googleapis.com/css2?family=Inconsolata:wght@500&display=swap" rel="stylesheet">

<script src="app.js"></script>
<style>
.is-loading:after {
background-image: url(is-loading.gif);
background-position: center 35%;
background-repeat: no-repeat;
background-color: hsla(0, 0%, 100%, .6);
position: absolute;
z-index: 700;
content: " ";
width: 100%;
height: 100%;
display: block;
left: 0;
right: 0;
top: 0;
bottom: 0
}

h1 {
text-align: center;
}

textarea, select, body > div > ul {
/* display: block;
margin: 15px auto;
width: 90%;
font-weight: bold;
color: #052569; */
font-family: 'Inconsolata', monospace;
}

textarea {
margin: 10px;
width: 90%;
padding: 10px;
font-size: 1.4em;
}

#grammar {
min-height: 300px;
}
#input {
min-height: 100px;
}

ul ul {
border-left: 1px dotted silver;
margin-left: -16px;
}

li {
list-style: circle;
margin-left: 10px;
}

select {
padding: 5px;
}

#inputs {
text-align: center;
}

#result {
display: flex;
justify-content: center;
}

#result > ul {
margin: 10px;
width: 90%;
padding: 10px;
font-size: 1.2em;
}

menu {
display: flex;
}

main {
margin: auto;
}

</style>
</head>
<body class="is-loading">
</body>
</html>

+105 -0  docs/ide/app.js

@@ -0,0 +1,105 @@
class app {

constructor(modules, invocation){
languagePluginLoader.then(() => {
// If you don't require for pre-loaded Python packages, remove this promise below.
window.pyodide.runPythonAsync("import setuptools, micropip").then(()=>{
window.pyodide.runPythonAsync("micropip.install('lark-parser')").then(()=>{
this.fetchSources(modules).then(() => {
window.pyodide.runPythonAsync("import " + Object.keys(modules).join("\nimport ") + "\n" + invocation + "\n").then(() => this.initializingComplete());
});
});
});
});
}

loadSources(module, baseURL, files) {
let promises = [];

for (let f in files) {
promises.push(
new Promise((resolve, reject) => {
let file = files[f];
let url = (baseURL ? baseURL + "/" : "") + file;

fetch(url, {}).then((response) => {
if (response.status === 200)
return response.text().then((code) => {
let path = ("/lib/python3.7/site-packages/" + module + "/" + file).split("/");
let lookup = "";

for (let i in path) {
if (!path[i]) {
continue;
}

lookup += (lookup ? "/" : "") + path[i];

if (parseInt(i) === path.length - 1) {
window.pyodide._module.FS.writeFile(lookup, code);
console.debug(`fetched ${lookup}`);
} else {
try {
window.pyodide._module.FS.lookupPath(lookup);
} catch {
window.pyodide._module.FS.mkdir(lookup);
console.debug(`created ${lookup}`);
}
}
}

resolve();
});
else
reject();
});
})
);
}

return Promise.all(promises);
}

fetchSources(modules) {
let promises = [];

for( let module of Object.keys(modules) )
{
promises.push(
new Promise((resolve, reject) => {
fetch(`${modules[module]}/files.json`, {}).then((response) => {
if (response.status === 200) {
response.text().then((list) => {
let files = JSON.parse(list);

this.loadSources(module, modules[module], files).then(() => {
resolve();
})
})
} else {
reject();
}
})
}));
}

return Promise.all(promises).then(() => {
for( let module of Object.keys(modules) ) {
window.pyodide.loadedPackages[module] = "default channel";
}

window.pyodide.runPython(
'import importlib as _importlib\n' +
'_importlib.invalidate_caches()\n'
);
});
}

initializingComplete() {
document.body.classList.remove("is-loading")
}
}

(function () {
window.top.app = new app({"app": "app"}, "import app.app; app.app.start()");
})();

+83 -0  docs/ide/app/app.py

@@ -0,0 +1,83 @@
from . import html5
from .examples import examples

from lark import Lark
from lark.tree import Tree


class App(html5.Div):
def __init__(self):
super().__init__("""
<h1>
<img src="lark-logo.png"> IDE
</h1>

<main>
<menu>
<select [name]="examples">
<option disabled selected>Examples</option>
</select>
<select [name]="parser">
<option value="earley" selected>Earley (default)</option>
<option value="lalr">LALR</option>
<option value="cyk">CYK</option>
</select>
</menu>
<div id="inputs">
<div>
<div>Grammar:</div>
<textarea [name]="grammar" id="grammar" placeholder="Lark Grammar..."></textarea>
</div>
<div>
<div>Input:</div>
<textarea [name]="input" id="input" placeholder="Parser input..."></textarea>
</div>
</div>
<div id="result">
<ul [name]="ast" />
</div>
</main>
""")
self.sinkEvent("onKeyUp", "onChange")

self.parser = "earley"

# Pre-load examples
for name, (grammar, input) in examples.items():
option = html5.Option(name)
option.grammar = grammar
option.input = input

self.examples.appendChild(option)

def onChange(self, e):
if html5.utils.doesEventHitWidgetOrChildren(e, self.examples):
example = self.examples.children(self.examples["selectedIndex"])
self.grammar["value"] = example.grammar.strip()
self.input["value"] = example.input.strip()
self.onKeyUp()

elif html5.utils.doesEventHitWidgetOrChildren(e, self.parser):
self.parser = self.parser.children(self.parser["selectedIndex"])["value"]
self.onKeyUp()

def onKeyUp(self, e=None):
l = Lark(self.grammar["value"], parser=self.parser)

try:
ast = l.parse(self.input["value"])
except Exception as e:
self.ast.appendChild(
html5.Li(str(e)), replace=True
)

print(ast)
traverse = lambda node: html5.Li([node.data, html5.Ul([traverse(c) for c in node.children])] if isinstance(node, Tree) else node)
self.ast.appendChild(traverse(ast), replace=True)


def start():
html5.Body().appendChild(
App()
)


+3152 -0  docs/ide/app/core.py (file diff suppressed because it is too large)


+150 -0  docs/ide/app/examples.py

@@ -0,0 +1,150 @@

# Examples formatted this way:
# "name": ("grammar", "demo-input")

examples = {

# --- hello.lark ---
"hello.lark": ("""
start: WORD "," WORD "!"

%import common.WORD // imports from terminal library
%ignore " " // Disregard spaces in text
""", "Hello, World!"),

# --- calc.lark ---
"calc.lark": ("""
?start: sum
| NAME "=" sum -> assign_var

?sum: product
| sum "+" product -> add
| sum "-" product -> sub

?product: atom
| product "*" atom -> mul
| product "/" atom -> div

?atom: NUMBER -> number
| "-" atom -> neg
| NAME -> var
| "(" sum ")"

%import common.CNAME -> NAME
%import common.NUMBER
%import common.WS_INLINE
%ignore WS_INLINE""",
"1 + 2 * 3 + 4"),

# --- json.lark ---
"json.lark": ("""
?start: value
?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null
array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value
string : ESCAPED_STRING
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS""",
"""
[
{
"_id": "5edb875cf3d764da55602437",
"index": 0,
"guid": "3dae2206-5d4d-41fe-b81d-dc8cdba7acaa",
"isActive": false,
"balance": "$2,872.54",
"picture": "http://placehold.it/32x32",
"age": 24,
"eyeColor": "blue",
"name": "Theresa Vargas",
"gender": "female",
"company": "GEEKOL",
"email": "theresavargas@geekol.com",
"phone": "+1 (930) 450-3445",
"address": "418 Herbert Street, Sexton, Florida, 1375",
"about": "Id minim deserunt laborum enim. Veniam commodo incididunt amet aute esse duis veniam occaecat nulla esse aute et deserunt eiusmod. Anim elit ullamco minim magna sint laboris. Est consequat quis deserunt excepteur in magna pariatur laborum quis eu. Ex quis tempor elit qui qui et culpa sunt sit esse mollit cupidatat. Fugiat cillum deserunt enim minim irure reprehenderit est. Voluptate nisi quis amet quis incididunt pariatur nostrud Lorem consectetur adipisicing voluptate.\\r\\n",
"registered": "2016-11-19T01:02:42 -01:00",
"latitude": -25.65267,
"longitude": 104.19531,
"tags": [
"eiusmod",
"reprehenderit",
"anim",
"sunt",
"esse",
"proident",
"esse"
],
"friends": [
{
"id": 0,
"name": "Roth Herrera"
},
{
"id": 1,
"name": "Callie Christian"
},
{
"id": 2,
"name": "Gracie Whitfield"
}
],
"greeting": "Hello, Theresa Vargas! You have 6 unread messages.",
"favoriteFruit": "banana"
},
{
"_id": "5edb875c845eb08161a83e64",
"index": 1,
"guid": "a8ada2c1-e2c7-40d3-96b4-52c93baff7f0",
"isActive": false,
"balance": "$2,717.04",
"picture": "http://placehold.it/32x32",
"age": 23,
"eyeColor": "green",
"name": "Lily Ross",
"gender": "female",
"company": "RODEOMAD",
"email": "lilyross@rodeomad.com",
"phone": "+1 (941) 465-3561",
"address": "525 Beekman Place, Blodgett, Marshall Islands, 3173",
"about": "Aliquip duis proident excepteur eiusmod in quis officia consequat culpa eu et ut. Occaecat reprehenderit tempor mollit do eu magna qui et magna exercitation aliqua. Incididunt exercitation dolor proident eiusmod minim occaecat. Sunt et minim mollit et veniam sint ex. Duis ullamco elit aute eu excepteur reprehenderit officia.\\r\\n",
"registered": "2019-11-02T04:06:42 -01:00",
"latitude": 17.031701,
"longitude": -42.657106,
"tags": [
"id",
"non",
"culpa",
"reprehenderit",
"esse",
"elit",
"sit"
],
"friends": [
{
"id": 0,
"name": "Ursula Maldonado"
},
{
"id": 1,
"name": "Traci Huff"
},
{
"id": 2,
"name": "Taylor Holt"
}
],
"greeting": "Hello, Lily Ross! You have 3 unread messages.",
"favoriteFruit": "strawberry"
}
]""")
}

+475 -0  docs/ide/app/ext.py

@@ -0,0 +1,475 @@
# -*- coding: utf-8 -*-
from . import core as html5
from . import utils

class Button(html5.Button):

def __init__(self, txt=None, callback=None, className=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self["class"] = "btn"

if className:
self.addClass(className)

self["type"] = "button"

if txt is not None:
self.setText(txt)

self.callback = callback
self.sinkEvent("onClick")

def setText(self, txt):
if txt is not None:
self.element.innerHTML = txt
self["title"] = txt
else:
self.element.innerHTML = ""
self["title"] = ""

def onClick(self, event):
event.stopPropagation()
event.preventDefault()
if self.callback is not None:
self.callback(self)


class Input(html5.Input):
def __init__(self, type="text", placeholder=None, callback=None, id=None, focusCallback=None, *args, **kwargs):
"""

:param type: Input type. Default: "text
:param placeholder: Placeholder text. Default: None
:param callback: Function to be called onChanged: callback(id, value)
:param id: Optional id of the input element. Will be passed to callback
:return:
"""
super().__init__(*args, **kwargs)
self["class"] = "input"
self["type"] = type
if placeholder is not None:
self["placeholder"] = placeholder

self.callback = callback
if id is not None:
self["id"] = id
self.sinkEvent("onChange")

self.focusCallback = focusCallback
if focusCallback:
self.sinkEvent("onFocus")

def onChange(self, event):
event.stopPropagation()
event.preventDefault()
if self.callback is not None:
self.callback(self, self["id"], self["value"])

def onFocus(self, event):
event.stopPropagation()
event.preventDefault()
if self.focusCallback is not None:
self.focusCallback(self, self["id"], self["value"])

def onDetach(self):
super().onDetach()
self.callback = None


class Popup(html5.Div):
def __init__(self, title=None, id=None, className=None, icon=None, enableShortcuts=True, closeable=True, *args, **kwargs):
super().__init__("""
<div class="box" [name]="popupBox">
<div class="box-head" [name]="popupHead">
<div class="item" [name]="popupHeadItem">
<div class="item-image">
<i class="i i--small" [name]="popupIcon"></i>
</div>
<div class="item-content">
<div class="item-headline" [name]="popupHeadline"></div>
</div>
</div>
</div>
<div class="box-body box--content" [name]="popupBody"></div>
<div class="box-foot box--content bar" [name]="popupFoot"></div>
</div>
""")

self.appendChild = self.popupBody.appendChild
self.fromHTML = lambda *args, **kwargs: self.popupBody.fromHTML(*args, **kwargs) if kwargs.get("bindTo") else self.popupBody.fromHTML(bindTo=self, *args, **kwargs)

self["class"] = "popup popup--center is-active"
if className:
self.addClass(className)

if closeable:
closeBtn = Button("&times;", self.close, className="item-action")
closeBtn.removeClass("btn")
self.popupHeadItem.appendChild(closeBtn)

if title:
self.popupHeadline.appendChild(title)

if icon:
self.popupIcon.appendChild(icon[0])
elif title:
self.popupIcon.appendChild(title[0])
else:
self.popupIcon.appendChild("Vi") #fixme!!! this _LIBRARY_ is not only used in the Vi...

# id can be used to pass information to callbacks
self.id = id

#FIXME: Implement a global overlay! One popupOverlay next to a list of popups.
self.popupOverlay = html5.Div()
self.popupOverlay["class"] = "popup-overlay is-active"

self.enableShortcuts = enableShortcuts
self.onDocumentKeyDownMethod = None

self.popupOverlay.appendChild(self)
html5.Body().appendChild(self.popupOverlay)

#FIXME: Close/Cancel every popup with click on popupCloseBtn without removing the global overlay.

def onAttach(self):
super(Popup, self).onAttach()

if self.enableShortcuts:
self.onDocumentKeyDownMethod = self.onDocumentKeyDown # safe reference to method
html5.document.addEventListener("keydown", self.onDocumentKeyDownMethod)

def onDetach(self):
super(Popup, self).onDetach()

if self.enableShortcuts:
html5.document.removeEventListener("keydown", self.onDocumentKeyDownMethod)

def onDocumentKeyDown(self, event):
if html5.isEscape(event):
self.close()

def close(self, *args, **kwargs):
html5.Body().removeChild(self.popupOverlay)
self.popupOverlay = None



class InputDialog(Popup):
def __init__(self, text, value="", successHandler=None, abortHandler=None,
successLbl="OK", abortLbl="Cancel", placeholder="", *args, **kwargs):

super().__init__(*args, **kwargs)
self.addClass("popup--inputdialog")

self.sinkEvent("onKeyDown", "onKeyUp")

self.successHandler = successHandler
self.abortHandler = abortHandler

self.fromHTML(
"""
<div class="input-group">
<label class="label">
{{text}}
</label>
<input class="input" [name]="inputElem" value="{{value}}" placeholder="{{placeholder}}" />
</div>
""",
vars={
"text": text,
"value": value,
"placeholder": placeholder
}
)

# Cancel
self.popupFoot.appendChild(Button(abortLbl, self.onCancel, className="btn--cancel btn--danger"))

# Okay
self.okayBtn = Button(successLbl, self.onOkay, className="btn--okay btn--primary")
if not value:
self.okayBtn.disable()

self.popupFoot.appendChild(self.okayBtn)

self.inputElem.focus()

def onKeyDown(self, event):
if html5.isReturn(event) and self.inputElem["value"]:
event.stopPropagation()
event.preventDefault()
self.onOkay()

def onKeyUp(self, event):
if self.inputElem["value"]:
self.okayBtn.enable()
else:
self.okayBtn.disable()

def onDocumentKeyDown(self, event):
if html5.isEscape(event):
event.stopPropagation()
event.preventDefault()
self.onCancel()

def onOkay(self, *args, **kwargs):
if self.successHandler:
self.successHandler(self, self.inputElem["value"])
self.close()

def onCancel(self, *args, **kwargs):
if self.abortHandler:
self.abortHandler(self, self.inputElem["value"])
self.close()


class Alert(Popup):
"""
Just displaying an alerting message box with OK-button.
"""

def __init__(self, msg, title=None, className=None, okCallback=None, okLabel="OK", icon="!", closeable=True, *args, **kwargs):
super().__init__(title, className=None, icon=icon, closeable=closeable, *args, **kwargs)
self.addClass("popup--alert")

if className:
self.addClass(className)

self.okCallback = okCallback

message = html5.Span()
message.addClass("alert-msg")
self.popupBody.appendChild(message)

if isinstance(msg, str):
msg = msg.replace("\n", "<br>")

message.appendChild(msg, bindTo=False)

self.sinkEvent("onKeyDown")

if closeable:
okBtn = Button(okLabel, callback=self.onOkBtnClick)
okBtn.addClass("btn--okay btn--primary")
self.popupFoot.appendChild(okBtn)

okBtn.focus()

def drop(self):
self.okCallback = None
self.close()

def onOkBtnClick(self, sender=None):
if self.okCallback:
self.okCallback(self)

self.drop()

def onKeyDown(self, event):
if html5.isReturn(event):
event.stopPropagation()
event.preventDefault()
self.onOkBtnClick()


class YesNoDialog(Popup):
def __init__(self, question, title=None, yesCallback=None, noCallback=None,
yesLabel="Yes", noLabel="No", icon="?",
closeable=False, *args, **kwargs):
super().__init__(title, closeable=closeable, icon=icon, *args, **kwargs)
self.addClass("popup--yesnodialog")

self.yesCallback = yesCallback
self.noCallback = noCallback

lbl = html5.Span()
lbl["class"].append("question")
self.popupBody.appendChild(lbl)

if isinstance(question, html5.Widget):
lbl.appendChild(question)
else:
utils.textToHtml(lbl, question)

if len(noLabel):
btnNo = Button(noLabel, className="btn--no", callback=self.onNoClicked)
#btnNo["class"].append("btn--no")
self.popupFoot.appendChild(btnNo)

btnYes = Button(yesLabel, callback=self.onYesClicked)
btnYes["class"].append("btn--yes")
self.popupFoot.appendChild(btnYes)

self.sinkEvent("onKeyDown")
btnYes.focus()

def onKeyDown(self, event):
if html5.isReturn(event):
event.stopPropagation()
event.preventDefault()
self.onYesClicked()

def onDocumentKeyDown(self, event):
if html5.isEscape(event):
event.stopPropagation()
event.preventDefault()
self.onNoClicked()

def drop(self):
self.yesCallback = None
self.noCallback = None
self.close()

def onYesClicked(self, *args, **kwargs):
if self.yesCallback:
self.yesCallback(self)

self.drop()

def onNoClicked(self, *args, **kwargs):
if self.noCallback:
self.noCallback(self)

self.drop()


class SelectDialog(Popup):

def __init__(self, prompt, items=None, title=None, okBtn="OK", cancelBtn="Cancel", forceSelect=False,
callback=None, *args, **kwargs):
super().__init__(title, *args, **kwargs)
self["class"].append("popup--selectdialog")

self.callback = callback
self.items = items
assert isinstance(self.items, list)

# Prompt
if prompt:
lbl = html5.Span()
lbl["class"].append("prompt")

if isinstance(prompt, html5.Widget):
lbl.appendChild(prompt)
else:
utils.textToHtml(lbl, prompt)

self.popupBody.appendChild(lbl)

# Items
if not forceSelect and len(items) <= 3:
for idx, item in enumerate(items):
if isinstance(item, dict):
title = item.get("title")
cssc = item.get("class")
elif isinstance(item, tuple):
title = item[1]
cssc = None
else:
title = item

btn = Button(title, callback=self.onAnyBtnClick)
btn.idx = idx

if cssc:
btn.addClass(cssc)

self.popupBody.appendChild(btn)
else:
self.select = html5.Select()
self.popupBody.appendChild(self.select)

for idx, item in enumerate(items):
if isinstance(item, dict):
title = item.get("title")
elif isinstance(item, tuple):
title = item[1]
else:
title = item

opt = html5.Option(title)
opt["value"] = str(idx)

self.select.appendChild(opt)

if okBtn:
self.popupFoot.appendChild(Button(okBtn, callback=self.onOkClick))

if cancelBtn:
self.popupFoot.appendChild(Button(cancelBtn, callback=self.onCancelClick))

def onAnyBtnClick(self, sender):
item = self.items[sender.idx]

if isinstance(item, dict) and item.get("callback") and callable(item["callback"]):
item["callback"](item)

if self.callback:
self.callback(item)

self.items = None
self.close()

def onCancelClick(self, sender=None):
self.close()

def onOkClick(self, sender=None):
assert self.select["selectedIndex"] >= 0
item = self.items[int(self.select.children(self.select["selectedIndex"])["value"])]

if isinstance(item, dict) and item.get("callback") and callable(item["callback"]):
item["callback"](item)

if self.callback:
self.callback(item)

self.items = None
self.select = None
self.close()


class TextareaDialog(Popup):
def __init__(self, text, value="", successHandler=None, abortHandler=None, successLbl="OK", abortLbl="Cancel",
*args, **kwargs):
super().__init__(*args, **kwargs)
self["class"].append("popup--textareadialog")

self.successHandler = successHandler
self.abortHandler = abortHandler

span = html5.Span()
span.element.innerHTML = text
self.popupBody.appendChild(span)

self.inputElem = html5.Textarea()
self.inputElem["value"] = value
self.popupBody.appendChild(self.inputElem)

okayBtn = Button(successLbl, self.onOkay)
okayBtn["class"].append("btn--okay")
self.popupFoot.appendChild(okayBtn)

cancelBtn = Button(abortLbl, self.onCancel)
cancelBtn["class"].append("btn--cancel")
self.popupFoot.appendChild(cancelBtn)

self.sinkEvent("onKeyDown")

self.inputElem.focus()

def onDocumentKeyDown(self, event):
if html5.isEscape(event):
event.stopPropagation()
event.preventDefault()
self.onCancel()

def onOkay(self, *args, **kwargs):
if self.successHandler:
self.successHandler(self, self.inputElem["value"])
self.close()

def onCancel(self, *args, **kwargs):
if self.abortHandler:
self.abortHandler(self, self.inputElem["value"])
self.close()

+9 -0  docs/ide/app/files.json

@@ -0,0 +1,9 @@
[
"app.py",
"examples.py",
"html5.py",
"core.py",
"ext.py",
"ignite.py",
"utils.py"
]

+6 -0  docs/ide/app/html5.py

@@ -0,0 +1,6 @@
#-*- coding: utf-8 -*-

from .core import *
from . import ext, utils, ignite



+186 -0  docs/ide/app/ignite.py

@@ -0,0 +1,186 @@
# -*- coding: utf-8 -*-
from . import core as html5


@html5.tag
class Label(html5.Label):
_parserTagName = "ignite-label"

def __init__(self, *args, **kwargs):
super(Label, self).__init__(style="label ignt-label", *args, **kwargs)


@html5.tag
class Input(html5.Input):
_parserTagName = "ignite-input"

def __init__(self, *args, **kwargs):
super(Input, self).__init__(style="input ignt-input", *args, **kwargs)


@html5.tag
class Switch(html5.Div):
_parserTagName = "ignite-switch"

def __init__(self, *args, **kwargs):
super(Switch, self).__init__(style="switch ignt-switch", *args, **kwargs)

self.input = html5.Input(style="switch-input")
self.appendChild(self.input)
self.input["type"] = "checkbox"

switchLabel = html5.Label(forElem=self.input)
switchLabel.addClass("switch-label")
self.appendChild(switchLabel)

def _setChecked(self, value):
self.input["checked"] = bool(value)

def _getChecked(self):
return self.input["checked"]


@html5.tag
class Check(html5.Input):
_parserTagName = "ignite-check"

def __init__(self, *args, **kwargs):
super(Check, self).__init__(style="check ignt-check", *args, **kwargs)

checkInput = html5.Input()
checkInput.addClass("check-input")
checkInput["type"] = "checkbox"
self.appendChild(checkInput)

checkLabel = html5.Label(forElem=checkInput)
checkLabel.addClass("check-label")
self.appendChild(checkLabel)


@html5.tag
class Radio(html5.Div):
_parserTagName = "ignite-radio"

def __init__(self, *args, **kwargs):
super(Radio, self).__init__(style="radio ignt-radio", *args, **kwargs)

radioInput = html5.Input()
radioInput.addClass("radio-input")
radioInput["type"] = "radio"
self.appendChild(radioInput)

radioLabel = html5.Label(forElem=radioInput)
radioLabel.addClass("radio-label")
self.appendChild(radioLabel)


@html5.tag
class Select(html5.Select):
_parserTagName = "ignite-select"

def __init__(self, *args, **kwargs):
super(Select, self).__init__(style="select ignt-select", *args, **kwargs)

defaultOpt = html5.Option()
defaultOpt["selected"] = True
defaultOpt["disabled"] = True
defaultOpt.element.innerHTML = ""
self.appendChild(defaultOpt)


@html5.tag
class Textarea(html5.Textarea):
_parserTagName = "ignite-textarea"

def __init__(self, *args, **kwargs):
super(Textarea, self).__init__(style="textarea ignt-textarea", *args, **kwargs)


@html5.tag
class Progress(html5.Progress):
_parserTagName = "ignite-progress"

def __init__(self, *args, **kwargs):
super(Progress, self).__init__(style="progress ignt-progress", *args, **kwargs)


@html5.tag
class Item(html5.Div):
_parserTagName = "ignite-item"

def __init__(self, title=None, descr=None, className=None, *args, **kwargs):
super(Item, self).__init__(style="item ignt-item", *args, **kwargs)
if className:
self.addClass(className)

self.fromHTML("""
<div class="item-image ignt-item-image" [name]="itemImage">
</div>
<div class="item-content ignt-item-content" [name]="itemContent">
<div class="item-headline ignt-item-headline" [name]="itemHeadline">
</div>
</div>
""")

if title:
self.itemHeadline.appendChild(html5.TextNode(title))

if descr:
self.itemSubline = html5.Div()
self.addClass("item-subline ignt-item-subline")
self.itemSubline.appendChild(html5.TextNode(descr))
self.appendChild(self.itemSubline)


@html5.tag
class Table(html5.Table):
_parserTagName = "ignite-table"

def __init__(self, *args, **kwargs):
super(Table, self).__init__(*args, **kwargs)
self.head.addClass("ignt-table-head")
self.body.addClass("ignt-table-body")

def prepareRow(self, row):
assert row >= 0, "Cannot create rows with negative index"

for child in self.body._children:
row -= child["rowspan"]
if row < 0:
return

while row >= 0:
tableRow = html5.Tr()
tableRow.addClass("ignt-table-body-row")
self.body.appendChild(tableRow)
row -= 1

def prepareCol(self, row, col):
assert col >= 0, "Cannot create cols with negative index"
self.prepareRow(row)

for rowChild in self.body._children:
row -= rowChild["rowspan"]

if row < 0:
for colChild in rowChild._children:
col -= colChild["colspan"]
if col < 0:
return

while col >= 0:
tableCell = html5.Td()
tableCell.addClass("ignt-table-body-cell")
rowChild.appendChild(tableCell)
col -= 1

return
def fastGrid( self, rows, cols, createHidden=False ):
colsstr = "".join(['<td class="ignt-table-body-cell"></td>' for i in range(0, cols)])
tblstr = '<tbody [name]="body" class="ignt-table-body" >'

for r in range(0, rows):
tblstr += '<tr class="ignt-table-body-row %s">%s</tr>' %("is-hidden" if createHidden else "",colsstr)
tblstr +="</tbody>"

self.fromHTML(tblstr)

+ 101
- 0
docs/ide/app/utils.py View File

@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
from . import core as html5

def unescape(val, maxLength = 0):
"""
Unescapes several HTML-escaped characters in a string.

:param val: The value to be unescaped.
:type val: str

:param maxLength: Cut-off after maxLength characters.
A value of 0 means "unlimited". (default)
:type maxLength: int

:returns: The unescaped string.
:rtype: str
"""
val = val \
.replace("&lt;", "<") \
.replace("&gt;", ">") \
.replace("&quot;", "\"") \
.replace("&#39;", "'")

if maxLength > 0:
return val[0:maxLength]

return val

def doesEventHitWidgetOrParents(event, widget):
"""
Test if event 'event' hits widget 'widget' (or *any* of its parents)
"""
while widget:
if event.target == widget.element:
return widget

widget = widget.parent()

return None

def doesEventHitWidgetOrChildren(event, widget):
"""
Test if event 'event' hits widget 'widget' (or *any* of its children)
"""
if event.target == widget.element:
return widget

for child in widget.children():
if doesEventHitWidgetOrChildren(event, child):
return child

return None

def textToHtml(node, text):
"""
Generates html nodes from text by splitting text into content and into
line breaks html5.Br.

:param node: The node where the nodes are appended to.
:param text: The text to be inserted.
"""

for (i, part) in enumerate(text.split("\n")):
if i > 0:
node.appendChild(html5.Br())

node.appendChild(html5.TextNode(part))

def parseInt(s, ret = 0):
"""
Parses a value as int
"""
if not isinstance(s, str):
return int(s)
elif s:
if s[0] in "+-":
ts = s[1:]
else:
ts = s

if ts and all([_ in "0123456789" for _ in ts]):
return int(s)

return ret

def parseFloat(s, ret = 0.0):
"""
Parses a value as float.
"""
if not isinstance(s, str):
return float(s)
elif s:
if s[0] in "+-":
ts = s[1:]
else:
ts = s

if ts and ts.count(".") <= 1 and all([_ in ".0123456789" for _ in ts]):
return float(s)

return ret
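
The lenient `parseInt`/`parseFloat` helpers above fall back to a default instead of raising on malformed input. A standalone sketch of that behaviour, for illustration only (not part of this diff):

```python
def lenient_int(s, fallback=0):
    # Mirrors parseInt() above: accept an optional sign, digits only,
    # and return a fallback value instead of raising on bad input.
    if not isinstance(s, str):
        return int(s)
    body = s[1:] if s[:1] in "+-" else s
    if body and all(ch in "0123456789" for ch in body):
        return int(s)
    return fallback

assert lenient_int("42") == 42
assert lenient_int("-7") == -7
assert lenient_int("4.5") == 0                 # not an integer literal -> fallback
assert lenient_int("abc", fallback=-1) == -1
```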

BIN
docs/ide/is-loading.gif View File

Width: 43  |  Height: 11  |  Size: 404 B

BIN
docs/ide/lark-logo.png View File

Width: 198  |  Height: 98  |  Size: 13 KiB

+ 0
- 53
docs/index.md View File

@@ -1,53 +0,0 @@
# Lark

A modern parsing library for Python

## Overview

Lark can parse any context-free grammar.

Lark provides:

- Advanced grammar language, based on EBNF
- Three parsing algorithms to choose from: Earley, LALR(1) and CYK
- Automatic tree construction, inferred from your grammar
- Fast unicode lexer with regexp support, and automatic line-counting

Lark's code is hosted on Github: [https://github.com/lark-parser/lark](https://github.com/lark-parser/lark)

### Install
```bash
$ pip install lark-parser
```

#### Syntax Highlighting

- [Sublime Text & TextMate](https://github.com/lark-parser/lark_syntax)
- [Visual Studio Code](https://github.com/lark-parser/vscode-lark) (Or install through the vscode plugin system)
- [Intellij & PyCharm](https://github.com/lark-parser/intellij-syntax-highlighting)

-----

## Documentation Index


* [Philosophy & Design Choices](philosophy.md)
* [Full List of Features](features.md)
* [Examples](https://github.com/lark-parser/lark/tree/master/examples)
* Tutorials
* [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter
* [How to write a JSON parser](json_tutorial.md) - Teaches you how to use Lark
* Unofficial
* [Program Synthesis is Possible](https://www.cs.cornell.edu/~asampson/blog/minisynth.html) - Creates a DSL for Z3
* Guides
* [How to use Lark](how_to_use.md)
* [How to develop Lark](how_to_develop.md)
* Reference
* [Grammar](grammar.md)
* [Tree Construction](tree_construction.md)
* [Visitors & Transformers](visitors.md)
* [Classes](classes.md)
* [Cheatsheet (PDF)](lark_cheatsheet.pdf)
* Discussion
* [Gitter](https://gitter.im/lark-parser/Lobby)
* [Forum (Google Groups)](https://groups.google.com/forum/#!forum/lark-parser)

+ 113
- 0
docs/index.rst View File

@@ -0,0 +1,113 @@
.. Lark documentation master file, created by
sphinx-quickstart on Sun Aug 16 13:09:41 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.

Welcome to Lark's documentation!
================================

.. toctree::
:maxdepth: 2
:caption: Overview
:hidden:

philosophy
features
parsers

.. toctree::
:maxdepth: 2
:caption: Tutorials & Guides
:hidden:

json_tutorial
how_to_use
how_to_develop
recipes
examples/index


.. toctree::
:maxdepth: 2
:caption: Reference
:hidden:

grammar
tree_construction
classes
visitors
nearley



Lark is a modern parsing library for Python. Lark can parse any context-free grammar.

Lark provides:

- Advanced grammar language, based on EBNF
- Three parsing algorithms to choose from: Earley, LALR(1) and CYK
- Automatic tree construction, inferred from your grammar
- Fast unicode lexer with regexp support, and automatic line-counting


Install Lark
--------------

.. code:: bash

$ pip install lark-parser

Syntax Highlighting
-------------------

- `Sublime Text & TextMate`_
- `Visual Studio Code`_ (Or install through the vscode plugin system)
- `Intellij & PyCharm`_

.. _Sublime Text & TextMate: https://github.com/lark-parser/lark_syntax
.. _Visual Studio Code: https://github.com/lark-parser/vscode-lark
.. _Intellij & PyCharm: https://github.com/lark-parser/intellij-syntax-highlighting

Resources
---------

- :doc:`philosophy`
- :doc:`features`
- `Examples`_
- `Online IDE`_
- Tutorials

- `How to write a DSL`_ - Implements a toy LOGO-like language with
an interpreter
- :doc:`json_tutorial` - Teaches you how to use Lark
- Unofficial

- `Program Synthesis is Possible`_ - Creates a DSL for Z3

- Guides

- :doc:`how_to_use`
- :doc:`how_to_develop`

- Reference

- :doc:`grammar`
- :doc:`tree_construction`
- :doc:`visitors`
- :doc:`classes`
- :doc:`nearley`
- `Cheatsheet (PDF)`_

- Discussion

- `Gitter`_
- `Forum (Google Groups)`_


.. _Examples: https://github.com/lark-parser/lark/tree/master/examples
.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html
.. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/
.. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html
.. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf
.. _Gitter: https://gitter.im/lark-parser/Lobby
.. _Forum (Google Groups): https://groups.google.com/forum/#!forum/lark-parser

+ 1
- 2
docs/json_tutorial.md View File

@@ -1,7 +1,6 @@
# Lark Tutorial - JSON parser
# JSON parser - Tutorial

Lark is a parser - a program that accepts a grammar and text, and produces a structured tree that represents that text.

In this tutorial we will write a JSON parser in Lark, and explore Lark's various features in the process.

It has 5 parts.


+ 36
- 0
docs/make.bat View File

@@ -0,0 +1,36 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=Lark
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd

+ 47
- 0
docs/nearley.md View File

@@ -0,0 +1,47 @@
# Importing grammars from Nearley

Lark comes with a tool to convert grammars from [Nearley](https://github.com/Hardmath123/nearley), a popular Earley library for JavaScript. It uses [Js2Py](https://github.com/PiotrDabkowski/Js2Py) to convert and run the JavaScript postprocessing code segments.

## Requirements

1. Install Lark with the `nearley` component:
```bash
pip install lark-parser[nearley]
```

2. Acquire a copy of the nearley codebase. This can be done using:
```bash
git clone https://github.com/Hardmath123/nearley
```

## Usage

Here's an example of how to import nearley's calculator example into Lark:

```bash
git clone https://github.com/Hardmath123/nearley
python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley > ncalc.py
```

You can use the output as a regular python module:

```python
>>> import ncalc
>>> ncalc.parse('sin(pi/4) ^ e')
0.38981434460254655
```

The Nearley converter also includes experimental support for newer JavaScript (ES6+), enabled with the `--es6` flag:

```bash
git clone https://github.com/Hardmath123/nearley
python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley --es6 > ncalc.py
```

## Notes

- Lark currently cannot import templates from Nearley

- Lark currently cannot export grammars to Nearley

These might get added in the future, if enough users ask for them.

+ 5
- 5
docs/parsers.md View File

@@ -1,7 +1,7 @@
# Parsers
Lark implements the following parsing algorithms: Earley, LALR(1), and CYK

# Earley
## Earley

An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar at O(n^3), and O(n^2) when the grammar is unambiguous. It can parse most LR grammars at O(n). Most programming languages are LR, and can be parsed in linear time.

@@ -13,7 +13,7 @@ It's possible to bypass the dynamic lexing, and use the regular Earley parser wi

Lark implements the Shared Packed Parse Forest data-structure for the Earley parser, in order to reduce the space and computation required to handle ambiguous grammars.

You can read more about SPPF [here](http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/)
You can read more about SPPF [here](https://web.archive.org/web/20191229100607/www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest)

As a result, Lark can efficiently parse and store every ambiguity in the grammar, when using Earley.

@@ -30,7 +30,7 @@ Lark provides the following options to combat ambiguity:

**TODO: Add documentation on dynamic_complete**

# LALR(1)
## LALR(1)

[LALR(1)](https://www.wikiwand.com/en/LALR_parser) is a very efficient, tried-and-tested parsing algorithm. It's incredibly fast and requires very little memory. It can parse most programming languages (for example: Python and Java).

@@ -42,7 +42,7 @@ The contextual lexer communicates with the parser, and uses the parser's lookahe

This is an improvement to LALR(1) that is unique to Lark.

# CYK Parser
## CYK Parser

A [CYK parser](https://www.wikiwand.com/en/CYK_algorithm) can parse any context-free grammar at O(n^3*|G|).
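
To make the comparison above concrete, here is a minimal sketch of picking an algorithm through the Lark constructor; the toy grammar is illustrative and not taken from the docs:

```python
from lark import Lark

grammar = r"""
    start: WORD+
    %import common.WORD
    %import common.WS
    %ignore WS
"""

# Earley (the default) accepts any context-free grammar and can expose every
# derivation of an ambiguous input; LALR(1) is much faster but only accepts
# LALR grammars.
earley_parser = Lark(grammar, parser="earley", ambiguity="explicit")
lalr_parser = Lark(grammar, parser="lalr", lexer="contextual")

print(earley_parser.parse("fruit flies like bananas").pretty())
print(lalr_parser.parse("fruit flies like bananas").pretty())
```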



+ 2
- 2
docs/philosophy.md View File

@@ -4,7 +4,7 @@ Parsers are innately complicated and confusing. They're difficult to understand,

Lark's mission is to make the process of writing them as simple and abstract as possible, by following these design principles:

### Design Principles
## Design Principles

1. Readability matters

@@ -23,7 +23,7 @@ In accordance with these principles, I arrived at the following design choices:

-----------

# Design Choices
## Design Choices

### 1. Separation of code and grammar



+ 1
- 1
docs/recipes.md View File

@@ -139,7 +139,7 @@ If your tree nodes aren't unique (if there is a shared Tree instance), the asser

```python
class Parent(Visitor):
def visit(self, tree):
def __default__(self, tree):
for subtree in tree.children:
if isinstance(subtree, Tree):
assert not hasattr(subtree, 'parent')


+ 2
- 0
docs/requirements.txt View File

@@ -0,0 +1,2 @@
# https://docs.readthedocs.io/en/stable/guides/specifying-dependencies.html#specifying-a-requirements-file
sphinx-gallery

+ 3
- 3
docs/tree_construction.md View File

@@ -1,4 +1,4 @@
# Automatic Tree Construction - Reference
# Tree Construction Reference


Lark builds a tree automatically based on the structure of the grammar, where each rule that is matched becomes a branch (node) in the tree, and its children are its matches, in the order of matching.
@@ -13,7 +13,7 @@ If `maybe_placeholders=False` (the default), then `[]` behaves like `()?`.

If `maybe_placeholders=True`, then using `[item]` will return the item if it matched, or the value `None`, if it didn't.

### Terminals
## Terminals

Terminals are always values in the tree, never branches.

@@ -74,7 +74,7 @@ Lark will parse "((hello world))" as:
The brackets do not appear in the tree by design. The words appear because they are matched by a named terminal.


# Shaping the tree
## Shaping the tree

Users can alter the automatic construction of the tree using a collection of grammar features.
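
A minimal sketch of the `maybe_placeholders` difference described in this page (the grammar is illustrative):

```python
from lark import Lark

grammar = r"""
    start: A [B] C
    A: "a"
    B: "b"
    C: "c"
"""

# With placeholders, the unmatched optional item shows up as None ...
print(Lark(grammar, maybe_placeholders=True).parse("ac").children)
# roughly: [Token('A', 'a'), None, Token('C', 'c')]

# ... without them, it is simply absent (the current default).
print(Lark(grammar, maybe_placeholders=False).parse("ac").children)
# roughly: [Token('A', 'a'), Token('C', 'c')]
```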



+ 0
- 125
docs/visitors.md View File

@@ -1,125 +0,0 @@
## Transformers & Visitors

Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns.

They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each method accepts the children as an argument. That can be modified using the `v_args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument.

See: <a href="https://github.com/lark-parser/lark/blob/master/lark/visitors.py">visitors.py</a>

### Visitors

Visitors visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up, starting with the leaves and ending at the root of the tree.

**Example:**
```python
class IncreaseAllNumbers(Visitor):
def number(self, tree):
assert tree.data == "number"
tree.children[0] += 1

IncreaseAllNumbers().visit(parse_tree)
```

There are two classes that implement the visitor interface:

* Visitor - Visit every node (without recursion)

* Visitor_Recursive - Visit every node using recursion. Slightly faster.

### Transformers

Transformers visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree.

Transformers can be used to implement map & reduce patterns.

Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable).

Transformers can be chained into a new transformer by using multiplication.

`Transformer` can do anything `Visitor` can do, but because it reconstructs the tree, it is slightly less efficient.


**Example:**
```python
from lark import Tree, Transformer

class EvalExpressions(Transformer):
def expr(self, args):
return eval(args[0])

t = Tree('a', [Tree('expr', ['1+2'])])
print(EvalExpressions().transform( t ))

# Prints: Tree(a, [3])
```

All these classes implement the transformer interface:

- Transformer - Recursively transforms the tree. This is the one you probably want.
- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances
- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances

### visit_tokens

By default, transformers only visit rules. `visit_tokens=True` will tell Transformer to visit tokens as well. This is a slightly slower alternative to `lexer_callbacks`, but it's easier to maintain and works for all algorithms (even when there isn't a lexer).

**Example:**

```python
class T(Transformer):
INT = int
NUMBER = float
def NAME(self, name):
return lookup_dict.get(name, name)


T(visit_tokens=True).transform(tree)
```


### v_args

`v_args` is a decorator.

By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior.

When used on a transformer/visitor class definition, it applies to all the callback methods inside it.

`v_args` accepts one of three flags:

- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists).
- `meta` - Provides two arguments: `children` and `meta` (instead of just the first)
- `tree` - Provides the entire tree as the argument, instead of the children.

**Examples:**

```python
@v_args(inline=True)
class SolveArith(Transformer):
def add(self, left, right):
return left + right


class ReverseNotation(Transformer_InPlace):
@v_args(tree=True)
def tree_node(self, tree):
tree.children = tree.children[::-1]
```

### `__default__` and `__default_token__`
These are the functions that are called on if a function with a corresponding name has not been found.

- The `__default__` method has the signature `(data, children, meta)`, with `data` being the data attribute of the node. It defaults to reconstruct the Tree

- The `__default_token__` just takes the `Token` as an argument. It defaults to just return the argument.


### Discard

When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent.



+ 102
- 0
docs/visitors.rst View File

@@ -0,0 +1,102 @@
Transformers & Visitors
=======================

Transformers & Visitors provide a convenient interface to process the
parse-trees that Lark returns.

They are used by inheriting from the correct class (visitor or transformer),
and implementing methods corresponding to the rule you wish to process. Each
method accepts the children as an argument. That can be modified using the
``v_args`` decorator, which allows inlining the arguments (akin to ``*args``),
or adding the tree ``meta`` property as an argument.

See: `visitors.py`_

.. _visitors.py: https://github.com/lark-parser/lark/blob/master/lark/visitors.py

Visitor
-------

Visitors visit each node of the tree, and run the appropriate method on it according to the node's data.

They work bottom-up, starting with the leaves and ending at the root of the tree.

There are two classes that implement the visitor interface:

- ``Visitor``: Visit every node (without recursion)
- ``Visitor_Recursive``: Visit every node using recursion. Slightly faster.

Example:
::

class IncreaseAllNumbers(Visitor):
def number(self, tree):
assert tree.data == "number"
tree.children[0] += 1

IncreaseAllNumbers().visit(parse_tree)

.. autoclass:: lark.visitors.Visitor

.. autoclass:: lark.visitors.Visitor_Recursive

Interpreter
-----------

.. autoclass:: lark.visitors.Interpreter


Example:
::

class IncreaseSomeOfTheNumbers(Interpreter):
def number(self, tree):
tree.children[0] += 1

def skip(self, tree):
# skip this subtree. don't change any number node inside it.
pass

IncreaseSomeOfTheNumbers().visit(parse_tree)

Transformer
-----------

.. autoclass:: lark.visitors.Transformer
:members: __default__, __default_token__

Example:
::

from lark import Tree, Transformer

class EvalExpressions(Transformer):
def expr(self, args):
return eval(args[0])

t = Tree('a', [Tree('expr', ['1+2'])])
print(EvalExpressions().transform( t ))

# Prints: Tree(a, [3])

Example:
::

class T(Transformer):
INT = int
NUMBER = float
def NAME(self, name):
return lookup_dict.get(name, name)

T(visit_tokens=True).transform(tree)


v_args
------

.. autofunction:: lark.visitors.v_args

Discard
-------

.. autoclass:: lark.visitors.Discard
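
The ``v_args`` examples from the removed Markdown page still apply; for reference, essentially as they appeared there:

```python
from lark import Transformer, v_args
from lark.visitors import Transformer_InPlace

@v_args(inline=True)
class SolveArith(Transformer):
    # children arrive as positional arguments instead of a single list
    def add(self, left, right):
        return left + right


class ReverseNotation(Transformer_InPlace):
    @v_args(tree=True)
    def tree_node(self, tree):
        # the whole subtree is passed in, not just its children
        tree.children = tree.children[::-1]
```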

+ 0
- 34
examples/README.md View File

@@ -1,34 +0,0 @@
# Examples for Lark

#### How to run the examples

After cloning the repo, open the terminal into the root directory of the project, and run the following:

```bash
[lark]$ python -m examples.<name_of_example>
```

For example, the following will parse all the Python files in the standard library of your local installation:

```bash
[lark]$ python -m examples.python_parser
```

### Beginners

- [calc.py](calc.py) - A simple example of a REPL calculator
- [json\_parser.py](json_parser.py) - A simple JSON parser (comes with a tutorial, see docs)
- [indented\_tree.py](indented\_tree.py) - A demonstration of parsing indentation ("whitespace significant" language)
- [fruitflies.py](fruitflies.py) - A demonstration of ambiguity
- [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
- [lark\_grammar.py](lark_grammar.py) + [lark.lark](lark.lark) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer)

### Advanced

- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser
- [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!)
- [python\_bytecode.py](python_bytecode.py) - A toy example showing how to compile Python directly to bytecode
- [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language
- [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language
- [custom\_lexer.py](custom_lexer.py) - Demonstrates using a custom lexer to parse a non-textual stream of data
- [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature

+ 21
- 0
examples/README.rst View File

@@ -0,0 +1,21 @@
Examples for Lark
=================

**How to run the examples**:

After cloning the repo, open a terminal in the root directory of the
project, and run the following:

.. code:: bash

[lark]$ python -m examples.<name_of_example>

For example, the following will parse all the Python files in the
standard library of your local installation:

.. code:: bash

[lark]$ python -m examples.python_parser

Beginner Examples
~~~~~~~~~~~~~~~~~

+ 2
- 0
examples/advanced/README.rst View File

@@ -0,0 +1,2 @@
Advanced Examples
~~~~~~~~~~~~~~~~~

+ 64
- 0
examples/advanced/_json_parser.py View File

@@ -0,0 +1,64 @@
"""
Simple JSON Parser
==================

The code is short and clear, and outperforms every other parser (that's written in Python).
For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md

(this is here for use by the other examples)
"""
import sys

from lark import Lark, Transformer, v_args

json_grammar = r"""
?start: value

?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value

string : ESCAPED_STRING

%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""


class TreeToJson(Transformer):
@v_args(inline=True)
def string(self, s):
return s[1:-1].replace('\\"', '"')

array = list
pair = tuple
object = dict
number = v_args(inline=True)(float)

null = lambda self, _: None
true = lambda self, _: True
false = lambda self, _: False


### Create the JSON parser with Lark, using the LALR algorithm
json_parser = Lark(json_grammar, parser='lalr',
# Using the standard lexer isn't required, and isn't usually recommended.
# But, it's good enough for JSON, and it's slightly faster.
lexer='standard',
# Disabling propagate_positions and placeholders slightly improves speed
propagate_positions=False,
maybe_placeholders=False,
# Using an internal transformer is faster and more memory efficient
transformer=TreeToJson())


+ 44
- 0
examples/advanced/conf_earley.py View File

@@ -0,0 +1,44 @@
"""
Earley’s dynamic lexer
======================

Demonstrates the power of Earley’s dynamic lexer on a toy configuration language

Using a lexer for configuration files is tricky, because values don't
have to be surrounded by delimiters. Using a standard lexer for this just won't work.

In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity.

Another approach is to use the contextual lexer with LALR. It is less powerful than Earley,
but it can handle some ambiguity when lexing and it's much faster.
See examples/conf_lalr.py for an example of that approach.

"""
from lark import Lark

parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE? _NL
VALUE: /./+

%import common.CNAME -> NAME
%import common.NEWLINE -> _NL
%import common.WS_INLINE
%ignore WS_INLINE
""", parser="earley")

def test():
sample_conf = """
[bla]

a=Hello
this="that",4
empty=
"""

r = parser.parse(sample_conf)
print (r.pretty())

if __name__ == '__main__':
test()

+ 40
- 0
examples/advanced/conf_lalr.py View File

@@ -0,0 +1,40 @@
"""
LALR’s contextual lexer
=======================

Demonstrates the power of LALR’s contextual lexer on a toy configuration language.

The tokens NAME and VALUE match the same input. A standard lexer would arbitrarily
choose one over the other, which would lead to a (confusing) parse error.
However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows
which one of them to expect at each point during the parse.
The lexer then only matches the tokens that the parser expects.
The result is a correct parse, something that is impossible with a regular lexer.

Another approach is to discard a lexer altogether and use the Earley algorithm.
It will handle more cases than the contextual lexer, but at the cost of performance.
See examples/conf_earley.py for an example of that approach.
"""
from lark import Lark

parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE? _NL
VALUE: /./+

%import common.CNAME -> NAME
%import common.NEWLINE -> _NL
%import common.WS_INLINE
%ignore WS_INLINE
""", parser="lalr")


sample_conf = """
[bla]
a=Hello
this="that",4
empty=
"""

print(parser.parse(sample_conf).pretty())

examples/custom_lexer.py → examples/advanced/custom_lexer.py View File

@@ -1,13 +1,14 @@
#
# This example demonstrates using Lark with a custom lexer.
#
# You can use a custom lexer to tokenize text when the lexers offered by Lark
# are too slow, or not flexible enough.
#
# You can also use it (as shown in this example) to tokenize streams of objects.
#
"""
Custom lexer
============

Demonstrates using a custom lexer to parse a non-textual stream of data

You can use a custom lexer to tokenize text when the lexers offered by Lark
are too slow, or not flexible enough.

You can also use it (as shown in this example) to tokenize streams of objects.
"""
from lark import Lark, Transformer, v_args
from lark.lexer import Lexer, Token


+ 37
- 0
examples/advanced/error_puppet.py View File

@@ -0,0 +1,37 @@
"""
Error handling with a puppet
==================================

This example demonstrates error handling using a parsing puppet in LALR

When the parser encounters an UnexpectedToken exception, it creates a
parsing puppet with the current parse-state, and lets you control how
to proceed step-by-step. When you've achieved the correct parse-state,
you can resume the run by returning True.
"""

from lark import Token

from _json_parser import json_parser

def ignore_errors(e):
if e.token.type == 'COMMA':
# Skip comma
return True
elif e.token.type == 'SIGNED_NUMBER':
# Try to feed a comma and retry the number
e.puppet.feed_token(Token('COMMA', ','))
e.puppet.feed_token(e.token)
return True

# Unhandled error. Will stop parse and raise exception
return False


def main():
s = "[0 1, 2,, 3,,, 4, 5 6 ]"
res = json_parser.parse(s, on_error=ignore_errors)
print(res) # prints [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

main()


examples/error_reporting_lalr.py → examples/advanced/error_reporting_lalr.py View File

@@ -1,10 +1,13 @@
#
# This demonstrates example-driven error reporting with the LALR parser
#
"""
Example-Driven Error Reporting
==============================

A demonstration of example-driven error reporting with the LALR parser

"""
from lark import Lark, UnexpectedInput

from .json_parser import json_grammar # Using the grammar from the json_parser example
from _json_parser import json_grammar # Using the grammar from the json_parser example

json_parser = Lark(json_grammar, parser='lalr')

@@ -52,7 +55,7 @@ def parse(json_text):
'[1,2,]',
'{"foo":1,}',
'{"foo":false,"bar":true,}']
})
}, use_accepts=True)
if not exc_class:
raise
raise exc_class(u.get_context(json_text), u.line, u.column)

examples/python2.lark → examples/advanced/python2.lark View File


examples/python3.lark → examples/advanced/python3.lark View File


examples/python_bytecode.py → examples/advanced/python_bytecode.py View File

@@ -1,12 +1,16 @@
#
# This is a toy example that compiles Python directly to bytecode, without generating an AST.
# It currently only works for very very simple Python code.
#
# It requires the 'bytecode' library. You can get it using
#
# $ pip install bytecode
#
"""
Compile Python to Bytecode
==========================
A toy example that compiles Python directly to bytecode, without generating an AST.
It currently only works for very, very simple Python code.

It requires the 'bytecode' library. You can get it using
::

$ pip install bytecode

"""
from lark import Lark, Transformer, v_args
from lark.indenter import Indenter


examples/python_parser.py → examples/advanced/python_parser.py View File

@@ -1,7 +1,11 @@
#
# This example demonstrates usage of the included Python grammars
#
"""
Grammar-complete Python Parser
==============================

A fully-working Python 2 & 3 parser (but not production ready yet!)

This example demonstrates usage of the included Python grammars
"""
import sys
import os, os.path
from io import open
@@ -26,6 +30,13 @@ python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs)
python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)

try:
xrange
except NameError:
chosen_parser = python_parser3
else:
chosen_parser = python_parser2


def _read(fn, *args):
kwargs = {'encoding': 'iso-8859-1'}
@@ -42,24 +53,13 @@ def _get_lib_path():
return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0]

def test_python_lib():

path = _get_lib_path()

start = time.time()
files = glob.glob(path+'/*.py')
for f in files:
print( f )
try:
# print list(python_parser.lex(_read(os.path.join(path, f)) + '\n'))
try:
xrange
except NameError:
python_parser3.parse(_read(os.path.join(path, f)) + '\n')
else:
python_parser2.parse(_read(os.path.join(path, f)) + '\n')
except:
print ('At %s' % f)
raise
chosen_parser.parse(_read(os.path.join(path, f)) + '\n')

end = time.time()
print( "test_python_lib (%d files), time: %s secs"%(len(files), end-start) )

examples/qscintilla_json.py → examples/advanced/qscintilla_json.py View File

@@ -1,10 +1,14 @@
#
# This example shows how to write a syntax-highlighted editor with Qt and Lark
#
# Requirements:
#
# PyQt5==5.10.1
# QScintilla==2.10.4
"""
Syntax Highlighting
===================

This example shows how to write a syntax-highlighted editor with Qt and Lark

Requirements:

PyQt5==5.10.1
QScintilla==2.10.4
"""

import sys
import textwrap

examples/reconstruct_json.py → examples/advanced/reconstruct_json.py View File

@@ -1,16 +1,20 @@
#
# This example demonstrates an experimental feature: Text reconstruction
# The Reconstructor takes a parse tree (already filtered from punctuation, of course),
# and reconstructs it into correct text, that can be parsed correctly.
# It can be useful for creating "hooks" to alter data before handing it to other parsers. You can also use it to generate samples from scratch.
#
"""
Reconstruct a JSON
==================

Demonstrates the experimental text-reconstruction feature

The Reconstructor takes a parse tree (already filtered from punctuation, of course),
and reconstructs it into correct text, that can be parsed correctly.
It can be useful for creating "hooks" to alter data before handing it to other parsers. You can also use it to generate samples from scratch.
"""

import json

from lark import Lark
from lark.reconstruct import Reconstructor

from .json_parser import json_grammar
from _json_parser import json_grammar

test_json = '''
{

examples/template_lark.lark → examples/advanced/template_lark.lark View File


+ 29
- 0
examples/advanced/templates.py View File

@@ -0,0 +1,29 @@
"""
Templates
=========

This example shows how to use Lark's templates to achieve cleaner grammars

"""
from lark import Lark

grammar = r"""
start: list | dict

list: "[" _seperated{atom, ","} "]"
dict: "{" _seperated{key_value, ","} "}"
key_value: atom ":" atom

_seperated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...'

atom: NUMBER | ESCAPED_STRING

%import common (NUMBER, ESCAPED_STRING, WS)
%ignore WS
"""


parser = Lark(grammar)

print(parser.parse('[1, "a", 2]'))
print(parser.parse('{"a": 2, "b": 6}'))

+ 7
- 3
examples/calc.py View File

@@ -1,7 +1,11 @@
#
# This example shows how to write a basic calculator with variables.
#
"""
Basic calculator
================

A simple example of a REPL calculator

This example shows how to write a basic calculator with variables.
"""
from lark import Lark, Transformer, v_args




+ 0
- 42
examples/conf_earley.py View File

@@ -1,42 +0,0 @@
#
# This example demonstrates parsing using the dynamic-lexer earley frontend
#
# Using a lexer for configuration files is tricky, because values don't
# have to be surrounded by delimiters. Using a standard lexer for this just won't work.
#
# In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity.
#
# Another approach is to use the contextual lexer with LALR. It is less powerful than Earley,
# but it can handle some ambiguity when lexing and it's much faster.
# See examples/conf_lalr.py for an example of that approach.
#


from lark import Lark

parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE? _NL
VALUE: /./+

%import common.CNAME -> NAME
%import common.NEWLINE -> _NL
%import common.WS_INLINE
%ignore WS_INLINE
""", parser="earley")

def test():
sample_conf = """
[bla]

a=Hello
this="that",4
empty=
"""

r = parser.parse(sample_conf)
print (r.pretty())

if __name__ == '__main__':
test()

+ 0
- 38
examples/conf_lalr.py View File

@@ -1,38 +0,0 @@
#
# This example demonstrates the power of the contextual lexer, by parsing a config file.
#
# The tokens NAME and VALUE match the same input. A standard lexer would arbitrarily
# choose one over the other, which would lead to a (confusing) parse error.
# However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows
# which one of them to expect at each point during the parse.
# The lexer then only matches the tokens that the parser expects.
# The result is a correct parse, something that is impossible with a regular lexer.
#
# Another approach is to discard a lexer altogether and use the Earley algorithm.
# It will handle more cases than the contextual lexer, but at the cost of performance.
# See examples/conf_earley.py for an example of that approach.
#

from lark import Lark

parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE? _NL
VALUE: /./+

%import common.CNAME -> NAME
%import common.NEWLINE -> _NL
%import common.WS_INLINE
%ignore WS_INLINE
""", parser="lalr")


sample_conf = """
[bla]
a=Hello
this="that",4
empty=
"""

print(parser.parse(sample_conf).pretty())

+ 12
- 3
examples/fruitflies.py View File

@@ -1,7 +1,12 @@
#
# This example shows how to use get explicit ambiguity from Lark's Earley parser.
#
"""
Handling Ambiguity
==================

A demonstration of ambiguity

This example shows how to get explicit ambiguity from Lark's Earley parser.

"""
import sys
from lark import Lark, tree

@@ -28,9 +33,13 @@ sentence = 'fruit flies like bananas'
def make_png(filename):
tree.pydot__tree_to_png( parser.parse(sentence), filename)

def make_dot(filename):
tree.pydot__tree_to_dot( parser.parse(sentence), filename)

if __name__ == '__main__':
print(parser.parse(sentence).pretty())
# make_png(sys.argv[1])
# make_dot(sys.argv[1])

# Output:
#


+ 12
- 9
examples/indented_tree.py View File

@@ -1,13 +1,16 @@
#
# This example demonstrates usage of the Indenter class.
#
# Since indentation is context-sensitive, a postlex stage is introduced to
# manufacture INDENT/DEDENT tokens.
#
# It is crucial for the indenter that the NL_type matches
# the spaces (and tabs) after the newline.
#
"""
Parsing Indentation
===================

A demonstration of parsing indentation (“whitespace significant” language)
and the usage of the Indenter class.

Since indentation is context-sensitive, a postlex stage is introduced to
manufacture INDENT/DEDENT tokens.

It is crucial for the indenter that the NL_type matches
the spaces (and tabs) after the newline.
"""
from lark import Lark
from lark.indenter import Indenter



+ 6
- 6
examples/json_parser.py View File

@@ -1,10 +1,10 @@
#
# This example shows how to write a basic JSON parser
#
# The code is short and clear, and outperforms every other parser (that's written in Python).
# For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md
#
"""
Simple JSON Parser
==================

The code is short and clear, and outperforms every other parser (that's written in Python).
For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md
"""
import sys

from lark import Lark, Transformer, v_args


+ 19
- 9
examples/lark_grammar.py View File

@@ -1,15 +1,25 @@
from lark import Lark
"""
Lark Grammar
============

parser = Lark(open('examples/lark.lark'), parser="lalr")
A reference implementation of the Lark grammar (using LALR(1))
"""
import lark
from pathlib import Path

parser = lark.Lark.open('lark.lark', rel_to=__file__, parser="lalr")

examples_path = Path(__file__).parent
lark_path = Path(lark.__file__).parent

grammar_files = [
'examples/python2.lark',
'examples/python3.lark',
'examples/lark.lark',
'examples/relative-imports/multiples.lark',
'examples/relative-imports/multiple2.lark',
'examples/relative-imports/multiple3.lark',
'lark/grammars/common.lark',
examples_path / 'lark.lark',
examples_path / 'advanced/python2.lark',
examples_path / 'advanced/python3.lark',
examples_path / 'relative-imports/multiples.lark',
examples_path / 'relative-imports/multiple2.lark',
examples_path / 'relative-imports/multiple3.lark',
lark_path / 'grammars/common.lark',
]

def test():


+ 636
- 354
examples/standalone/json_parser.py
File diff suppressed because it is too large
View File


+ 6
- 1
examples/turtle_dsl.py View File

@@ -1,4 +1,9 @@
# This example implements a LOGO-like toy language for Python's turtle, with interpreter.
"""
Turtle DSL
==========

Implements a LOGO-like toy language for Python’s turtle, with interpreter.
"""

try:
input = raw_input # For Python2 compatibility


+ 20
- 10
lark-stubs/exceptions.pyi View File

@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-

from typing import Dict, Iterable, Callable, Union
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
from .tree import Tree
from .lexer import Token
from .parsers.lalr_puppet import ParserPuppet

class LarkError(Exception):
pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
pass


T = TypeVar('T')


class UnexpectedInput(LarkError):
line: int
column: int
pos_in_stream: int
state: Any

def get_context(self, text: str, span: int = ...):
...

def match_examples(
self,
parse_fn: Callable[[str], Tree],
examples: Dict[str, Iterable[str]]
):
self,
parse_fn: Callable[[str], Tree],
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
token_type_match_fallback: bool = False,
use_accepts: bool = False,
) -> T:
...


class UnexpectedToken(ParseError, UnexpectedInput):
pass

expected: Set[str]
considered_rules: Set[str]
puppet: ParserPuppet
accepts: Set[str]

class UnexpectedCharacters(LexError, UnexpectedInput):
line: int
column: int
allowed: Set[str]
considered_tokens: Set[Any]


class VisitError(LarkError):


+ 7
- 1
lark-stubs/lark.pyi View File

@@ -23,6 +23,7 @@ class LarkOptions:
transformer: Optional[Transformer]
postlex: Optional[PostLex]
ambiguity: str
regex: bool
debug: bool
keep_all_tokens: bool
propagate_positions: bool
@@ -30,10 +31,12 @@ class LarkOptions:
lexer_callbacks: Dict[str, Callable[[Token], Token]]
cache: Union[bool, str]
g_regex_flags: int
use_bytes: bool


class Lark:
source: str
grammar_source: str
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]
@@ -48,12 +51,15 @@ class Lark:
transformer: Optional[Transformer] = None,
postlex: Optional[PostLex] = None,
ambiguity: Literal["explicit", "resolve"] = "resolve",
regex: bool = False,
debug: bool = False,
keep_all_tokens: bool = False,
propagate_positions: bool = False,
maybe_placeholders: bool = False,
lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
g_regex_flags: int = ...
cache: Union[bool, str] = False,
g_regex_flags: int = ...,
use_bytes: bool = False,
):
...
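
A sketch exercising the options newly declared in this stub (`regex`, `cache`, `g_regex_flags`, `use_bytes`); the grammar and option values are illustrative:

```python
from lark import Lark

parser = Lark(r"""
    start: WORD+
    %import common.WORD
    %import common.WS
    %ignore WS
""",
    parser="lalr",
    regex=False,        # True switches from the stdlib `re` to the `regex` module
    cache=True,         # cache the LALR analysis for faster subsequent loads
    g_regex_flags=0,    # flags applied to every terminal
    use_bytes=False,    # True accepts a `bytes` input instead of `str`
)

print(parser.parse("hello world").pretty())
```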



+ 4
- 1
lark-stubs/lexer.pyi View File

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from types import ModuleType
from typing import (
TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional,
Pattern as REPattern,
@@ -107,10 +107,12 @@ class TraditionalLexer(Lexer):
user_callbacks: Dict[str, _Callback]
callback: Dict[str, _Callback]
mres: List[Tuple[REPattern, Dict[int, str]]]
re: ModuleType

def __init__(
self,
terminals: Collection[TerminalDef],
re_: ModuleType,
ignore: Collection[str] = ...,
user_callbacks: Dict[str, _Callback] = ...,
g_regex_flags: int = ...
@@ -135,6 +137,7 @@ class ContextualLexer(Lexer):
self,
terminals: Collection[TerminalDef],
states: Dict[str, Collection[str]],
re_: ModuleType,
ignore: Collection[str] = ...,
always_accept: Collection[str] = ...,
user_callbacks: Dict[str, _Callback] = ...,


+ 0
- 0
lark-stubs/parsers/__init__.pyi View File


+ 22
- 0
lark-stubs/parsers/lalr_puppet.pyi View File

@@ -0,0 +1,22 @@
from typing import Set, Dict, Any

from lark import Token, Tree


class ParserPuppet(object):
"""
Provides an interface to interactively step through the parser (LALR(1) only for now)

Accessible via `UnexpectedToken.puppet` (raised by the parser on token error)
"""
def feed_token(self, token: Token): ...

def copy(self) -> ParserPuppet: ...

def pretty(self) -> str: ...

def choices(self) -> Dict[str, Any]: ...

def accepts(self) -> Set[str]: ...

def resume_parse(self) -> Tree: ...
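
A sketch of driving the puppet from an `on_error` callback, in the spirit of `examples/advanced/error_puppet.py` above; the grammar and the assumed terminal name `B` are illustrative:

```python
from lark import Lark, Token

parser = Lark(r"""
    start: "a" "b" "c"
    %import common.WS
    %ignore WS
""", parser="lalr", lexer="standard")

def handle(e):
    # `e.puppet` holds the live parser state at the point of failure.
    print(e.puppet.pretty())               # readable dump of the parse stacks
    print(e.puppet.accepts())              # terminal names acceptable right here
    e.puppet.feed_token(Token("B", "b"))   # inject the token the input was missing...
    e.puppet.feed_token(e.token)           # ...then replay the token that failed
    return True                            # and let the parser resume

print(parser.parse("a c", on_error=handle).pretty())
```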

+ 1
- 1
lark-stubs/reconstruct.pyi View File

@@ -30,7 +30,7 @@ class MakeMatchTree:

class Reconstructor:

def __init__(self, parser: Lark):
def __init__(self, parser: Lark, term_subs: Dict[str, str] = ...):
...

def reconstruct(self, tree: Tree) -> str:


+ 2
- 1
lark/__init__.py View File

@@ -1,3 +1,4 @@
from .utils import logger
from .tree import Tree
from .visitors import Transformer, Visitor, v_args, Discard
from .visitors import InlineTransformer, inline_args # XXX Deprecated
@@ -6,4 +7,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
from .lexer import Token
from .lark import Lark

__version__ = "0.8.5"
__version__ = "0.9.0"

+ 6
- 6
lark/common.py View File

@@ -4,18 +4,18 @@ from .lexer import TerminalDef
###{standalone

class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
self.tokens = tokens
def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
self.tokens = tokens # TODO should be terminals
self.ignore = ignore
self.postlex = postlex
self.callbacks = callbacks or {}
self.g_regex_flags = g_regex_flags
def _deserialize(self):
self.callbacks = {} # TODO
self.re_module = re_module
self.skip_validation = skip_validation
self.use_bytes = use_bytes

###}



+ 95
- 26
lark/exceptions.py View File

@@ -1,6 +1,8 @@
from .utils import STRING_TYPE
from .utils import STRING_TYPE, logger

###{standalone


class LarkError(Exception):
pass

@@ -22,54 +24,109 @@ class UnexpectedEOF(ParseError):


class UnexpectedInput(LarkError):
"""UnexpectedInput Error.

Used as a base class for the following exceptions:

- ``UnexpectedToken``: The parser received an unexpected token
- ``UnexpectedCharacters``: The lexer encountered an unexpected string

After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
"""
pos_in_stream = None

def get_context(self, text, span=40):
"""Returns a pretty string pinpointing the error in the text,
with span amount of context characters around it.

Note:
The parser doesn't hold a copy of the text it has to parse,
so you have to provide it again
"""
pos = self.pos_in_stream
start = max(pos - span, 0)
end = pos + span
before = text[start:pos].rsplit('\n', 1)[-1]
after = text[pos:end].split('\n', 1)[0]
return before + after + '\n' + ' ' * len(before) + '^\n'

def match_examples(self, parse_fn, examples):
""" Given a parser instance and a dictionary mapping some label with
some malformed syntax examples, it'll return the label for the
example that bests matches the current error.
if not isinstance(text, bytes):
before = text[start:pos].rsplit('\n', 1)[-1]
after = text[pos:end].split('\n', 1)[0]
return before + after + '\n' + ' ' * len(before.expandtabs()) + '^\n'
else:
before = text[start:pos].rsplit(b'\n', 1)[-1]
after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace")

def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
"""Allows you to detect what's wrong in the input text by matching
against example errors.

Given a parser instance and a dictionary mapping some label with
some malformed syntax examples, it'll return the label for the
example that best matches the current error. The function will
iterate the dictionary until it finds a matching error, and
return the corresponding value.

For an example usage, see `examples/error_reporting_lalr.py`

Parameters:
parse_fn: parse function (usually ``lark_instance.parse``)
examples: dictionary of ``{'example_string': value}``.
use_accepts: Recommended to call this with ``use_accepts=True``.
The default is ``False`` for backwards compatibility.
"""
assert self.state is not None, "Not supported for this exception"

candidate = None
for label, example in examples.items():
if isinstance(examples, dict):
examples = examples.items()

candidate = (None, False)
for i, (label, example) in enumerate(examples):
assert not isinstance(example, STRING_TYPE)

for malformed in example:
for j, malformed in enumerate(example):
try:
parse_fn(malformed)
except UnexpectedInput as ut:
if ut.state == self.state:
if use_accepts and ut.accepts != self.accepts:
logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
continue
try:
if ut.token == self.token: # Try exact match first
logger.debug("Exact Match at example [%s][%s]" % (i, j))
return label

if token_type_match_fallback:
# Fallback to token types match
if (ut.token.type == self.token.type) and not candidate[-1]:
logger.debug("Token Type Fallback at example [%s][%s]" % (i, j))
candidate = label, True

except AttributeError:
pass
if not candidate:
candidate = label
if not candidate[0]:
logger.debug("Same State match at example [%s][%s]" % (i, j))
candidate = label, False

return candidate
return candidate[0]


class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

self.line = line
self.column = column
self.allowed = allowed
self.considered_tokens = considered_tokens
self.pos_in_stream = lex_pos
self.state = state

self.allowed = allowed
self.considered_tokens = considered_tokens

if isinstance(seq, bytes):
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
else:
_s = seq[lex_pos]

message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
message += '\n\n' + self.get_context(seq)
if allowed:
message += '\nExpecting: %s\n' % allowed
@@ -79,23 +136,35 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
super(UnexpectedCharacters, self).__init__(message)



class UnexpectedToken(ParseError, UnexpectedInput):
def __init__(self, token, expected, considered_rules=None, state=None):
self.token = token
self.expected = expected # XXX str shouldn't necessary
"""When the parser throws UnexpectedToken, it instanciates a puppet
with its internal state. Users can then interactively set the puppet to
the desired puppet state, and resume regular parsing.

see: :ref:`ParserPuppet`.
"""
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
self.considered_rules = considered_rules
self.state = state
self.pos_in_stream = getattr(token, 'pos_in_stream', None)
self.state = state

self.token = token
self.expected = expected # XXX deprecate? `accepts` is better
self.considered_rules = considered_rules
self.puppet = puppet

# TODO Only calculate `accepts()` when we need to display it to the user
# This will improve performance when doing automatic error handling
self.accepts = puppet and puppet.accepts()

message = ("Unexpected token %r at line %s, column %s.\n"
"Expected one of: \n\t* %s\n"
% (token, self.line, self.column, '\n\t* '.join(self.expected)))
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

super(UnexpectedToken, self).__init__(message)


class VisitError(LarkError):
"""VisitError is raised when visitors are interrupted by an exception



+ 165
- 87
lark/lark.py View File

@@ -1,19 +1,25 @@
from __future__ import absolute_import

import sys, os, pickle, hashlib, logging
import sys, os, pickle, hashlib
from io import open


from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf

from .lexer import Lexer, TraditionalLexer, TerminalDef
from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
from .parser_frontends import get_frontend, _get_lexer_callbacks
from .grammar import Rule

import re
try:
import regex
except ImportError:
regex = None

###{standalone

class LarkOptions(Serialize):
@@ -21,61 +27,69 @@ class LarkOptions(Serialize):

"""
OPTIONS_DOC = """
# General

start - The start symbol. Either a string, or a list of strings for
multiple possible starts (Default: "start")
debug - Display debug information, such as warnings (default: False)
transformer - Applies the transformer to every parse tree (equivlent to
applying it after the parse, but faster)
propagate_positions - Propagates (line, column, end_line, end_column)
attributes into all tree branches.
maybe_placeholders - When True, the `[]` operator returns `None` when not matched.
When `False`, `[]` behaves like the `?` operator,
and returns no value at all.
(default=`False`. Recommended to set to `True`)
cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
LALR only for now.
When `False`, does nothing (default)
When `True`, caches to a temporary file in the local directory
When given a string, caches to the path pointed by the string

g_regex_flags - Flags that are applied to all terminals
(both regex and strings)
keep_all_tokens - Prevent the tree builder from automagically
removing "punctuation" tokens (default: False)

# Algorithm

parser - Decides which parser engine to use
Accepts "earley" or "lalr". (Default: "earley")
(there is also a "cyk" option for legacy)

lexer - Decides whether or not to use a lexer stage
"auto" (default): Choose for me based on the parser
"standard": Use a standard lexer
"contextual": Stronger lexer (only works with parser="lalr")
"dynamic": Flexible and powerful (only with parser="earley")
"dynamic_complete": Same as dynamic, but tries *every* variation
of tokenizing possible.

ambiguity - Decides how to handle ambiguity in the parse.
Only relevant if parser="earley"
"resolve": The parser will automatically choose the simplest
derivation (it chooses consistently: greedy for
tokens, non-greedy for rules)
"explicit": The parser will return all derivations wrapped
in "_ambig" tree nodes (i.e. a forest).

# Domain Specific

postlex - Lexer post-processing (Default: None) Only works with the
standard and contextual lexers.
priority - How priorities should be evaluated - auto, none, normal,
invert (Default: auto)
lexer_callbacks - Dictionary of callbacks for the lexer. May alter
tokens during lexing. Use with caution.
edit_terminals - A callback
**=== General Options ===**

start
The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")
debug
Display debug information, such as warnings (default: False)
transformer
Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
propagate_positions
Propagates (line, column, end_line, end_column) attributes into all tree branches.
maybe_placeholders
When True, the ``[]`` operator returns ``None`` when not matched.

When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all.
(default= ``False``. Recommended to set to ``True``)
regex
When True, uses the ``regex`` module instead of the stdlib ``re``.
cache
Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.

- When ``False``, does nothing (default)
- When ``True``, caches to a temporary file in the local directory
- When given a string, caches to the path pointed by the string

g_regex_flags
Flags that are applied to all terminals (both regex and strings)
keep_all_tokens
Prevent the tree builder from automagically removing "punctuation" tokens (default: False)

**=== Algorithm Options ===**

parser
Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley").
(there is also a "cyk" option for legacy)
lexer
Decides whether or not to use a lexer stage

- "auto" (default): Choose for me based on the parser
- "standard": Use a standard lexer
- "contextual": Stronger lexer (only works with parser="lalr")
- "dynamic": Flexible and powerful (only with parser="earley")
- "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
ambiguity
Decides how to handle ambiguity in the parse. Only relevant if parser="earley"

- "resolve" - The parser will automatically choose the simplest derivation
(it chooses consistently: greedy for tokens, non-greedy for rules)
- "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).

**=== Misc. / Domain Specific Options ===**

postlex
Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
priority
How priorities should be evaluated - auto, none, normal, invert (Default: auto)
lexer_callbacks
Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
use_bytes
Accept an input of type ``bytes`` instead of ``str`` (Python 3 only).
edit_terminals
A callback for editing the terminals before parse.

**=== End Options ===**
"""
if __doc__:
__doc__ += OPTIONS_DOC
@@ -92,11 +106,13 @@ class LarkOptions(Serialize):
'start': 'start',
'priority': 'auto',
'ambiguity': 'auto',
'regex': False,
'propagate_positions': False,
'lexer_callbacks': {},
'maybe_placeholders': False,
'edit_terminals': None,
'g_regex_flags': 0,
'use_bytes': False,
}

def __init__(self, options_dict):
@@ -106,7 +122,7 @@ class LarkOptions(Serialize):
for name, default in self._defaults.items():
if name in o:
value = o.pop(name)
if isinstance(default, bool) and name != 'cache':
if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
value = bool(value)
else:
value = default
@@ -146,14 +162,31 @@ class LarkOptions(Serialize):


class Lark(Serialize):
def __init__(self, grammar, **options):
"""
grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
options : a dictionary controlling various aspects of Lark.
"""
"""Main interface for the library.

It's mostly a thin wrapper for the many different parsers, and for the tree constructor.

Parameters:
grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax)
options: a dictionary controlling various aspects of Lark.

Example:
>>> Lark(r'''start: "foo" ''')
Lark(...)
"""
def __init__(self, grammar, **options):
self.options = LarkOptions(options)

# Set regex or re module
use_regex = self.options.regex
if use_regex:
if regex:
re_module = regex
else:
raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
else:
re_module = re

# Some, but not all file-like objects have a 'name' attribute
try:
self.source = grammar.name
@@ -169,6 +202,13 @@ class Lark(Serialize):
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ValueError("Grammar must be ascii only, when use_bytes=True")
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise NotImplementedError("`use_bytes=True` may have issues on python2. "
"Use `use_bytes='force'` to use it at your own risk.")

cache_fn = None
if self.options.cache:
@@ -178,15 +218,16 @@ class Lark(Serialize):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ValueError("cache must be bool or str")
raise ValueError("cache argument must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
from . import __version__
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
s = grammar + options_str
s = grammar + options_str + __version__
md5 = hashlib.md5(s.encode()).hexdigest()
cache_fn = '.lark_cache_%s.tmp' % md5

if FS.exists(cache_fn):
logging.debug('Loading grammar from cache: %s', cache_fn)
logger.debug('Loading grammar from cache: %s', cache_fn)
with FS.open(cache_fn, 'rb') as f:
self._load(f, self.options.transformer, self.options.postlex)
return
@@ -224,7 +265,7 @@ class Lark(Serialize):
assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source)
self.grammar = load_grammar(grammar, self.source, re_module)

# Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -233,7 +274,7 @@ class Lark(Serialize):
for t in self.terminals:
self.options.edit_terminals(t)

self._terminals_dict = {t.name:t for t in self.terminals}
self._terminals_dict = {t.name: t for t in self.terminals}

# If the user asked to invert the priorities, negate them all here.
# This replaces the old 'resolve__antiscore_sum' option.
@@ -250,14 +291,12 @@ class Lark(Serialize):
rule.options.priority = None

# TODO Deprecate lexer_callbacks?
lexer_callbacks = dict(self.options.lexer_callbacks)
if self.options.transformer:
t = self.options.transformer
for term in self.terminals:
if hasattr(t, term.name):
lexer_callbacks[term.name] = getattr(t, term.name)
lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals)
if self.options.transformer
else {})
lexer_callbacks.update(self.options.lexer_callbacks)

self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

if self.options.parser:
self.parser = self._build_parser()
@@ -265,17 +304,16 @@ class Lark(Serialize):
self.lexer = self._build_lexer()

if cache_fn:
logging.debug('Saving grammar to cache: %s', cache_fn)
logger.debug('Saving grammar to cache: %s', cache_fn)
with FS.open(cache_fn, 'wb') as f:
self.save(f)

if __init__.__doc__:
__init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC
__doc__ += "\n\n" + LarkOptions.OPTIONS_DOC

__serialize_fields__ = 'parser', 'rules', 'options'

def _build_lexer(self):
return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
return TraditionalLexer(self.lexer_conf)

def _prepare_callbacks(self):
self.parser_class = get_frontend(self.options.parser, self.options.lexer)
@@ -288,11 +326,19 @@ class Lark(Serialize):
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

def save(self, f):
"""Saves the instance into the given file object

Useful for caching and multiprocessing.
"""
data, m = self.memo_serialize([TerminalDef, Rule])
pickle.dump({'data': data, 'memo': m}, f)

@classmethod
def load(cls, f):
"""Loads an instance from the given file object

Useful for caching and multiprocessing.
"""
inst = cls.__new__(cls)
return inst._load(f)

@@ -312,10 +358,18 @@ class Lark(Serialize):
if postlex is not None:
options['postlex'] = postlex
self.options = LarkOptions.deserialize(options, memo)
re_module = regex if self.options.regex else re
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self._prepare_callbacks()
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
self.parser = self.parser_class.deserialize(
data['parser'],
memo,
self._callbacks,
self.options.postlex,
self.options.transformer,
re_module
)
return self

@classmethod
@@ -327,7 +381,7 @@ class Lark(Serialize):
def open(cls, grammar_filename, rel_to=None, **options):
"""Create an instance of Lark with the grammar given by its filename

If rel_to is provided, the function will find the grammar filename in relation to it.
If ``rel_to`` is provided, the function will find the grammar filename in relation to it.

Example:

@@ -358,13 +412,37 @@ class Lark(Serialize):
"Get information about a terminal"
return self._terminals_dict[name]

def parse(self, text, start=None):
def parse(self, text, start=None, on_error=None):
"""Parse the given text, according to the options provided.

The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option).
Parameters:
text (str): Text to be parsed.
start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
on_error (function, optional): If provided, will be called on an UnexpectedToken error. Return ``True`` to resume parsing.
LALR only. See examples/error_puppet.py for an example of how to use ``on_error``.

Returns:
If a transformer is supplied to ``__init__``, returns the result of the
transformation. Otherwise, returns a ``Tree`` instance.

Returns a tree, unless specified otherwise.
"""
return self.parser.parse(text, start=start)

try:
return self.parser.parse(text, start=start)
except UnexpectedToken as e:
if on_error is None:
raise

while True:
if not on_error(e):
raise e
try:
return e.puppet.resume_parse()
except UnexpectedToken as e2:
if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
# Prevent infinite loop
raise e2
e = e2


###}
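To illustrate the new ``on_error`` hook, here is a minimal sketch; the grammar and input are made up, and real recovery code usually inspects ``e.token`` and may feed tokens through ``e.puppet``, as in examples/error_puppet.py:

    from lark import Lark

    parser = Lark(r'''
        start: "a" "b" "c"
        %ignore " "
    ''', parser='lalr')

    def skip_bad_token(e):
        # Receives the UnexpectedToken exception; a truthy return value
        # resumes parsing from the puppet state, skipping the bad token.
        print('Skipping', repr(e.token), '- expected one of', e.expected)
        return True

    # The duplicated "b" would normally abort the parse; here it is skipped.
    tree = parser.parse('a b b c', on_error=skip_bad_token)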

+ 73
- 33
lark/lexer.py View File

@@ -6,6 +6,7 @@ from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone
from copy import copy

class Pattern(Serialize):

@@ -88,8 +89,25 @@ class TerminalDef(Serialize):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)



class Token(Str):
"""A string with meta-information, that is produced by the lexer.

When parsing text, the resulting chunks of the input that haven't been discarded,
will end up in the tree as Token instances. The Token class inherits from Python's ``str``,
so normal string comparisons and operations will work as expected.

Attributes:
type: Name of the token (as specified in grammar)
value: Value of the token (redundant, as ``token.value == token`` will always be true)
pos_in_stream: The index of the token in the text
line: The line of the token in the text (starting with 1)
column: The column of the token in the text (starting with 1)
end_line: The line where the token ends
end_column: The next column after the end of the token. For example,
if the token is a single character with a column value of 4,
end_column will be 5.
end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``)
"""
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
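A small sketch of what these attributes look like in practice (the grammar and input are illustrative):

    from lark import Lark

    parser = Lark(r'''
        start: WORD
        WORD: /\w+/
    ''', parser='lalr')

    tok = parser.parse('hello').children[0]
    print(tok.type, repr(tok.value))        # WORD 'hello'
    print(tok.line, tok.column)             # 1 1
    print(tok.pos_in_stream, tok.end_pos)   # 0 5
    print(tok.end_line, tok.end_column)     # 1 6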
@@ -139,8 +157,8 @@ class Token(Str):


class LineCounter:
def __init__(self):
self.newline_char = '\n'
def __init__(self, newline_char):
self.newline_char = newline_char
self.char_pos = 0
self.line = 1
self.column = 1
@@ -169,7 +187,7 @@ class _Lex:
def lex(self, stream, newline_types, ignore_types):
newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types)
line_ctr = LineCounter()
line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
last_token = None

while line_ctr.char_pos < len(stream):
@@ -230,7 +248,7 @@ class CallChain:



def _create_unless(terminals, g_regex_flags):
def _create_unless(terminals, g_regex_flags, re_, use_bytes):
tokens_by_type = classify(terminals, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
@@ -241,37 +259,40 @@ def _create_unless(terminals, g_regex_flags):
if strtok.priority > retok.priority:
continue
s = strtok.pattern.value
m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
if m and m.group(0) == s:
unless.append(strtok)
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback


def _build_mres(terminals, max_size, g_regex_flags, match_whole):
def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries fewer and fewer groups until it succeeds.
postfix = '$' if match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if use_bytes:
pattern = pattern.encode('latin-1')
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
mre = re_.compile(pattern, g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
terminals = terminals[max_size:]
return mres

def build_mres(terminals, g_regex_flags, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)

def _regexp_has_newline(r):
r"""Expressions that may indicate newlines in a regexp:
@@ -294,34 +315,40 @@ class Lexer(object):

class TraditionalLexer(Lexer):

def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
def __init__(self, conf):
terminals = list(conf.tokens)
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

terminals = list(terminals)
self.re = conf.re_module

# Sanitization
for t in terminals:
try:
re.compile(t.pattern.to_regexp(), g_regex_flags)
except re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
if not conf.skip_validation:
# Sanitization
for t in terminals:
try:
self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
except self.re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

assert set(ignore) <= {t.name for t in terminals}
assert set(conf.ignore) <= {t.name for t in terminals}

# Init
self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
self.ignore_types = list(ignore)
self.ignore_types = list(conf.ignore)

terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
self.terminals = terminals
self.user_callbacks = user_callbacks
self.build(g_regex_flags)
self.user_callbacks = conf.callbacks
self.g_regex_flags = conf.g_regex_flags
self.use_bytes = conf.use_bytes

def build(self, g_regex_flags=0):
terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
self._mres = None
# self.build(g_regex_flags)

def _build(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
assert all(self.callback.values())

for type_, f in self.user_callbacks.items():
@@ -331,7 +358,13 @@ class TraditionalLexer(Lexer):
else:
self.callback[type_] = f

self.mres = build_mres(terminals, g_regex_flags)
self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)

@property
def mres(self):
if self._mres is None:
self._build()
return self._mres

def match(self, stream, pos):
for mre, type_from_index in self.mres:
@@ -347,12 +380,16 @@ class TraditionalLexer(Lexer):

class ContextualLexer(Lexer):

def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
def __init__(self, conf, states, always_accept=()):
terminals = list(conf.tokens)
tokens_by_name = {}
for t in terminals:
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t

trad_conf = copy(conf)
trad_conf.tokens = terminals

lexer_by_tokens = {}
self.lexers = {}
for state, accepts in states.items():
@@ -360,14 +397,17 @@ class ContextualLexer(Lexer):
try:
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) | set(ignore) | set(always_accept)
accepts = set(accepts) | set(conf.ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
lexer_conf = copy(trad_conf)
lexer_conf.tokens = state_tokens
lexer = TraditionalLexer(lexer_conf)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
assert trad_conf.tokens is terminals
self.root_lexer = TraditionalLexer(trad_conf)

def lex(self, stream, get_parser_state):
parser_state = get_parser_state()


+ 76
- 36
lark/load_grammar.py View File

@@ -5,7 +5,7 @@ import sys
from copy import copy, deepcopy
from io import open

from .utils import bfs, eval_escaping
from .utils import bfs, eval_escaping, Py36, logger, classify_bool
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -85,7 +85,7 @@ TERMINALS = {
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*',
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'\s*//[^\n]*',
@@ -307,6 +307,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
self.term_set = {td.name for td in self.terminals}
self.term_reverse = {td.pattern: td for td in terminals}
self.i = 0
self.rule_options = None


@inline_args
@@ -335,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
term_name = None

elif isinstance(p, PatternRE):
if p in self.term_reverse: # Kind of a wierd placement.name
if p in self.term_reverse: # Kind of a weird placement.name
term_name = self.term_reverse[p].name
else:
assert False, p
@@ -351,7 +352,10 @@ class PrepareAnonTerminals(Transformer_InPlace):
self.term_reverse[p] = termdef
self.terminals.append(termdef)

return Terminal(term_name, filter_out=isinstance(p, PatternStr))
filter_out = False if self.rule_options and self.rule_options.keep_all_tokens else isinstance(p, PatternStr)

return Terminal(term_name, filter_out=filter_out)


class _ReplaceSymbols(Transformer_InPlace):
" Helper for ApplyTemplates "
@@ -405,6 +409,13 @@ def _literal_to_pattern(literal):
flags = v[flag_start:]
assert all(f in _RE_FLAGS for f in flags), flags

if literal.type == 'STRING' and '\n' in v:
raise GrammarError('You cannot put newlines in string literals')

if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
raise GrammarError('You can only use newlines in regular expressions '
'with the `x` (verbose) flag')

v = v[:flag_start]
assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1]
@@ -413,9 +424,11 @@ def _literal_to_pattern(literal):

if literal.type == 'STRING':
s = s.replace('\\\\', '\\')

return { 'STRING': PatternStr,
'REGEXP': PatternRE }[literal.type](s, flags)
return PatternStr(s, flags)
elif literal.type == 'REGEXP':
return PatternRE(s, flags)
else:
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'


@inline_args
@@ -432,6 +445,20 @@ class PrepareLiterals(Transformer_InPlace):
return ST('pattern', [PatternRE(regexp)])


def _make_joined_pattern(regexp, flags_set):
# In Python 3.6, a new syntax for flags was introduced, that allows us to restrict the scope
# of flags to a specific regexp group. We are already using it in `lexer.Pattern._get_flags`
# However, for prior Python versions, we still need to use global flags, so we have to make sure
# that there are no flag collisions when we merge several terminals.
flags = ()
if not Py36:
if len(flags_set) > 1:
raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!")
elif len(flags_set) == 1:
flags ,= flags_set

return PatternRE(regexp, flags)
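
For context, the Python 3.6 scoped-flag syntax mentioned above limits a flag to a single group, which is what makes it safe to join terminals carrying different flags; a standalone sketch with the stdlib ``re``:

    import re

    # Scoped flag (Python 3.6+): case-insensitivity applies only inside the group.
    scoped = re.compile(r'(?i:hello) world')
    assert scoped.match('HELLO world')
    assert scoped.match('HELLO WORLD') is None

    # Global flag: applies to the whole pattern, which is why terminals with
    # conflicting flags cannot be merged on older Pythons.
    global_flag = re.compile(r'hello world', re.IGNORECASE)
    assert global_flag.match('HELLO WORLD')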

class TerminalTreeToPattern(Transformer):
def pattern(self, ps):
p ,= ps
@@ -441,16 +468,16 @@ class TerminalTreeToPattern(Transformer):
assert items
if len(items) == 1:
return items[0]
if len({i.flags for i in items}) > 1:
raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())
pattern = ''.join(i.to_regexp() for i in items)
return _make_joined_pattern(pattern, {i.flags for i in items})

def expansions(self, exps):
if len(exps) == 1:
return exps[0]
if len({i.flags for i in exps}) > 1:
raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)
pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps))
return _make_joined_pattern(pattern, {i.flags for i in exps})

def expr(self, args):
inner, op = args[:2]
@@ -527,7 +554,8 @@ class Grammar:
# =================

# 1. Pre-process terminals
transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals
anon_tokens_transf = PrepareAnonTerminals(terminals)
transformer = PrepareLiterals() * PrepareSymbols() * anon_tokens_transf # Adds to terminals

# 2. Inline Templates

@@ -542,8 +570,10 @@ class Grammar:
i += 1
if len(params) != 0: # Dont transform templates
continue
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None
rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
ebnf_to_bnf.rule_options = rule_options
ebnf_to_bnf.prefix = name
anon_tokens_transf.rule_options = rule_options
tree = transformer.transform(rule_tree)
res = ebnf_to_bnf.transform(tree)
rules.append((name, res, options))
@@ -601,7 +631,9 @@ class Grammar:
if isinstance(s, NonTerminal)
and s != r.origin}
used_rules |= {NonTerminal(s) for s in start}
compiled_rules = [r for r in compiled_rules if r.origin in used_rules]
compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules)
for r in unused:
logger.debug("Unused rule: %s", r)
if len(compiled_rules) == c:
break

@@ -609,14 +641,16 @@ class Grammar:
used_terms = {t.name for r in compiled_rules
for t in r.expansion
if isinstance(t, Terminal)}
terminals = [t for t in terminals if t.name in used_terms or t.name in self.ignore]
terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore)
if unused:
logger.debug("Unused terminals: %s", [t.name for t in unused])

return terminals, compiled_rules, self.ignore



_imported_grammars = {}
def import_grammar(grammar_path, base_paths=[]):
def import_grammar(grammar_path, re_, base_paths=[]):
if grammar_path not in _imported_grammars:
import_paths = base_paths + IMPORT_PATHS
for import_path in import_paths:
@@ -624,7 +658,7 @@ def import_grammar(grammar_path, base_paths=[]):
joined_path = os.path.join(import_path, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
grammar = load_grammar(text, joined_path)
grammar = load_grammar(text, joined_path, re_)
_imported_grammars[grammar_path] = grammar
break
else:
@@ -755,18 +789,33 @@ def _find_used_symbols(tree):
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

class GrammarLoader:
def __init__(self):
ERRORS = [
('Unclosed parenthesis', ['a: (\n']),
('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
('Illegal name for rules or terminals', ['Aa:\n']),
('Alias expects lowercase name', ['a: -> "a"\n']),
('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']),
('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']),
('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']),
('Terminal names cannot contain dots', ['A.B\n']),
('%import expects a name', ['%import "a"\n']),
('%ignore expects a value', ['%ignore %import\n']),
]

def __init__(self, re_module):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

self.canonize_tree = CanonizeTree()
self.re_module = re_module

def load_grammar(self, grammar_text, grammar_name='<?>'):
"Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
@@ -779,19 +828,9 @@ class GrammarLoader:
(e.line, e.column, grammar_name, context))
except UnexpectedToken as e:
context = e.get_context(grammar_text)
error = e.match_examples(self.parser.parse, {
'Unclosed parenthesis': ['a: (\n'],
'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
'Alias expects lowercase name': ['a: -> "a"\n'],
'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'],
'%import expects a name': ['%import "a"\n'],
'%ignore expects a value': ['%ignore %import\n'],
})
error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True)
if error:
raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise
@@ -819,7 +858,7 @@ class GrammarLoader:
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node, = stmt.children
path_node ,= stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import
@@ -862,7 +901,7 @@ class GrammarLoader:
# import grammars
for dotted_path, (base_paths, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = import_grammar(grammar_path, base_paths=base_paths)
g = import_grammar(grammar_path, self.re_module, base_paths=base_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
@@ -942,4 +981,5 @@ class GrammarLoader:



load_grammar = GrammarLoader().load_grammar
def load_grammar(grammar, source, re_):
return GrammarLoader(re_).load_grammar(grammar, source)

+ 81
- 0
lark/parse_tree_builder.py View File

@@ -195,6 +195,86 @@ def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens):
if to_expand:
return partial(AmbiguousExpander, to_expand, tree_class)

class AmbiguousIntermediateExpander:
"""
Propagate ambiguous intermediate nodes and their derivations up to the
current rule.

In general, converts

rule
_iambig
_inter
someChildren1
...
_inter
someChildren2
...
someChildren3
...

to

_ambig
rule
someChildren1
...
someChildren3
...
rule
someChildren2
...
someChildren3
...
rule
childrenFromNestedIambigs
...
someChildren3
...
...

propagating up any nested '_iambig' nodes along the way.
"""

def __init__(self, tree_class, node_builder):
self.node_builder = node_builder
self.tree_class = tree_class

def __call__(self, children):
def _is_iambig_tree(child):
return hasattr(child, 'data') and child.data == '_iambig'

def _collapse_iambig(children):
"""
Recursively flatten the derivations of the parent of an '_iambig'
node. Returns a list of '_inter' nodes guaranteed not
to contain any nested '_iambig' nodes, or None if children does
not contain an '_iambig' node.
"""

# Due to the structure of the SPPF,
# an '_iambig' node can only appear as the first child
if children and _is_iambig_tree(children[0]):
iambig_node = children[0]
result = []
for grandchild in iambig_node.children:
collapsed = _collapse_iambig(grandchild.children)
if collapsed:
for child in collapsed:
child.children += children[1:]
result += collapsed
else:
new_tree = self.tree_class('_inter', grandchild.children + children[1:])
result.append(new_tree)
return result

collapsed = _collapse_iambig(children)
if collapsed:
processed_nodes = [self.node_builder(c.children) for c in collapsed]
return self.tree_class('_ambig', processed_nodes)

return self.node_builder(children)

def ptb_inline_args(func):
@wraps(func)
def f(children):
@@ -239,6 +319,7 @@ class ParseTreeBuilder:
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None),
self.propagate_positions and PropagatePositions,
self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
self.ambiguous and partial(AmbiguousIntermediateExpander, self.tree_class)
]))

yield rule, wrapper_chain


+ 32
- 15
lark/parser_frontends.py View File

@@ -1,9 +1,6 @@
import re
from functools import partial

from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
@@ -21,7 +18,14 @@ def get_frontend(parser, lexer):
elif lexer == 'contextual':
return LALR_ContextualLexer
elif issubclass(lexer, Lexer):
return partial(LALR_CustomLexer, lexer)
class LALR_CustomLexerWrapper(LALR_CustomLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
super(LALR_CustomLexerWrapper, self).__init__(
lexer, lexer_conf, parser_conf, options=options)
def init_lexer(self):
self.lexer = lexer(self.lexer_conf)

return LALR_CustomLexerWrapper
else:
raise ValueError('Unknown lexer: %s' % lexer)
elif parser=='earley':
@@ -54,6 +58,15 @@ class _ParserFrontend(Serialize):
return self.parser.parse(input, start, *args)


def _get_lexer_callbacks(transformer, terminals):
result = {}
for terminal in terminals:
callback = getattr(transformer, terminal.name, None)
if callback is not None:
result[terminal.name] = callback
return result
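
A toy sketch of what ``_get_lexer_callbacks`` (defined just above) collects; the transformer and terminal names are made up, and a namedtuple stands in for ``TerminalDef`` since only ``.name`` is looked at:

    from collections import namedtuple

    Term = namedtuple('Term', 'name')   # stand-in for TerminalDef in this sketch

    class MyTransformer:
        def NUMBER(self, token):
            # Would be called on every NUMBER token during lexing, when this
            # transformer is passed to Lark together with parser='lalr'.
            return token

    terminals = [Term('NUMBER'), Term('NAME')]
    callbacks = _get_lexer_callbacks(MyTransformer(), terminals)
    assert list(callbacks) == ['NUMBER']   # only terminals with a matching method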


class WithLexer(_ParserFrontend):
lexer = None
parser = None
@@ -69,11 +82,18 @@ class WithLexer(_ParserFrontend):
self.postlex = lexer_conf.postlex

@classmethod
def deserialize(cls, data, memo, callbacks, postlex):
def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module):
inst = super(WithLexer, cls).deserialize(data, memo)

inst.postlex = postlex
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)

terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals)
inst.lexer_conf.re_module = re_module
inst.lexer_conf.skip_validation=True
inst.init_lexer()

return inst

def _serialize(self, data, memo):
@@ -88,7 +108,7 @@ class WithLexer(_ParserFrontend):
return self._parse(token_stream, start)

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
self.lexer = TraditionalLexer(self.lexer_conf)

class LALR_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
@@ -98,7 +118,7 @@ class LALR_WithLexer(WithLexer):

self.init_lexer()

def init_lexer(self):
def init_lexer(self, **kw):
raise NotImplementedError()

class LALR_TraditionalLexer(LALR_WithLexer):
@@ -109,11 +129,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
def init_lexer(self):
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
ignore=self.lexer_conf.ignore,
always_accept=always_accept,
user_callbacks=self.lexer_conf.callbacks,
g_regex_flags=self.lexer_conf.g_regex_flags)
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)


def parse(self, text, start=None):
@@ -187,8 +203,10 @@ class XEarley(_ParserFrontend):
else:
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
if lexer_conf.use_bytes:
regexp = regexp.encode('utf-8')

self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(text, start)
@@ -225,4 +243,3 @@ class CYK(WithLexer):

def _apply_callback(self, tree):
return self.callbacks[tree.rule](tree.children)


+ 4
- 4
lark/parsers/earley.py View File

@@ -10,15 +10,15 @@ is better documented here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""

import logging
from collections import deque

from ..visitors import Transformer_InPlace, v_args
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, CompleteForestToAmbiguousTreeVisitor

class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False):
@@ -301,7 +301,7 @@ class Parser:
try:
debug_walker = ForestToPyDotVisitor()
except ImportError:
logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
else:
debug_walker.visit(solutions[0], "sppf.png")

@@ -313,7 +313,7 @@ class Parser:
assert False, 'Earley should not generate multiple start symbol items!'

# Perform our SPPF -> AST conversion using the right ForestVisitor.
forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else ForestToAmbiguousTreeVisitor
forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else CompleteForestToAmbiguousTreeVisitor
forest_tree_visitor = forest_tree_visitor_cls(self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor())

return forest_tree_visitor.visit(solutions[0])


+ 81
- 4
lark/parsers/earley_forest.py View File

@@ -13,6 +13,7 @@ from collections import deque
from operator import attrgetter
from importlib import import_module

from ..utils import logger
from ..tree import Tree
from ..exceptions import ParseError

@@ -328,10 +329,17 @@ class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor):
self.output_stack[-1].children.append(node)

def visit_symbol_node_in(self, node):
if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority):
self.forest_sum_visitor.visit(node)
if not node.is_intermediate and node.is_ambiguous:
self.output_stack.append(Tree('_ambig', []))
if node.is_ambiguous:
if self.forest_sum_visitor and isinf(node.priority):
self.forest_sum_visitor.visit(node)
if node.is_intermediate:
# TODO Support ambiguous intermediate nodes!
logger.warning("Ambiguous intermediate node in the SPPF: %s. "
"Lark does not currently process these ambiguities; resolving with the first derivation.", node)
return next(iter(node.children))
else:
self.output_stack.append(Tree('_ambig', []))

return iter(node.children)

def visit_symbol_node_out(self, node):
@@ -355,6 +363,75 @@ class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor):
else:
self.result = result

class CompleteForestToAmbiguousTreeVisitor(ForestToTreeVisitor):
"""
An augmented version of ForestToAmbiguousTreeVisitor that is designed to
handle ambiguous intermediate nodes as well as ambiguous symbol nodes.

On the way down:

- When an ambiguous intermediate node is encountered, an '_iambig' node
is inserted into the tree.
- Each possible derivation of an ambiguous intermediate node is represented
by an '_inter' node added as a child of the corresponding '_iambig' node.

On the way up, these nodes are propagated up the tree and collapsed
into a single '_ambig' node for the nearest symbol node ancestor.
This is achieved by the AmbiguousIntermediateExpander contained in
the callbacks.
"""

def _collapse_ambig(self, children):
new_children = []
for child in children:
if child.data == '_ambig':
new_children += child.children
else:
new_children.append(child)
return new_children

def visit_token_node(self, node):
self.output_stack[-1].children.append(node)

def visit_symbol_node_in(self, node):
if node.is_ambiguous:
if self.forest_sum_visitor and isinf(node.priority):
self.forest_sum_visitor.visit(node)
if node.is_intermediate:
self.output_stack.append(Tree('_iambig', []))
else:
self.output_stack.append(Tree('_ambig', []))
return iter(node.children)

def visit_symbol_node_out(self, node):
if node.is_ambiguous:
result = self.output_stack.pop()
if not node.is_intermediate:
result = Tree('_ambig', self._collapse_ambig(result.children))
if self.output_stack:
self.output_stack[-1].children.append(result)
else:
self.result = result

def visit_packed_node_in(self, node):
if not node.parent.is_intermediate:
self.output_stack.append(Tree('drv', []))
elif node.parent.is_ambiguous:
self.output_stack.append(Tree('_inter', []))
return iter([node.left, node.right])

def visit_packed_node_out(self, node):
if not node.parent.is_intermediate:
result = self.callbacks[node.rule](self.output_stack.pop().children)
elif node.parent.is_ambiguous:
result = self.output_stack.pop()
else:
return
if self.output_stack:
self.output_stack[-1].children.append(result)
else:
self.result = result

class ForestToPyDotVisitor(ForestVisitor):
"""
A Forest visitor which writes the SPPF to a PNG.


+ 1
- 1
lark/parsers/grammar_analysis.py View File

@@ -138,7 +138,7 @@ class GrammarAnalyzer(object):
for r in rules:
for sym in r.expansion:
if not (sym.is_term or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation
raise GrammarError("Using an undefined rule: %s" % sym)

self.start_states = {start: self.expand_rule(root_rule.origin)
for start, root_rule in root_rules.items()}


+ 4
- 5
lark/parsers/lalr_analysis.py View File

@@ -6,10 +6,9 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

import logging
from collections import defaultdict, deque
from collections import defaultdict

from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
@@ -256,8 +255,8 @@ class LALR_Analyzer(GrammarAnalyzer):
raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ])))
if la in actions:
if self.debug:
logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logging.warning(' * %s', list(rules)[0])
logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logger.warning(' * %s', list(rules)[0])
else:
actions[la] = (Reduce, list(rules)[0])
m[state] = { k.name: v for k, v in actions.items() }


+ 16
- 13
lark/parsers/lalr_parser.py View File

@@ -7,9 +7,10 @@ from ..lexer import Token
from ..utils import Enumerator, Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet

###{standalone

class LALR_Parser(object):
def __init__(self, parser_conf, debug=False):
assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -37,22 +38,19 @@ class LALR_Parser(object):

class _Parser:
def __init__(self, parse_table, callbacks, debug=False):
self.states = parse_table.states
self.start_states = parse_table.start_states
self.end_states = parse_table.end_states
self.parse_table = parse_table
self.callbacks = callbacks
self.debug = debug

def parse(self, seq, start, set_state=None):
def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
token = None
stream = iter(seq)
states = self.states
states = self.parse_table.states
start_state = self.parse_table.start_states[start]
end_state = self.parse_table.end_states[start]

start_state = self.start_states[start]
end_state = self.end_states[start]

state_stack = [start_state]
value_stack = []
state_stack = state_stack or [start_state]
value_stack = value_stack or []

if set_state: set_state(start_state)

@@ -61,8 +59,12 @@ class _Parser:
try:
return states[state][token.type]
except KeyError:
expected = [s for s in states[state].keys() if s.isupper()]
raise UnexpectedToken(token, expected, state=state)
expected = {s for s in states[state].keys() if s.isupper()}
try:
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
except NameError: # For standalone parser
puppet = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet)

def reduce(rule):
size = len(rule.expansion)
@@ -114,3 +116,4 @@ class _Parser:
return value_stack[-1]

###}


+ 128
- 0
lark/parsers/lalr_puppet.py View File

@@ -0,0 +1,128 @@
# This module provides a LALR puppet, which is used for debugging and error handling

from copy import deepcopy

from .lalr_analysis import Shift, Reduce
from .. import Token


class ParserPuppet(object):
"""ParserPuppet gives you advanced control over error handling when parsing with LALR.

For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
"""
def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
self.parser = parser
self._state_stack = state_stack
self._value_stack = value_stack
self._start = start
self._stream = stream
self._set_state = set_state

self.result = None

def feed_token(self, token):
"""Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer.

Note that ``token`` has to be an instance of ``Token``.
"""
end_state = self.parser.parse_table.end_states[self._start]
state_stack = self._state_stack
value_stack = self._value_stack

state = state_stack[-1]
action, arg = self.parser.parse_table.states[state][token.type]
assert arg != end_state

while action is Reduce:
rule = arg
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []

value = self.parser.callbacks[rule](s)

_action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)

if state_stack[-1] == end_state:
self.result = value_stack[-1]
return self.result

state = state_stack[-1]
action, arg = self.parser.parse_table.states[state][token.type]
assert arg != end_state

assert action is Shift
state_stack.append(arg)
value_stack.append(token)

def copy(self):
"""Create a new puppet with a separate state.

Calls to feed_token() won't affect the old puppet, and vice-versa.
"""
return type(self)(
self.parser,
list(self._state_stack),
deepcopy(self._value_stack),
self._start,
self._stream,
self._set_state,
)

def __eq__(self, other):
if not isinstance(other, ParserPuppet):
return False

return (
self._state_stack == other._state_stack and
self._value_stack == other._value_stack and
self._stream == other._stream and
self._start == other._start
)

def __hash__(self):
return hash((tuple(self._state_stack), self._start))

def pretty(self):
"""Print the output of ``choices()`` in a way that's easier to read."""
out = ["Puppet choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('stack size: %s' % len(self._state_stack))
return '\n'.join(out)

def choices(self):
"""Returns a dictionary of token types, matched to their action in the parser.

Only returns token types that are accepted by the current state.

Updated by ``feed_token()``.
"""
return self.parser.parse_table.states[self._state_stack[-1]]

def accepts(self):
accepts = set()
for t in self.choices():
new_puppet = self.copy()
try:
new_puppet.feed_token(Token(t, ''))
except KeyError:
pass
else:
accepts.add(t)
return accepts

def resume_parse(self):
"""Resume parsing from the current puppet state."""
return self.parser.parse(
self._stream, self._start, self._set_state,
self._value_stack, self._state_stack
)
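
A hedged sketch of driving the puppet by hand after a failed parse; the grammar is made up, and ``RPAR`` is the name Lark typically assigns to an anonymous ``")"`` terminal:

    from lark import Lark, Token, UnexpectedToken

    parser = Lark('start: "(" ")"', parser='lalr')

    try:
        parser.parse('(')                       # missing the closing paren
    except UnexpectedToken as e:
        puppet = e.puppet
        print(puppet.pretty())                  # inspect the available choices
        if 'RPAR' in puppet.accepts():
            puppet.feed_token(Token('RPAR', ')'))
            tree = puppet.resume_parse()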

+ 33
- 93
lark/reconstruct.py View File

@@ -1,16 +1,13 @@
from collections import defaultdict
"""Reconstruct text from a tree, based on Lark grammar"""

import unicodedata

from .tree import Tree
from .visitors import Transformer_InPlace
from .common import ParserConf
from .lexer import Token, PatternStr
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal


from .grammar import Terminal, NonTerminal

def is_discarded_terminal(t):
return t.is_term and t.filter_out
from .tree_matcher import TreeMatcher, is_discarded_terminal

def is_iter_empty(i):
try:
@@ -59,105 +56,48 @@ class WriteTokensTransformer(Transformer_InPlace):
return to_write


class MatchTree(Tree):
pass

class MakeMatchTree:
def __init__(self, name, expansion):
self.name = name
self.expansion = expansion

def __call__(self, args):
t = MatchTree(self.name, args)
t.meta.match_tree = True
t.meta.orig_expansion = self.expansion
return t

def best_from_group(seq, group_key, cmp_key):
d = {}
for item in seq:
key = group_key(item)
if key in d:
v1 = cmp_key(item)
v2 = cmp_key(d[key])
if v2 > v1:
d[key] = item
else:
d[key] = item
return list(d.values())

class Reconstructor:
def __init__(self, parser, term_subs={}):
# XXX TODO calling compile twice returns different results!
assert parser.options.maybe_placeholders == False
tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)

self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs)
self.rules = list(self._build_recons_rules(rules))
self.rules.reverse()

# Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))

self.rules.sort(key=lambda r: len(r.expansion))
callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias?
self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start),
self._match, resolve_ambiguity=True)

def _build_recons_rules(self, rules):
expand1s = {r.origin for r in rules if r.options.expand1}

aliases = defaultdict(list)
for r in rules:
if r.alias:
aliases[r.origin].append( r.alias )

rule_names = {r.origin for r in rules}
nonterminals = {sym for sym in rule_names
if sym.name.startswith('_') or sym in expand1s or sym in aliases }

for r in rules:
recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
for sym in r.expansion if not is_discarded_terminal(sym)]

# Skip self-recursive constructs
if recons_exp == [r.origin]:
continue

sym = NonTerminal(r.alias) if r.alias else r.origin

yield Rule(sym, recons_exp, alias=MakeMatchTree(sym.name, r.expansion))

for origin, rule_aliases in aliases.items():
for alias in rule_aliases:
yield Rule(origin, [Terminal(alias)], alias=MakeMatchTree(origin.name, [NonTerminal(alias)]))
yield Rule(origin, [Terminal(origin.name)], alias=MakeMatchTree(origin.name, [origin]))

def _match(self, term, token):
if isinstance(token, Tree):
return Terminal(token.data) == term
elif isinstance(token, Token):
return term == Terminal(token.type)
assert False
def _isalnum(x):
# Categories defined here: https://www.python.org/dev/peps/pep-3131/
return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']

class Reconstructor(TreeMatcher):
"""
A Reconstructor that will, given a full parse Tree, generate source code.

Note:
The reconstructor cannot generate values for terminals defined by regexps. If you need to produce
discarded regexp terminals, such as newlines, use `term_subs` to provide default values for them.

Parameters:
parser: a Lark instance
term_subs: a dictionary of [Terminal name as str] to [output text as str]
"""

def __init__(self, parser, term_subs=None):
TreeMatcher.__init__(self, parser)

self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {})

def _reconstruct(self, tree):
# TODO: ambiguity?
unreduced_tree = self.parser.parse(tree.children, tree.data) # find a full derivation
assert unreduced_tree.data == tree.data
unreduced_tree = self.match_tree(tree, tree.data)

res = self.write_tokens.transform(unreduced_tree)
for item in res:
if isinstance(item, Tree):
# TODO use orig_expansion.rulename to support templates
for x in self._reconstruct(item):
yield x
else:
yield item

def reconstruct(self, tree):
def reconstruct(self, tree, postproc=None):
x = self._reconstruct(tree)
if postproc:
x = postproc(x)
y = []
prev_item = ''
for item in x:
if prev_item and item and prev_item[-1].isalnum() and item[0].isalnum():
if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
y.append(' ')
y.append(item)
prev_item = item


+ 19
- 13
lark/tools/nearley.py View File

@@ -1,8 +1,9 @@
"Converts between Lark and Nearley grammars. Work in progress!"
"Converts Nearley grammars to Lark"

import os.path
import sys
import codecs
import argparse


from lark import Lark, InlineTransformer
@@ -137,7 +138,7 @@ def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
return rule_defs


def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False):
import js2py

emit_code = []
@@ -160,7 +161,10 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
for alias, code in n2l.alias_js_code.items():
js_code.append('%s = (%s);' % (alias, code))

emit(js2py.translate_js('\n'.join(js_code)))
if es6:
emit(js2py.translate_js6('\n'.join(js_code)))
else:
emit(js2py.translate_js('\n'.join(js_code)))
emit('class TransformNearley(Transformer):')
for alias in n2l.alias_js_code:
emit(" %s = var.get('%s').to_python()" % (alias, alias))
@@ -173,18 +177,20 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):

return ''.join(emit_code)

def main(fn, start, nearley_lib):
def main(fn, start, nearley_lib, es6=False):
with codecs.open(fn, encoding='utf8') as f:
grammar = f.read()
return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)))
return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6)

def get_arg_parser():
parser = argparse.ArgumentParser('Reads a Nearley grammar (with js functions) and outputs an equivalent Lark parser.')
parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar')
parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule')
parser.add_argument('nearley_lib', help='Path to root directory of nearley codebase (used for including builtins)')
parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true')
return parser

if __name__ == '__main__':
if len(sys.argv) < 4:
print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.")
print("Usage: %s <nearley_grammar_path> <start_rule> <nearley_lib_path>" % sys.argv[0])
sys.exit(1)

fn, start, nearley_lib = sys.argv[1:]

print(main(fn, start, nearley_lib))
parser = get_arg_parser()
args = parser.parse_args()
print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6))
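
Besides the command line, the converter can be driven from Python; a hedged sketch (the file names are placeholders, and ``js2py`` plus a checkout of the nearley repository are required):

    from lark.tools.nearley import main

    python_code = main(fn='csv.ne', start='main',
                       nearley_lib='/path/to/nearley', es6=False)
    with open('csv_parser.py', 'w') as out:
        out.write(python_code)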

+ 40
- 2
lark/tools/standalone.py View File

@@ -30,10 +30,12 @@ from io import open

import codecs
import sys
import token, tokenize
import os
from pprint import pprint
from os import path
from collections import defaultdict
from functools import partial

import lark
from lark import Lark
@@ -84,14 +86,50 @@ def extract_sections(lines):
return {name:''.join(text) for name, text in sections.items()}


def strip_docstrings(line_gen):
""" Strip comments and docstrings from a file.
Based on code from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings
"""
res = []

prev_toktype = token.INDENT
last_lineno = -1
last_col = 0

tokgen = tokenize.generate_tokens(line_gen)
for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
if slineno > last_lineno:
last_col = 0
if scol > last_col:
res.append(" " * (scol - last_col))
if toktype == token.STRING and prev_toktype == token.INDENT:
# Docstring
res.append("#--")
elif toktype == tokenize.COMMENT:
# Comment
res.append("##\n")
else:
res.append(ttext)
prev_toktype = toktype
last_col = ecol
last_lineno = elineno

return ''.join(res)


def main(fobj, start):
lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start)

print('# The file was automatically generated by Lark v%s' % lark.__version__)
print('__version__ = "%s"' % lark.__version__)
print()

for pyfile in EXTRACT_STANDALONE_FILES:
for i, pyfile in enumerate(EXTRACT_STANDALONE_FILES):
with open(os.path.join(_larkdir, pyfile)) as f:
print (extract_sections(f)['standalone'])
code = extract_sections(f)['standalone']
if i: # if not this file
code = strip_docstrings(partial(next, iter(code.splitlines(True))))
print(code)

data, m = lark_inst.memo_serialize([TerminalDef, Rule])
print( 'DATA = (' )


+ 40
- 5
lark/tree.py View File

@@ -4,15 +4,29 @@ except ImportError:
pass

from copy import deepcopy
from collections import OrderedDict


###{standalone
from collections import OrderedDict


class Meta:
def __init__(self):
self.empty = True


class Tree(object):
"""The main tree class.

Creates a new tree, and stores "data" and "children" in attributes of the same name.
Trees can be hashed and compared.

Parameters:
data: The name of the rule or alias
children: List of matched sub-rules and terminals
meta: Line & Column numbers (if ``propagate_positions`` is enabled).
meta attributes: line, column, start_pos, end_line, end_column, end_pos
"""
def __init__(self, data, children, meta=None):
self.data = data
self.children = children
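A quick sketch of constructing and inspecting trees by hand (the rule names and tokens are made up):

    from lark import Tree, Token

    t = Tree('add', [Tree('number', [Token('INT', '1')]),
                     Tree('number', [Token('INT', '2')])])
    print(t.pretty())                      # indented view, handy for debugging
    numbers = list(t.find_data('number'))  # all subtrees whose .data == 'number'
    assert len(numbers) == 2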
@@ -44,6 +58,10 @@ class Tree(object):
return l

def pretty(self, indent_str=' '):
"""Returns an indented string representation of the tree.

Great for debugging.
"""
return ''.join(self._pretty(0, indent_str))

def __eq__(self, other):
@@ -59,6 +77,10 @@ class Tree(object):
return hash((self.data, tuple(self.children)))

def iter_subtrees(self):
"""Depth-first iteration.

Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG).
"""
queue = [self]
subtrees = OrderedDict()
for subtree in queue:
@@ -70,11 +92,11 @@ class Tree(object):
return reversed(list(subtrees.values()))

def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
"""Returns all nodes of the tree that evaluate pred(node) as true."""
return filter(pred, self.iter_subtrees())

def find_data(self, data):
"Find all nodes where tree.data == data"
"""Returns all nodes of the tree whose data equals the given data."""
return self.find_pred(lambda t: t.data == data)

###}
@@ -95,6 +117,10 @@ class Tree(object):
yield c

def iter_subtrees_topdown(self):
"""Breadth-first iteration.

Iterates over all the subtrees, returning nodes in the same order as ``pretty()`` does.
"""
stack = [self]
while stack:
node = stack.pop()
@@ -105,7 +131,7 @@ class Tree(object):
stack.append(n)

def __deepcopy__(self, memo):
return type(self)(self.data, deepcopy(self.children, memo))
return type(self)(self.data, deepcopy(self.children, memo), meta=self._meta)

def copy(self):
return type(self)(self.data, self.children)
@@ -134,6 +160,15 @@ class SlottedTree(Tree):


def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs):
graph = pydot__tree_to_graph(tree, rankdir, **kwargs)
graph.write_png(filename)


def pydot__tree_to_dot(tree, filename, rankdir="LR", **kwargs):
graph = pydot__tree_to_graph(tree, rankdir, **kwargs)
graph.write(filename)

def pydot__tree_to_graph(tree, rankdir="LR", **kwargs):
"""Creates a colorful image that represents the tree (data+children, without meta)

Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to
@@ -171,4 +206,4 @@ def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs):
return node

_to_pydot(tree)
graph.write_png(filename)
return graph
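
The pydot export helpers can be used like this; a hedged sketch that assumes the optional ``pydot`` package is installed, with placeholder file names:

    from lark import Lark
    from lark.tree import pydot__tree_to_png, pydot__tree_to_dot

    tree = Lark('start: "a" "b"', keep_all_tokens=True).parse('ab')
    pydot__tree_to_png(tree, 'parse_tree.png')   # render directly to PNG
    pydot__tree_to_dot(tree, 'parse_tree.dot')   # or dump the Graphviz source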

+ 177
- 0
lark/tree_matcher.py View File

@@ -0,0 +1,177 @@
"""Tree matcher based on Lark grammar"""

import re
from collections import defaultdict

from . import Tree, Token
from .common import ParserConf
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal


def is_discarded_terminal(t):
return t.is_term and t.filter_out


class _MakeTreeMatch:
def __init__(self, name, expansion):
self.name = name
self.expansion = expansion

def __call__(self, args):
t = Tree(self.name, args)
t.meta.match_tree = True
t.meta.orig_expansion = self.expansion
return t


def _best_from_group(seq, group_key, cmp_key):
d = {}
for item in seq:
key = group_key(item)
if key in d:
v1 = cmp_key(item)
v2 = cmp_key(d[key])
if v2 > v1:
d[key] = item
else:
d[key] = item
return list(d.values())


def _best_rules_from_group(rules):
rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
rules.sort(key=lambda r: len(r.expansion))
return rules


def _match(term, token):
if isinstance(token, Tree):
name, _args = parse_rulename(term.name)
return token.data == name
elif isinstance(token, Token):
return term == Terminal(token.type)
assert False


def make_recons_rule(origin, expansion, old_expansion):
return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion))


def make_recons_rule_to_term(origin, term):
return make_recons_rule(origin, [Terminal(term.name)], [term])


def parse_rulename(s):
"Parse rule names that may contain a template syntax (like rule{a, b, ...})"
name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups()
args = args_str and [a.strip() for a in args_str.split(',')]
return name, args


class TreeMatcher:
"""Match the elements of a tree node, based on an ontology
provided by a Lark grammar.

Supports templates and inlined rules (`rule{a, b, ...}` and `_rule`).

Initialize with an instance of Lark.
"""

def __init__(self, parser):
# XXX TODO calling compile twice returns different results!
assert parser.options.maybe_placeholders == False
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start)

self.rules_for_root = defaultdict(list)

self.rules = list(self._build_recons_rules(rules))
self.rules.reverse()

# Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
self.rules = _best_rules_from_group(self.rules)

self.parser = parser
self._parser_cache = {}

def _build_recons_rules(self, rules):
"Convert tree-parsing/construction rules to tree-matching rules"
expand1s = {r.origin for r in rules if r.options.expand1}

aliases = defaultdict(list)
for r in rules:
if r.alias:
aliases[r.origin].append(r.alias)

rule_names = {r.origin for r in rules}
nonterminals = {sym for sym in rule_names
if sym.name.startswith('_') or sym in expand1s or sym in aliases}

seen = set()
for r in rules:
recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
for sym in r.expansion if not is_discarded_terminal(sym)]

# Skip self-recursive constructs
if recons_exp == [r.origin] and r.alias is None:
continue

sym = NonTerminal(r.alias) if r.alias else r.origin
rule = make_recons_rule(sym, recons_exp, r.expansion)

if sym in expand1s and len(recons_exp) != 1:
self.rules_for_root[sym.name].append(rule)

if sym.name not in seen:
yield make_recons_rule_to_term(sym, sym)
seen.add(sym.name)
else:
if sym.name.startswith('_') or sym in expand1s:
yield rule
else:
self.rules_for_root[sym.name].append(rule)

for origin, rule_aliases in aliases.items():
for alias in rule_aliases:
yield make_recons_rule_to_term(origin, NonTerminal(alias))
yield make_recons_rule_to_term(origin, origin)

def match_tree(self, tree, rulename):
"""Match the elements of `tree` to the symbols of rule `rulename`.

Parameters:
tree (Tree): the tree node to match
rulename (str): The expected full rule name (including template args)

Returns:
Tree: an unreduced tree that matches `rulename`

Raises:
UnexpectedToken: If no match was found.

Note:
It is the caller's responsibility to match the tree recursively.
"""
if rulename:
# validate
name, _args = parse_rulename(rulename)
assert tree.data == name
else:
rulename = tree.data

# TODO: ambiguity?
try:
parser = self._parser_cache[rulename]
except KeyError:
rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename])

# TODO pass callbacks through dict, instead of alias?
callbacks = {rule: rule.alias for rule in rules}
conf = ParserConf(rules, callbacks, [rulename])
parser = earley.Parser(conf, _match, resolve_ambiguity=True)
self._parser_cache[rulename] = parser

# find a full derivation
unreduced_tree = parser.parse(tree.children, rulename)
assert unreduced_tree.data == rulename
return unreduced_tree
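
A hedged sketch of driving TreeMatcher by hand, roughly the way the reconstructor uses it; the grammar is illustrative:

from lark import Lark
from lark.tree_matcher import TreeMatcher

parser = Lark('start: "a" b\nb: "b"', parser='lalr', maybe_placeholders=False)
tree = parser.parse("ab")

matcher = TreeMatcher(parser)
unreduced = matcher.match_tree(tree, 'start')
# `unreduced` is an unreduced Tree; its meta.orig_expansion records the matched grammar symbols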

+ 81
- 49
lark/utils.py View File

@@ -4,51 +4,15 @@ from functools import reduce
from ast import literal_eval
from collections import deque

class fzset(frozenset):
def __repr__(self):
return '{%s}' % ', '.join(map(repr, self))


def classify_bool(seq, pred):
true_elems = []
false_elems = []

for elem in seq:
if pred(elem):
true_elems.append(elem)
else:
false_elems.append(elem)

return true_elems, false_elems



def bfs(initial, expand):
open_q = deque(list(initial))
visited = set(open_q)
while open_q:
node = open_q.popleft()
yield node
for next_node in expand(node):
if next_node not in visited:
visited.add(next_node)
open_q.append(next_node)


###{standalone
import logging
logger = logging.getLogger("lark")
logger.addHandler(logging.StreamHandler())
# Set to highest level, since we have some warnings amongst the code
# By default, we should not output any log messages
logger.setLevel(logging.CRITICAL)


def _serialize(value, memo):
if isinstance(value, Serialize):
return value.serialize(memo)
elif isinstance(value, list):
return [_serialize(elem, memo) for elem in value]
elif isinstance(value, frozenset):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem, memo) for key, elem in value.items()}
return value

###{standalone
def classify(seq, key=None, value=None):
d = {}
for item in seq:
@@ -165,16 +129,31 @@ def smart_decorator(f, create_decorator):
else:
return create_decorator(f.__func__.__call__, True)

try:
import regex
except ImportError:
regex = None

import sys, re
Py36 = (sys.version_info[:2] >= (3, 6))

import sre_parse
import sre_constants
def get_regexp_width(regexp):
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
def get_regexp_width(expr):
if regex:
# Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
# a simple letter, which makes no difference since we only need the possible lengths of the
# regex match.
regexp_final = re.sub(categ_pattern, 'A', expr)
else:
if re.search(categ_pattern, expr):
raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
regexp_final = expr
try:
return [int(x) for x in sre_parse.parse(regexp).getwidth()]
return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
except sre_constants.error:
raise ValueError(regexp)
raise ValueError(expr)

###}
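
Illustrative calls with plain patterns (no Unicode categories, so the `regex` fallback is not needed):

from lark.utils import get_regexp_width

get_regexp_width(r'ab?')        # -> [1, 2]
get_regexp_width(r'\d{2,4}')    # -> [2, 4]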

@@ -182,7 +161,7 @@ def get_regexp_width(regexp):
def dedup_list(l):
"""Given a list (l) will removing duplicates from the list,
preserving the original order of the list. Assumes that
the list entrie are hashable."""
the list entries are hashable."""
dedup = set()
return [ x for x in l if not (x in dedup or dedup.add(x))]
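
For example:

from lark.utils import dedup_list

dedup_list([3, 1, 3, 2, 1])   # -> [3, 1, 2]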

@@ -287,7 +266,60 @@ def combine_alternatives(lists):
return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init)



class FS:
open = open
exists = os.path.exists
exists = os.path.exists


def isascii(s):
""" str.isascii only exists in python3.7+ """
try:
return s.isascii()
except AttributeError:
try:
s.encode('ascii')
return True
except (UnicodeDecodeError, UnicodeEncodeError):
return False
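
For example:

from lark.utils import isascii

isascii("hello")   # -> True
isascii("héllo")   # -> False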


class fzset(frozenset):
def __repr__(self):
return '{%s}' % ', '.join(map(repr, self))


def classify_bool(seq, pred):
true_elems = []
false_elems = []

for elem in seq:
if pred(elem):
true_elems.append(elem)
else:
false_elems.append(elem)

return true_elems, false_elems


def bfs(initial, expand):
open_q = deque(list(initial))
visited = set(open_q)
while open_q:
node = open_q.popleft()
yield node
for next_node in expand(node):
if next_node not in visited:
visited.add(next_node)
open_q.append(next_node)
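
A small illustration of bfs over a made-up adjacency dict:

from lark.utils import bfs

graph = {'a': ['b', 'c'], 'b': ['d'], 'c': [], 'd': []}
list(bfs(['a'], lambda n: graph[n]))   # -> ['a', 'b', 'c', 'd']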


def _serialize(value, memo):
if isinstance(value, Serialize):
return value.serialize(memo)
elif isinstance(value, list):
return [_serialize(elem, memo) for elem in value]
elif isinstance(value, frozenset):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem, memo) for key, elem in value.items()}
return value

+ 75
- 14
lark/visitors.py View File

@@ -9,11 +9,16 @@ from .lexer import Token
from inspect import getmembers, getmro

class Discard(Exception):
"""When raising the Discard exception in a transformer callback,
that node is discarded and won't appear in the parent.
"""
pass
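
A minimal sketch of discarding nodes; the rule name `comment` is made up:

from lark import Transformer, Discard

class RemoveComments(Transformer):
    def comment(self, children):
        raise Discard   # this 'comment' node will not appear in its parent's children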

# Transformers

class _Decoratable:
"Provides support for decorating methods with @v_args"

@classmethod
def _apply_decorator(cls, decorator, **kwargs):
mro = getmro(cls)
@@ -40,12 +45,31 @@ class _Decoratable:


class Transformer(_Decoratable):
"""Visits the tree recursively, starting with the leaves and finally the root (bottom-up)
"""Transformers visit each node of the tree, and run the appropriate method on it according to the node's data.

Calls its methods (provided by user via inheritance) according to tree.data
Calls its methods (provided by user via inheritance) according to ``tree.data``.
The returned value replaces the old one in the structure.

Can be used to implement map or reduce.
They work bottom-up (or depth-first), starting with the leaves and ending at the root of the tree.
Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root,
at any point the callbacks may assume the children have already been transformed (if applicable).

``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree,
it is slightly less efficient.

All these classes implement the transformer interface:

- ``Transformer`` - Recursively transforms the tree. This is the one you probably want.
- ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances
- ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances

Parameters:
visit_tokens: By default, transformers only visit rules.
visit_tokens=True will tell ``Transformer`` to visit tokens
as well. This is a slightly slower alternative to lexer_callbacks
but it's easier to maintain and works for all algorithms
(even when there isn't a lexer).

"""
__visit_tokens__ = True # For backwards compatibility
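
A hedged usage sketch; the grammar and method names are illustrative:

from lark import Lark, Transformer

class SumTree(Transformer):
    def INT(self, token):        # token callback, visited when visit_tokens is enabled
        return int(token)
    def add(self, children):     # children have already been transformed into ints
        return sum(children)

parser = Lark('start: add\nadd: INT "+" INT\n%import common.INT\n%ignore " "',
              parser='lalr')
SumTree().transform(parser.parse("2 + 3"))   # -> Tree('start', [5])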

@@ -108,11 +132,19 @@ class Transformer(_Decoratable):
return TransformerChain(self, other)

def __default__(self, data, children, meta):
"Default operation on tree (for override)"
"""Default operation on tree (for override)

Function that is called if a method with a corresponding name has not been found.
Defaults to reconstructing the Tree.
"""
return Tree(data, children, meta)

def __default_token__(self, token):
"Default operation on token (for override)"
"""Default operation on token (for override)

Function that is called if a method with a corresponding name has not been found.
Defaults to returning the argument unchanged.
"""
return token


@@ -209,10 +241,10 @@ class VisitorBase:


class Visitor(VisitorBase):
"""Bottom-up visitor, non-recursive
"""Bottom-up visitor, non-recursive.

Visits the tree, starting with the leaves and finally the root (bottom-up)
Calls its methods (provided by user via inheritance) according to tree.data
Calls its methods (provided by user via inheritance) according to ``tree.data``
"""

def visit(self, tree):
@@ -225,11 +257,12 @@ class Visitor(VisitorBase):
self._call_userfunc(subtree)
return tree
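
A short sketch of a side-effect-only Visitor; the names are illustrative:

from lark import Lark, Visitor

class CollectWords(Visitor):
    def __init__(self):
        self.words = []
    def word(self, tree):                        # called once per 'word' node
        self.words.append(str(tree.children[0]))

parser = Lark('start: word+\nword: WORD\n%import common.WORD\n%ignore " "',
              parser='lalr')
v = CollectWords()
v.visit(parser.parse("foo bar"))
# v.words now contains 'foo' and 'bar' (visited bottom-up)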


class Visitor_Recursive(VisitorBase):
"""Bottom-up visitor, recursive
"""Bottom-up visitor, recursive.

Visits the tree, starting with the leaves and finally the root (bottom-up)
Calls its methods (provided by user via inheritance) according to tree.data
Calls its methods (provided by user via inheritance) according to ``tree.data``
"""

def visit(self, tree):
@@ -261,13 +294,15 @@ def visit_children_decor(func):


class Interpreter(_Decoratable):
"""Top-down visitor, recursive
"""Interpreter walks the tree starting at the root.

Visits the tree, starting with the root and finally the leaves (top-down)
Calls its methods (provided by user via inheritance) according to tree.data

Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
The user has to explicitly call visit_children, or use the @visit_children_decor
For each tree node, it calls its methods (provided by user via inheritance) according to ``tree.data``.

Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches.
The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor`` decorator.
This allows the user to implement branching and loops.
"""

def visit(self, tree):
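
A hedged sketch in which the Interpreter controls its own descent; grammar and names are illustrative:

from lark import Lark
from lark.visitors import Interpreter

class CountWords(Interpreter):
    def start(self, tree):
        return sum(self.visit_children(tree))   # descend explicitly
    def word(self, tree):
        return 1

parser = Lark('start: word+\nword: WORD\n%import common.WORD\n%ignore " "',
              parser='lalr')
CountWords().visit(parser.parse("hello brave world"))   # -> 3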
@@ -350,8 +385,34 @@ def _vargs_meta(f, data, children, meta):
def _vargs_tree(f, data, children, meta):
return f(Tree(data, children, meta))


def v_args(inline=False, meta=False, tree=False, wrapper=None):
"A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
"""A convenience decorator factory for modifying the behavior of user-supplied visitor methods.

By default, callback methods of transformers/visitors accept one argument - a list of the node's children.

``v_args`` can modify this behavior. When used on a transformer/visitor class definition,
it applies to all the callback methods inside it.

Parameters:
inline: Children are provided as ``*args`` instead of a list argument (not recommended for very long lists).
meta: Provides two arguments: ``children`` and ``meta`` (instead of just the first)
tree: Provides the entire tree as the argument, instead of the children.

Example:
::

@v_args(inline=True)
class SolveArith(Transformer):
def add(self, left, right):
return left + right


class ReverseNotation(Transformer_InPlace):
@v_args(tree=True)
def tree_node(self, tree):
tree.children = tree.children[::-1]
"""
if tree and (meta or inline):
raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.")



+ 0
- 14
mkdocs.yml View File

@@ -1,14 +0,0 @@
site_name: Lark
theme: readthedocs
pages:
- Main Page: index.md
- Philosophy: philosophy.md
- Features: features.md
- Parsers: parsers.md
- How To Use (Guide): how_to_use.md
- How To Develop (Guide): how_to_develop.md
- Grammar Reference: grammar.md
- Tree Construction Reference: tree_construction.md
- Visitors and Transformers: visitors.md
- Classes Reference: classes.md
- Recipes: recipes.md

+ 7
- 5
readthedocs.yml View File

@@ -1,10 +1,12 @@
version: 2

mkdocs:
configuration: mkdocs.yml
fail_on_warning: false

formats: all

python:
version: 3.5
version: 3.7
install:
- requirements: docs/requirements.txt

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py

+ 10
- 2
setup.py View File

@@ -1,4 +1,7 @@
import re
try:
import regex as re
except ImportError:
import re
from setuptools import find_packages, setup

__version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read())
@@ -6,11 +9,16 @@ __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read(
setup(
name = "lark-parser",
version = __version__,
packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark-stubs'],
packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark.__pyinstaller', 'lark-stubs'],

requires = [],
install_requires = [],

extras_require = {
"regex": ["regex"],
"nearley": ["js2py"]
},

package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},

test_suite = 'tests.__main__',


nearley-requirements.txt → test-requirements.txt View File

@@ -1 +1,2 @@
Js2Py==0.68
regex

+ 6
- 2
tests/__main__.py View File

@@ -2,20 +2,24 @@ from __future__ import absolute_import, print_function

import unittest
import logging
from lark import logger

from .test_trees import TestTrees
from .test_tools import TestStandalone
from .test_cache import TestCache
from .test_grammar import TestGrammar
from .test_reconstructor import TestReconstructor

try:
from .test_nearley.test_nearley import TestNearley
except ImportError:
logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)")
logger.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)")

# from .test_selectors import TestSelectors
# from .test_grammars import TestPythonG, TestConfigG

from .test_logger import Testlogger

from .test_parser import (
TestLalrStandard,
TestEarleyStandard,
@@ -31,7 +35,7 @@ from .test_parser import (
TestParsers,
)

logging.basicConfig(level=logging.INFO)
logger.setLevel(logging.INFO)

if __name__ == '__main__':
unittest.main()

+ 16
- 0
tests/test_cache.py View File

@@ -4,6 +4,7 @@ import sys
from unittest import TestCase, main

from lark import Lark, Tree
from lark.lexer import Lexer, Token
import lark.lark as lark_module

try:
@@ -38,6 +39,15 @@ class MockFS:
return name in self.files


class CustomLexer(Lexer):
def __init__(self, lexer_conf):
pass

def lex(self, data):
for obj in data:
yield Token('A', obj)


class TestCache(TestCase):
def setUp(self):
pass
@@ -70,6 +80,12 @@ class TestCache(TestCase):
parser = Lark(g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])

# Test with custom lexer
mock_fs.files = {}
parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
assert len(mock_fs.files) == 1
assert parser.parse('a') == Tree('start', [])
finally:
lark_module.FS = fs



+ 31
- 0
tests/test_grammar.py View File

@@ -0,0 +1,31 @@
from __future__ import absolute_import

import sys
from unittest import TestCase, main

from lark import Lark
from lark.load_grammar import GrammarLoader, GrammarError


class TestGrammar(TestCase):
def setUp(self):
pass

def test_errors(self):
for msg, examples in GrammarLoader.ERRORS:
for example in examples:
try:
p = Lark(example)
except GrammarError as e:
assert msg in str(e)
else:
assert False, "example did not raise an error"




if __name__ == '__main__':
main()




+ 65
- 0
tests/test_logger.py View File

@@ -0,0 +1,65 @@
import logging
from contextlib import contextmanager
from lark import Lark, logger
from unittest import TestCase, main

try:
from StringIO import StringIO
except ImportError:
from io import StringIO

@contextmanager
def capture_log():
stream = StringIO()
orig_handler = logger.handlers[0]
del logger.handlers[:]
logger.addHandler(logging.StreamHandler(stream))
yield stream
del logger.handlers[:]
logger.addHandler(orig_handler)

class Testlogger(TestCase):

def test_debug(self):
logger.setLevel(logging.DEBUG)
collision_grammar = '''
start: as as
as: a*
a: "a"
'''
with capture_log() as log:
Lark(collision_grammar, parser='lalr', debug=True)

log = log.getvalue()
# since there are conflicts about A,
# symbol A should appear in the log message as a hint
self.assertIn("A", log)

def test_non_debug(self):
logger.setLevel(logging.DEBUG)
collision_grammar = '''
start: as as
as: a*
a: "a"
'''
with capture_log() as log:
Lark(collision_grammar, parser='lalr', debug=False)
log = log.getvalue()
# no log message
self.assertEqual(len(log), 0)

def test_loglevel_higher(self):
logger.setLevel(logging.ERROR)
collision_grammar = '''
start: as as
as: a*
a: "a"
'''
with capture_log() as log:
Lark(collision_grammar, parser='lalr', debug=True)
log = log.getvalue()
# no log message
self.assertEqual(len(log), 0)

if __name__ == '__main__':
main()

+ 4
- 3
tests/test_nearley/test_nearley.py View File

@@ -6,16 +6,17 @@ import logging
import os
import codecs

logging.basicConfig(level=logging.INFO)

from lark import logger
from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main

logger.setLevel(logging.INFO)

TEST_PATH = os.path.abspath(os.path.dirname(__file__))
NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley')
BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin')

if not os.path.exists(NEARLEY_PATH):
logging.warn("Nearley not installed. Skipping Nearley tests!")
logger.warn("Nearley not installed. Skipping Nearley tests!")
raise ImportError("Skipping Nearley tests!")

import js2py # Ensures that js2py exists, to avoid failing tests


+ 392
- 3
tests/test_parser.py View File

@@ -6,7 +6,12 @@ import unittest
import logging
import os
import sys
from copy import deepcopy
from copy import copy, deepcopy

from lark.utils import Py36, isascii

from lark import Token

try:
from cStringIO import StringIO as cStringIO
except ImportError:
@@ -18,8 +23,13 @@ from io import (
open,
)

logging.basicConfig(level=logging.INFO)

try:
import regex
except ImportError:
regex = None

from lark import logger
from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from lark.tree import Tree
@@ -27,6 +37,7 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule
from lark.lexer import TerminalDef, Lexer, TraditionalLexer

logger.setLevel(logging.INFO)


__path__ = os.path.dirname(__file__)
@@ -449,6 +460,221 @@ def _make_full_earley_test(LEXER):
])
self.assertEqual(res, expected)

def test_ambiguous_intermediate_node(self):
grammar = """
start: ab bc d?
!ab: "A" "B"?
!bc: "B"? "C"
!d: "D"
"""

l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
ambig_tree = l.parse("ABCD")
expected = {
Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C']), Tree('d', ['D'])]),
Tree('start', [Tree('ab', ['A', 'B']), Tree('bc', ['C']), Tree('d', ['D'])])
}
self.assertEqual(ambig_tree.data, '_ambig')
self.assertEqual(set(ambig_tree.children), expected)

def test_ambiguous_symbol_and_intermediate_nodes(self):
grammar = """
start: ab bc cd
!ab: "A" "B"?
!bc: "B"? "C"?
!cd: "C"? "D"
"""

l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
ambig_tree = l.parse("ABCD")
expected = {
Tree('start', [
Tree('ab', ['A', 'B']),
Tree('bc', ['C']),
Tree('cd', ['D'])
]),
Tree('start', [
Tree('ab', ['A', 'B']),
Tree('bc', []),
Tree('cd', ['C', 'D'])
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B', 'C']),
Tree('cd', ['D'])
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B']),
Tree('cd', ['C', 'D'])
]),
}
self.assertEqual(ambig_tree.data, '_ambig')
self.assertEqual(set(ambig_tree.children), expected)

def test_nested_ambiguous_intermediate_nodes(self):
grammar = """
start: ab bc cd e?
!ab: "A" "B"?
!bc: "B"? "C"?
!cd: "C"? "D"
!e: "E"
"""

l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
ambig_tree = l.parse("ABCDE")
expected = {
Tree('start', [
Tree('ab', ['A', 'B']),
Tree('bc', ['C']),
Tree('cd', ['D']),
Tree('e', ['E'])
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B', 'C']),
Tree('cd', ['D']),
Tree('e', ['E'])
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B']),
Tree('cd', ['C', 'D']),
Tree('e', ['E'])
]),
Tree('start', [
Tree('ab', ['A', 'B']),
Tree('bc', []),
Tree('cd', ['C', 'D']),
Tree('e', ['E'])
]),
}
self.assertEqual(ambig_tree.data, '_ambig')
self.assertEqual(set(ambig_tree.children), expected)

def test_nested_ambiguous_intermediate_nodes2(self):
grammar = """
start: ab bc cd de f
!ab: "A" "B"?
!bc: "B"? "C"?
!cd: "C"? "D"?
!de: "D"? "E"
!f: "F"
"""

l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
ambig_tree = l.parse("ABCDEF")
expected = {
Tree('start', [
Tree('ab', ['A', 'B']),
Tree('bc', ['C']),
Tree('cd', ['D']),
Tree('de', ['E']),
Tree('f', ['F']),
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B', 'C']),
Tree('cd', ['D']),
Tree('de', ['E']),
Tree('f', ['F']),
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B']),
Tree('cd', ['C', 'D']),
Tree('de', ['E']),
Tree('f', ['F']),
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B']),
Tree('cd', ['C']),
Tree('de', ['D', 'E']),
Tree('f', ['F']),
]),
Tree('start', [
Tree('ab', ['A', "B"]),
Tree('bc', []),
Tree('cd', ['C']),
Tree('de', ['D', 'E']),
Tree('f', ['F']),
]),
Tree('start', [
Tree('ab', ['A']),
Tree('bc', ['B', 'C']),
Tree('cd', []),
Tree('de', ['D', 'E']),
Tree('f', ['F']),
]),
Tree('start', [
Tree('ab', ['A', 'B']),
Tree('bc', []),
Tree('cd', ['C', 'D']),
Tree('de', ['E']),
Tree('f', ['F']),
]),
Tree('start', [
Tree('ab', ['A', 'B']),
Tree('bc', ['C']),
Tree('cd', []),
Tree('de', ['D', 'E']),
Tree('f', ['F']),
]),
}
self.assertEqual(ambig_tree.data, '_ambig')
self.assertEqual(set(ambig_tree.children), expected)

def test_ambiguous_intermediate_node_unnamed_token(self):
grammar = """
start: ab bc "D"
!ab: "A" "B"?
!bc: "B"? "C"
"""

l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
ambig_tree = l.parse("ABCD")
expected = {
Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C'])]),
Tree('start', [Tree('ab', ['A', 'B']), Tree('bc', ['C'])])
}
self.assertEqual(ambig_tree.data, '_ambig')
self.assertEqual(set(ambig_tree.children), expected)

def test_ambiguous_intermediate_node_inlined_rule(self):
grammar = """
start: ab _bc d?
!ab: "A" "B"?
_bc: "B"? "C"
!d: "D"
"""

l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
ambig_tree = l.parse("ABCD")
expected = {
Tree('start', [Tree('ab', ['A']), Tree('d', ['D'])]),
Tree('start', [Tree('ab', ['A', 'B']), Tree('d', ['D'])])
}
self.assertEqual(ambig_tree.data, '_ambig')
self.assertEqual(set(ambig_tree.children), expected)

def test_ambiguous_intermediate_node_conditionally_inlined_rule(self):
grammar = """
start: ab bc d?
!ab: "A" "B"?
!?bc: "B"? "C"
!d: "D"
"""

l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
ambig_tree = l.parse("ABCD")
expected = {
Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C']), Tree('d', ['D'])]),
Tree('start', [Tree('ab', ['A', 'B']), 'C', Tree('d', ['D'])])
}
self.assertEqual(ambig_tree.data, '_ambig')
self.assertEqual(set(ambig_tree.children), expected)

def test_fruitflies_ambig(self):
grammar = """
start: noun verb noun -> simple
@@ -549,16 +775,88 @@ class CustomLexer(Lexer):
so it uses the TraditionalLexer as its implementation, without custom lexing behaviour.
"""
def __init__(self, lexer_conf):
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
self.lexer = TraditionalLexer(copy(lexer_conf))
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)

def _tree_structure_check(a, b):
"""
Checks that both Tree objects have the same structure, without checking their values.
"""
assert a.data == b.data and len(a.children) == len(b.children)
for ca,cb in zip(a.children, b.children):
assert type(ca) == type(cb)
if isinstance(ca, Tree):
_tree_structure_check(ca, cb)
elif isinstance(ca, Token):
assert ca.type == cb.type
else:
assert ca == cb

class DualBytesLark:
"""
A helper class that wraps both a normal parser, and a parser for bytes.
It automatically forwards `.parse` calls to both parsers, returning the value from the text parser.
It always checks that both produce the same output/error.

NOTE: Not currently used, but left here for future debugging.
"""

def __init__(self, g, *args, **kwargs):
self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
g = self.text_lexer.grammar_source.lower()
if '\\u' in g or not isascii(g):
# The bytes regex engine can't deal with unicode escapes
self.bytes_lark = None
else:
# Everything here should work, so use `use_bytes='force'`
self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)

def parse(self, text, start=None):
# TODO: Easy workaround, more complex checks would be beneficial
if not isascii(text) or self.bytes_lark is None:
return self.text_lexer.parse(text, start)
try:
rv = self.text_lexer.parse(text, start)
except Exception as e:
try:
self.bytes_lark.parse(text.encode(), start)
except Exception as be:
assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
raise e
assert False, "Parser without `use_bytes` raises exception, with doesn't"
try:
bv = self.bytes_lark.parse(text.encode(), start)
except Exception as be:
assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
_tree_structure_check(rv, bv)
return rv

@classmethod
def open(cls, grammar_filename, rel_to=None, **options):
if rel_to:
basepath = os.path.dirname(rel_to)
grammar_filename = os.path.join(basepath, grammar_filename)
with open(grammar_filename, encoding='utf8') as f:
return cls(f, **options)

def save(self,f):
self.text_lexer.save(f)
if self.bytes_lark is not None:
self.bytes_lark.save(f)

def load(self,f):
self.text_lexer = self.text_lexer.load(f)
if self.bytes_lark is not None:
self.bytes_lark.load(f)

def _make_parser_test(LEXER, PARSER):
lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
def _Lark_open(gfilename, **kwargs):
return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

class _TestParser(unittest.TestCase):
def test_basic1(self):
g = _Lark("""start: a+ b a* "b" a*
@@ -639,6 +937,29 @@ def _make_parser_test(LEXER, PARSER):
""")
g.parse('\x01\x02\x03')

@unittest.skipIf(sys.version_info[0]==2 or sys.version_info[:2]==(3, 4),
"bytes parser isn't perfect in Python2, exceptions don't work correctly")
def test_bytes_utf8(self):
g = r"""
start: BOM? char+
BOM: "\xef\xbb\xbf"
char: CHAR1 | CHAR2 | CHAR3 | CHAR4
CONTINUATION_BYTE: "\x80" .. "\xbf"
CHAR1: "\x00" .. "\x7f"
CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
"""
g = _Lark(g, use_bytes=True)
s = u"🔣 地? gurīn".encode('utf-8')
self.assertEqual(len(g.parse(s).children), 10)

for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
("sjis", u"売春婦"),
("euc-jp", u"乂鵬鵠")]:
s = j.encode(enc)
self.assertRaises(UnexpectedCharacters, g.parse, s)

@unittest.skipIf(PARSER == 'cyk', "Takes forever")
def test_stack_for_ebnf(self):
"""Verify that stack depth isn't an issue for EBNF grammars"""
@@ -1058,6 +1379,31 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual( g.parse('"hello"').children, ['"hello"'])
self.assertEqual( g.parse("'hello'").children, ["'hello'"])

@unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
def test_join_regex_flags(self):
g = r"""
start: A
A: B C
B: /./s
C: /./
"""
g = _Lark(g)
self.assertEqual(g.parse(" ").children,[" "])
self.assertEqual(g.parse("\n ").children,["\n "])
self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")

g = r"""
start: A
A: B | C
B: "b"i
C: "c"
"""
g = _Lark(g)
self.assertEqual(g.parse("b").children,["b"])
self.assertEqual(g.parse("B").children,["B"])
self.assertEqual(g.parse("c").children,["c"])
self.assertRaises(UnexpectedCharacters, g.parse, "C")


def test_lexer_token_limit(self):
"Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
@@ -1132,6 +1478,32 @@ def _make_parser_test(LEXER, PARSER):
tree = l.parse('aA')
self.assertEqual(tree.children, ['a', 'A'])

def test_token_flags_verbose(self):
g = _Lark(r"""start: NL | ABC
ABC: / [a-z] /x
NL: /\n/
""")
x = g.parse('a')
self.assertEqual(x.children, ['a'])

def test_token_flags_verbose_multiline(self):
g = _Lark(r"""start: ABC
ABC: / a b c
d
e f
/x
""")
x = g.parse('abcdef')
self.assertEqual(x.children, ['abcdef'])

def test_token_multiline_only_works_with_x_flag(self):
g = r"""start: ABC
ABC: / a b c
d
e f
/i
"""
self.assertRaises( GrammarError, _Lark, g)

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self):
@@ -1784,6 +2156,23 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(a.line, 1)
self.assertEqual(b.line, 2)

@unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
def test_unicode_class(self):
"Tests that character classes from the `regex` module work correctly."
g = _Lark(r"""?start: NAME
NAME: ID_START ID_CONTINUE*
ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)

self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

@unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
def test_unicode_word(self):
"Tests that a persistent bug in the `re` module works when `regex` is enabled."
g = _Lark(r"""?start: NAME
NAME: /[\w]+/
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME


Some files were not shown because too many files changed in this diff
