@@ -33,7 +33,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h | |||
### Install Lark | |||
$ pip install lark-parser | |||
$ pip install lark-parser --upgrade | |||
Lark has no dependencies. | |||
@@ -77,12 +77,11 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt | |||
### Fruit flies like bananas | |||
Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like bananas": | |||
Lark is great at handling ambiguity. Here is the result of parsing the phrase "fruit flies like bananas": | |||
 | |||
See more [examples here](https://github.com/lark-parser/lark/tree/master/examples) | |||
See the code and more [examples here](https://github.com/lark-parser/lark/tree/master/examples) | |||
## List of main features | |||
@@ -156,6 +155,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail | |||
- [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language | |||
- [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer | |||
- [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory | |||
- [gersemi](https://github.com/BlankSpruce/gersemi) - A CMake code formatter | |||
Using Lark? Send me a message and I'll add your project! | |||
@@ -23,6 +23,7 @@ | |||
## Extra features | |||
- Import rules and tokens from other Lark grammars, for code reuse and modularity. | |||
- Support for external regex module ([see here](classes.md#using-unicode-character-classes-with-regex)) | |||
- Import grammars from Nearley.js ([read more](nearley.md)) | |||
- CYK parser | |||
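Note on the external `regex` item in the list above: a minimal sketch, assuming the `regex=True` constructor option described in the linked docs and the third-party `regex` package being installed; not part of this diff.

```python
# Hedged sketch: unicode property classes require the third-party `regex` module.
# Assumes `pip install regex` and the `regex=True` option documented in classes.md.
from lark import Lark

parser = Lark(r"""
    start: WORD+
    WORD: /[\p{L}]+/
    %ignore " "
""", regex=True)

print(parser.parse("Hello März").children)   # two WORD tokens, unicode letters included
```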
@@ -103,6 +103,8 @@ Terminals can be assigned priority only when using a lexer (future versions may | |||
Priority can be either positive or negative. If not specified for a terminal, it defaults to 1. | |||
Highest priority terminals are always matched first. | |||
### Regexp Flags | |||
You can use flags on regexps and strings. For example: | |||
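The hunk ends just before the documentation's own example. For orientation, an illustrative snippet (not taken from this diff) of what a terminal flag looks like:

```python
# Illustrative only: the "i" flag makes a terminal case-insensitive.
# The same flag syntax also works on string literals, e.g. "hello"i.
from lark import Lark

parser = Lark(r"""
    start: GREETING
    GREETING: /hello/i
""")
print(parser.parse("HeLLo"))   # matches despite the different casing
```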
@@ -30,12 +30,13 @@ Use the reference pages for more in-depth explanations. (links in the [main page | |||
## LALR usage | |||
By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure `logging` framework beforehand. For example: | |||
By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `logger` beforehand. For example: | |||
```python | |||
from lark import Lark | |||
import logging | |||
logging.basicConfig(level=logging.DEBUG) | |||
from lark import Lark, logger | |||
logger.setLevel(logging.DEBUG) | |||
collision_grammar = ''' | |||
start: as as | |||
@@ -8,6 +8,7 @@ Welcome to Lark's documentation! | |||
.. toctree:: | |||
:maxdepth: 2 | |||
:caption: Overview | |||
:hidden: | |||
philosophy | |||
@@ -22,7 +23,6 @@ Welcome to Lark's documentation! | |||
json_tutorial | |||
how_to_use | |||
how_to_develop | |||
nearley | |||
recipes | |||
@@ -35,6 +35,8 @@ Welcome to Lark's documentation! | |||
tree_construction | |||
visitors | |||
classes | |||
nearley | |||
Lark is a modern parsing library for Python. Lark can parse any context-free grammar. | |||
@@ -47,13 +49,15 @@ Lark provides: | |||
- Fast unicode lexer with regexp support, and automatic line-counting | |||
**Install Lark**: | |||
Install Lark | |||
-------------- | |||
.. code:: bash | |||
$ pip install lark-parser | |||
**Syntax Highlighting**: | |||
Syntax Highlighting | |||
------------------- | |||
- `Sublime Text & TextMate`_ | |||
- `Visual Studio Code`_ (Or install through the vscode plugin system) | |||
@@ -61,4 +65,48 @@ Lark provides: | |||
.. _Sublime Text & TextMate: https://github.com/lark-parser/lark_syntax | |||
.. _Visual Studio Code: https://github.com/lark-parser/vscode-lark | |||
.. _Intellij & PyCharm: https://github.com/lark-parser/intellij-syntax-highlighting | |||
.. _Intellij & PyCharm: https://github.com/lark-parser/intellij-syntax-highlighting | |||
Resources | |||
--------- | |||
- :doc:`philosophy` | |||
- :doc:`features` | |||
- `Examples`_ | |||
- `Online IDE`_ | |||
- Tutorials | |||
- `How to write a DSL`_ - Implements a toy LOGO-like language with | |||
an interpreter | |||
- :doc:`json_tutorial` - Teaches you how to use Lark | |||
- Unofficial | |||
- `Program Synthesis is Possible`_ - Creates a DSL for Z3 | |||
- Guides | |||
- :doc:`how_to_use` | |||
- :doc:`how_to_develop` | |||
- Reference | |||
- :doc:`grammar` | |||
- :doc:`tree_construction` | |||
- :doc:`visitors` | |||
- :doc:`classes` | |||
- :doc:`nearley` | |||
- `Cheatsheet (PDF)`_ | |||
- Discussion | |||
- `Gitter`_ | |||
- `Forum (Google Groups)`_ | |||
.. _Examples: https://github.com/lark-parser/lark/tree/master/examples | |||
.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html | |||
.. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ | |||
.. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html | |||
.. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf | |||
.. _Gitter: https://gitter.im/lark-parser/Lobby | |||
.. _Forum (Google Groups): https://groups.google.com/forum/#!forum/lark-parser |
@@ -52,7 +52,7 @@ def parse(json_text): | |||
'[1,2,]', | |||
'{"foo":1,}', | |||
'{"foo":false,"bar":true,}'] | |||
}) | |||
}, use_accepts=True) | |||
if not exc_class: | |||
raise | |||
raise exc_class(u.get_context(json_text), u.line, u.column) | |||
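For readers following the tutorial change above, a hedged, self-contained sketch of the same pattern; the toy grammar and the `TrailingComma` class are illustrative stand-ins, and only the `use_accepts=True` flag is new in this PR:

```python
# Hedged sketch: mapping malformed examples to custom error classes.
from lark import Lark, UnexpectedInput

parser = Lark('start: "a" ("," "a")*', parser='lalr')

class TrailingComma(Exception):
    pass

def parse(text):
    try:
        return parser.parse(text)
    except UnexpectedInput as u:
        # use_accepts=True also compares the sets of accepted terminals,
        # which distinguishes errors that happen to share a parser state.
        exc_class = u.match_examples(parser.parse, {
            TrailingComma: ['a,', 'a,a,'],
        }, use_accepts=True)
        if not exc_class:
            raise
        raise exc_class(u.get_context(text), u.line, u.column)

parse("a,a,")   # raises TrailingComma instead of a raw UnexpectedToken
```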
@@ -1,9 +1,9 @@ | |||
# -*- coding: utf-8 -*- | |||
from typing import Dict, Iterable, Callable, Union | |||
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set | |||
from .tree import Tree | |||
from .lexer import Token | |||
from .parsers.lalr_puppet import ParserPuppet | |||
class LarkError(Exception): | |||
pass | |||
@@ -21,27 +21,37 @@ class LexError(LarkError): | |||
pass | |||
T = TypeVar('T') | |||
class UnexpectedInput(LarkError): | |||
line: int | |||
column: int | |||
pos_in_stream: int | |||
state: Any | |||
def get_context(self, text: str, span: int = ...): | |||
... | |||
def match_examples( | |||
self, | |||
parse_fn: Callable[[str], Tree], | |||
examples: Dict[str, Iterable[str]] | |||
): | |||
self, | |||
parse_fn: Callable[[str], Tree], | |||
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], | |||
token_type_match_fallback: bool = False, | |||
use_accepts: bool = False, | |||
) -> T: | |||
... | |||
class UnexpectedToken(ParseError, UnexpectedInput): | |||
pass | |||
expected: Set[str] | |||
considered_rules: Set[str] | |||
puppet: ParserPuppet | |||
accepts: Set[str] | |||
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
line: int | |||
column: int | |||
allowed: Set[str] | |||
considered_tokens: Set[Any] | |||
class VisitError(LarkError): | |||
@@ -0,0 +1,22 @@ | |||
from typing import Set, Dict, Any | |||
from lark import Token, Tree | |||
class ParserPuppet(object): | |||
""" | |||
Provides an interface to interactively step through the parser (LALR(1) only for now) | |||
Accessible via `UnexpectedToken.puppet` (raised by the parser on token error) | |||
""" | |||
def feed_token(self, token: Token): ... | |||
def copy(self) -> ParserPuppet: ... | |||
def pretty(self) -> str: ... | |||
def choices(self) -> Dict[str, Any]: ... | |||
def accepts(self) -> Set[str]: ... | |||
def resume_parse(self) -> Tree: ... |
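A hedged usage sketch of this interface; the grammar and input are illustrative, and the puppet is only available when the LALR parser raises `UnexpectedToken`:

```python
# Hedged sketch: inspecting the parser state through the puppet (LALR only).
from lark import Lark
from lark.exceptions import UnexpectedToken

parser = Lark('start: "a" "b"', parser='lalr')

try:
    parser.parse("aa")              # lexes fine, but the parser wanted "b" next
except UnexpectedToken as e:
    print(e.puppet.pretty())        # readable dump of the current choices and stack
    print(e.puppet.accepts())       # e.g. {'B'}: terminals that could be fed here
    probe = e.puppet.copy()         # experiments are best done on a copy
```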
@@ -1,3 +1,4 @@ | |||
from .utils import logger | |||
from .tree import Tree | |||
from .visitors import Transformer, Visitor, v_args, Discard | |||
from .visitors import InlineTransformer, inline_args # XXX Deprecated | |||
@@ -17,9 +17,6 @@ class LexerConf(Serialize): | |||
self.skip_validation = skip_validation | |||
self.use_bytes = use_bytes | |||
def _deserialize(self): | |||
self.callbacks = {} # TODO | |||
###} | |||
class ParserConf: | |||
@@ -1,6 +1,8 @@ | |||
from .utils import STRING_TYPE | |||
from .utils import STRING_TYPE, logger | |||
###{standalone | |||
class LarkError(Exception): | |||
pass | |||
@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError): | |||
after = text[pos:end].split(b'\n', 1)[0] | |||
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") | |||
def match_examples(self, parse_fn, examples, token_type_match_fallback=False): | |||
def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): | |||
""" Given a parser instance and a dictionary mapping some label with | |||
some malformed syntax examples, it'll return the label for the | |||
example that best matches the current error. | |||
It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. | |||
""" | |||
assert self.state is not None, "Not supported for this exception" | |||
if isinstance(examples, dict): | |||
examples = examples.items() | |||
candidate = (None, False) | |||
for label, example in examples.items(): | |||
for i, (label, example) in enumerate(examples): | |||
assert not isinstance(example, STRING_TYPE) | |||
for malformed in example: | |||
for j, malformed in enumerate(example): | |||
try: | |||
parse_fn(malformed) | |||
except UnexpectedInput as ut: | |||
if ut.state == self.state: | |||
if use_accepts and ut.accepts != self.accepts: | |||
logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % | |||
(self.state, self.accepts, ut.accepts, i, j)) | |||
continue | |||
try: | |||
if ut.token == self.token: # Try exact match first | |||
logger.debug("Exact Match at example [%s][%s]" % (i, j)) | |||
return label | |||
if token_type_match_fallback: | |||
# Fallback to token types match | |||
if (ut.token.type == self.token.type) and not candidate[-1]: | |||
logger.debug("Token Type Fallback at example [%s][%s]" % (i, j)) | |||
candidate = label, True | |||
except AttributeError: | |||
pass | |||
if not candidate[0]: | |||
logger.debug("Same State match at example [%s][%s]" % (i, j)) | |||
candidate = label, False | |||
return candidate[0] | |||
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError): | |||
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): | |||
if isinstance(seq, bytes): | |||
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) | |||
else: | |||
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) | |||
self.line = line | |||
self.column = column | |||
self.allowed = allowed | |||
self.considered_tokens = considered_tokens | |||
self.pos_in_stream = lex_pos | |||
self.state = state | |||
self.allowed = allowed | |||
self.considered_tokens = considered_tokens | |||
if isinstance(seq, bytes): | |||
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||
else: | |||
_s = seq[lex_pos] | |||
message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) | |||
message += '\n\n' + self.get_context(seq) | |||
if allowed: | |||
message += '\nExpecting: %s\n' % allowed | |||
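A hedged sketch of this exception from the user's side; the grammar and input are illustrative:

```python
# Hedged sketch: UnexpectedCharacters is raised by the lexer and carries position info.
from lark import Lark
from lark.exceptions import UnexpectedCharacters

parser = Lark('start: "a"+', parser='lalr')

try:
    parser.parse("aab")             # 'b' matches no terminal
except UnexpectedCharacters as e:
    print(e.line, e.column)         # 1 3
    print(e.get_context("aab"))     # the offending line with a '^' marker
```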
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput): | |||
class UnexpectedToken(ParseError, UnexpectedInput): | |||
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): | |||
self.token = token | |||
self.expected = expected # XXX str shouldn't necessary | |||
self.line = getattr(token, 'line', '?') | |||
self.column = getattr(token, 'column', '?') | |||
self.considered_rules = considered_rules | |||
self.state = state | |||
self.pos_in_stream = getattr(token, 'pos_in_stream', None) | |||
self.state = state | |||
self.token = token | |||
self.expected = expected # XXX deprecate? `accepts` is better | |||
self.considered_rules = considered_rules | |||
self.puppet = puppet | |||
# TODO Only calculate `accepts()` when we need to display it to the user | |||
# This will improve performance when doing automatic error handling | |||
self.accepts = puppet and puppet.accepts() | |||
message = ("Unexpected token %r at line %s, column %s.\n" | |||
"Expected one of: \n\t* %s\n" | |||
% (token, self.line, self.column, '\n\t* '.join(self.expected))) | |||
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) | |||
super(UnexpectedToken, self).__init__(message) | |||
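A hedged sketch of the user-visible effect: when a puppet is available, the message now lists `accepts` (terminals verified by actually feeding them to a copy of the puppet) rather than the raw `expected` set. The grammar below is illustrative; for simple grammars the two sets often coincide:

```python
# Hedged sketch: the new `accepts` attribute on UnexpectedToken.
from lark import Lark
from lark.exceptions import UnexpectedToken

parser = Lark('start: "a" "b"', parser='lalr')

try:
    parser.parse("aa")              # second "a" lexes fine but is not expected
except UnexpectedToken as e:
    print(e.expected)               # terminal names taken from the parse table
    print(e.accepts)                # names the puppet confirms can actually be fed
```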
@@ -1,17 +1,17 @@ | |||
from __future__ import absolute_import | |||
import sys, os, pickle, hashlib, logging | |||
import sys, os, pickle, hashlib | |||
from io import open | |||
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii | |||
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger | |||
from .load_grammar import load_grammar | |||
from .tree import Tree | |||
from .common import LexerConf, ParserConf | |||
from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken | |||
from .parse_tree_builder import ParseTreeBuilder | |||
from .parser_frontends import get_frontend | |||
from .parser_frontends import get_frontend, _get_lexer_callbacks | |||
from .grammar import Rule | |||
import re | |||
@@ -214,7 +214,7 @@ class Lark(Serialize): | |||
cache_fn = '.lark_cache_%s.tmp' % md5 | |||
if FS.exists(cache_fn): | |||
logging.debug('Loading grammar from cache: %s', cache_fn) | |||
logger.debug('Loading grammar from cache: %s', cache_fn) | |||
with FS.open(cache_fn, 'rb') as f: | |||
self._load(f, self.options.transformer, self.options.postlex) | |||
return | |||
@@ -278,12 +278,10 @@ class Lark(Serialize): | |||
rule.options.priority = None | |||
# TODO Deprecate lexer_callbacks? | |||
lexer_callbacks = dict(self.options.lexer_callbacks) | |||
if self.options.transformer: | |||
t = self.options.transformer | |||
for term in self.terminals: | |||
if hasattr(t, term.name): | |||
lexer_callbacks[term.name] = getattr(t, term.name) | |||
lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals) | |||
if self.options.transformer | |||
else {}) | |||
lexer_callbacks.update(self.options.lexer_callbacks) | |||
self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes) | |||
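A hedged sketch of the behaviour this refactor preserves: a transformer method named after a terminal is installed as a lexer callback, so tokens are rewritten as they are lexed. The grammar and names below are illustrative:

```python
# Hedged sketch: a transformer method named after a terminal acts as a lexer callback.
from lark import Lark, Transformer, Token

class Upcase(Transformer):
    def WORD(self, tok):
        # Rewrite the token during lexing, keeping its position info.
        return Token.new_borrow_pos(tok.type, tok.upper(), tok)

parser = Lark(r"""
    start: WORD+
    WORD: /[a-z]+/
    %ignore " "
""", parser='lalr', transformer=Upcase())

print(parser.parse("ab cd"))   # the tokens arrive already upper-cased
```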
@@ -293,7 +291,7 @@ class Lark(Serialize): | |||
self.lexer = self._build_lexer() | |||
if cache_fn: | |||
logging.debug('Saving grammar to cache: %s', cache_fn) | |||
logger.debug('Saving grammar to cache: %s', cache_fn) | |||
with FS.open(cache_fn, 'wb') as f: | |||
self.save(f) | |||
@@ -344,7 +342,14 @@ class Lark(Serialize): | |||
self.rules = [Rule.deserialize(r, memo) for r in data['rules']] | |||
self.source = '<deserialized>' | |||
self._prepare_callbacks() | |||
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) | |||
self.parser = self.parser_class.deserialize( | |||
data['parser'], | |||
memo, | |||
self._callbacks, | |||
self.options.postlex, | |||
self.options.transformer, | |||
re_module | |||
) | |||
return self | |||
@classmethod | |||
@@ -85,7 +85,7 @@ TERMINALS = { | |||
'RULE': '!?[_?]?[a-z][_a-z0-9]*', | |||
'TERMINAL': '_?[A-Z][_A-Z0-9]*', | |||
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', | |||
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, | |||
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, | |||
'_NL': r'(\r?\n)+\s*', | |||
'WS': r'[ \t]+', | |||
'COMMENT': r'\s*//[^\n]*', | |||
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||
term_name = None | |||
elif isinstance(p, PatternRE): | |||
if p in self.term_reverse: # Kind of a wierd placement.name | |||
if p in self.term_reverse: # Kind of a weird placement.name | |||
term_name = self.term_reverse[p].name | |||
else: | |||
assert False, p | |||
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal): | |||
flags = v[flag_start:] | |||
assert all(f in _RE_FLAGS for f in flags), flags | |||
if literal.type == 'STRING' and '\n' in v: | |||
raise GrammarError('You cannot put newlines in string literals') | |||
if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: | |||
raise GrammarError('You can only use newlines in regular expressions ' | |||
'with the `x` (verbose) flag') | |||
v = v[:flag_start] | |||
assert v[0] == v[-1] and v[0] in '"/' | |||
x = v[1:-1] | |||
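A hedged example of what the new checks allow: multi-line regexp terminals must carry the `x` (verbose) flag, while newlines remain forbidden in string literals. The grammar is illustrative:

```python
# Hedged sketch: a multi-line regexp is accepted only with the `x` (verbose) flag.
# Dropping the /x below would now raise GrammarError.
from lark import Lark

parser = Lark(r"""
    start: CODE
    CODE: / a b c
           d
         /x
""")
print(parser.parse("abcd"))   # whitespace in the verbose pattern is ignored
```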
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal): | |||
if literal.type == 'STRING': | |||
s = s.replace('\\\\', '\\') | |||
return { 'STRING': PatternStr, | |||
'REGEXP': PatternRE }[literal.type](s, flags) | |||
return PatternStr(s, flags) | |||
elif literal.type == 'REGEXP': | |||
return PatternRE(s, flags) | |||
else: | |||
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' | |||
@inline_args | |||
@@ -841,7 +850,7 @@ class GrammarLoader: | |||
if len(stmt.children) > 1: | |||
path_node, arg1 = stmt.children | |||
else: | |||
path_node, = stmt.children | |||
path_node ,= stmt.children | |||
arg1 = None | |||
if isinstance(arg1, Tree): # Multi import | |||
@@ -1,6 +1,6 @@ | |||
from .utils import get_regexp_width, Serialize | |||
from .parsers.grammar_analysis import GrammarAnalyzer | |||
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token | |||
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef | |||
from .parsers import earley, xearley, cyk | |||
from .parsers.lalr_parser import LALR_Parser | |||
from .grammar import Rule | |||
@@ -58,6 +58,15 @@ class _ParserFrontend(Serialize): | |||
return self.parser.parse(input, start, *args) | |||
def _get_lexer_callbacks(transformer, terminals): | |||
result = {} | |||
for terminal in terminals: | |||
callback = getattr(transformer, terminal.name, None) | |||
if callback is not None: | |||
result[terminal.name] = callback | |||
return result | |||
class WithLexer(_ParserFrontend): | |||
lexer = None | |||
parser = None | |||
@@ -73,13 +82,18 @@ class WithLexer(_ParserFrontend): | |||
self.postlex = lexer_conf.postlex | |||
@classmethod | |||
def deserialize(cls, data, memo, callbacks, postlex, re_module): | |||
def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): | |||
inst = super(WithLexer, cls).deserialize(data, memo) | |||
inst.postlex = postlex | |||
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) | |||
terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] | |||
inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals) | |||
inst.lexer_conf.re_module = re_module | |||
inst.lexer_conf.skip_validation=True | |||
inst.init_lexer() | |||
return inst | |||
def _serialize(self, data, memo): | |||
@@ -229,4 +243,3 @@ class CYK(WithLexer): | |||
def _apply_callback(self, tree): | |||
return self.callbacks[tree.rule](tree.children) | |||
@@ -10,11 +10,11 @@ is better documented here: | |||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
""" | |||
import logging | |||
from collections import deque | |||
from ..visitors import Transformer_InPlace, v_args | |||
from ..exceptions import UnexpectedEOF, UnexpectedToken | |||
from ..utils import logger | |||
from .grammar_analysis import GrammarAnalyzer | |||
from ..grammar import NonTerminal | |||
from .earley_common import Item, TransitiveItem | |||
@@ -301,7 +301,7 @@ class Parser: | |||
try: | |||
debug_walker = ForestToPyDotVisitor() | |||
except ImportError: | |||
logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image") | |||
logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") | |||
else: | |||
debug_walker.visit(solutions[0], "sppf.png") | |||
@@ -13,6 +13,7 @@ from collections import deque | |||
from operator import attrgetter | |||
from importlib import import_module | |||
from ..utils import logger | |||
from ..tree import Tree | |||
from ..exceptions import ParseError | |||
@@ -328,10 +329,17 @@ class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor): | |||
self.output_stack[-1].children.append(node) | |||
def visit_symbol_node_in(self, node): | |||
if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority): | |||
self.forest_sum_visitor.visit(node) | |||
if not node.is_intermediate and node.is_ambiguous: | |||
self.output_stack.append(Tree('_ambig', [])) | |||
if node.is_ambiguous: | |||
if self.forest_sum_visitor and isinf(node.priority): | |||
self.forest_sum_visitor.visit(node) | |||
if node.is_intermediate: | |||
# TODO Support ambiguous intermediate nodes! | |||
logger.warning("Ambiguous intermediate node in the SPPF: %s. " | |||
"Lark does not currently process these ambiguities; resolving with the first derivation.", node) | |||
return next(iter(node.children)) | |||
else: | |||
self.output_stack.append(Tree('_ambig', [])) | |||
return iter(node.children) | |||
def visit_symbol_node_out(self, node): | |||
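For context on what this visitor produces, a hedged sketch of an ambiguous parse: `_ambig` nodes come from the symbol-node branch above, while ambiguous intermediate nodes now fall back to the first derivation with a warning. The grammar is illustrative:

```python
# Hedged sketch: ambiguity='explicit' keeps _ambig nodes in the resulting tree.
from lark import Lark

parser = Lark(r"""
    start: ab
    ab: a b | "ab"
    a: "a"
    b: "b"
""", ambiguity='explicit')

print(parser.parse("ab").pretty())   # contains an _ambig node with both derivations
```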
@@ -6,10 +6,9 @@ For now, shift/reduce conflicts are automatically resolved as shifts. | |||
# Author: Erez Shinan (2017) | |||
# Email : erezshin@gmail.com | |||
import logging | |||
from collections import defaultdict, deque | |||
from collections import defaultdict | |||
from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator | |||
from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger | |||
from ..exceptions import GrammarError | |||
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet | |||
@@ -256,8 +255,8 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))) | |||
if la in actions: | |||
if self.debug: | |||
logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) | |||
logging.warning(' * %s', list(rules)[0]) | |||
logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) | |||
logger.warning(' * %s', list(rules)[0]) | |||
else: | |||
actions[la] = (Reduce, list(rules)[0]) | |||
m[state] = { k.name: v for k, v in actions.items() } | |||
@@ -59,10 +59,10 @@ class _Parser: | |||
try: | |||
return states[state][token.type] | |||
except KeyError: | |||
expected = [s for s in states[state].keys() if s.isupper()] | |||
expected = {s for s in states[state].keys() if s.isupper()} | |||
try: | |||
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) | |||
except NameError: | |||
except NameError: # For standalone parser | |||
puppet = None | |||
raise UnexpectedToken(token, expected, state=state, puppet=puppet) | |||
@@ -3,8 +3,10 @@ | |||
from copy import deepcopy | |||
from .lalr_analysis import Shift, Reduce | |||
from .. import Token | |||
class ParserPuppet: | |||
class ParserPuppet(object): | |||
def __init__(self, parser, state_stack, value_stack, start, stream, set_state): | |||
self.parser = parser | |||
self._state_stack = state_stack | |||
@@ -16,7 +18,7 @@ class ParserPuppet: | |||
self.result = None | |||
def feed_token(self, token): | |||
"""Advance the parser state, as if it just recieved `token` from the lexer | |||
"""Advance the parser state, as if it just received `token` from the lexer | |||
""" | |||
end_state = self.parser.parse_table.end_states[self._start] | |||
@@ -66,14 +68,27 @@ class ParserPuppet: | |||
self._set_state, | |||
) | |||
def pretty(): | |||
print("Puppet choices:") | |||
for k, v in self.choices.items(): | |||
print('\t-', k, '->', v) | |||
print('stack size:', len(self._state_stack)) | |||
def pretty(self): | |||
out = ["Puppet choices:"] | |||
for k, v in self.choices().items(): | |||
out.append('\t- %s -> %s' % (k, v)) | |||
out.append('stack size: %s' % len(self._state_stack)) | |||
return '\n'.join(out) | |||
def choices(self): | |||
return self.parser.parse_table.states[self._state_stack[-1]] | |||
def accepts(self): | |||
accepts = set() | |||
for t in self.choices(): | |||
new_puppet = self.copy() | |||
try: | |||
new_puppet.feed_token(Token(t, '')) | |||
except KeyError: | |||
pass | |||
else: | |||
accepts.add(t) | |||
return accepts | |||
def resume_parse(self): | |||
return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) |
@@ -88,6 +88,8 @@ def main(fobj, start): | |||
lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start) | |||
print('# The file was automatically generated by Lark v%s' % lark.__version__) | |||
print('__version__ = "%s"' % lark.__version__) | |||
print() | |||
for pyfile in EXTRACT_STANDALONE_FILES: | |||
with open(os.path.join(_larkdir, pyfile)) as f: | |||
@@ -4,10 +4,12 @@ except ImportError: | |||
pass | |||
from copy import deepcopy | |||
from collections import OrderedDict | |||
###{standalone | |||
from collections import OrderedDict | |||
class Meta: | |||
def __init__(self): | |||
self.empty = True | |||
@@ -4,51 +4,15 @@ from functools import reduce | |||
from ast import literal_eval | |||
from collections import deque | |||
class fzset(frozenset): | |||
def __repr__(self): | |||
return '{%s}' % ', '.join(map(repr, self)) | |||
def classify_bool(seq, pred): | |||
true_elems = [] | |||
false_elems = [] | |||
for elem in seq: | |||
if pred(elem): | |||
true_elems.append(elem) | |||
else: | |||
false_elems.append(elem) | |||
return true_elems, false_elems | |||
def bfs(initial, expand): | |||
open_q = deque(list(initial)) | |||
visited = set(open_q) | |||
while open_q: | |||
node = open_q.popleft() | |||
yield node | |||
for next_node in expand(node): | |||
if next_node not in visited: | |||
visited.add(next_node) | |||
open_q.append(next_node) | |||
###{standalone | |||
import logging | |||
logger = logging.getLogger("lark") | |||
logger.addHandler(logging.StreamHandler()) | |||
# Set to highest level, since we have some warnings amongst the code | |||
# By default, we should not output any log messages | |||
logger.setLevel(logging.CRITICAL) | |||
def _serialize(value, memo): | |||
if isinstance(value, Serialize): | |||
return value.serialize(memo) | |||
elif isinstance(value, list): | |||
return [_serialize(elem, memo) for elem in value] | |||
elif isinstance(value, frozenset): | |||
return list(value) # TODO reversible? | |||
elif isinstance(value, dict): | |||
return {key:_serialize(elem, memo) for key, elem in value.items()} | |||
return value | |||
###{standalone | |||
def classify(seq, key=None, value=None): | |||
d = {} | |||
for item in seq: | |||
@@ -302,13 +266,11 @@ def combine_alternatives(lists): | |||
return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init) | |||
class FS: | |||
open = open | |||
exists = os.path.exists | |||
def isascii(s): | |||
""" str.isascii only exists in python3.7+ """ | |||
try: | |||
@@ -318,4 +280,46 @@ def isascii(s): | |||
s.encode('ascii') | |||
return True | |||
except (UnicodeDecodeError, UnicodeEncodeError): | |||
return False | |||
return False | |||
class fzset(frozenset): | |||
def __repr__(self): | |||
return '{%s}' % ', '.join(map(repr, self)) | |||
def classify_bool(seq, pred): | |||
true_elems = [] | |||
false_elems = [] | |||
for elem in seq: | |||
if pred(elem): | |||
true_elems.append(elem) | |||
else: | |||
false_elems.append(elem) | |||
return true_elems, false_elems | |||
def bfs(initial, expand): | |||
open_q = deque(list(initial)) | |||
visited = set(open_q) | |||
while open_q: | |||
node = open_q.popleft() | |||
yield node | |||
for next_node in expand(node): | |||
if next_node not in visited: | |||
visited.add(next_node) | |||
open_q.append(next_node) | |||
def _serialize(value, memo): | |||
if isinstance(value, Serialize): | |||
return value.serialize(memo) | |||
elif isinstance(value, list): | |||
return [_serialize(elem, memo) for elem in value] | |||
elif isinstance(value, frozenset): | |||
return list(value) # TODO reversible? | |||
elif isinstance(value, dict): | |||
return {key:_serialize(elem, memo) for key, elem in value.items()} | |||
return value |
@@ -14,6 +14,8 @@ class Discard(Exception): | |||
# Transformers | |||
class _Decoratable: | |||
"Provides support for decorating methods with @v_args" | |||
@classmethod | |||
def _apply_decorator(cls, decorator, **kwargs): | |||
mro = getmro(cls) | |||
@@ -13,3 +13,4 @@ pages: | |||
- Classes Reference: classes.md | |||
- Recipes: recipes.md | |||
- Import grammars from Nearley: nearley.md | |||
- Tutorial - JSON Parser: json_tutorial.md |
@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function | |||
import unittest | |||
import logging | |||
from lark import logger | |||
from .test_trees import TestTrees | |||
from .test_tools import TestStandalone | |||
@@ -11,11 +12,13 @@ from .test_reconstructor import TestReconstructor | |||
try: | |||
from .test_nearley.test_nearley import TestNearley | |||
except ImportError: | |||
logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") | |||
logger.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") | |||
# from .test_selectors import TestSelectors | |||
# from .test_grammars import TestPythonG, TestConfigG | |||
from .test_logger import Testlogger | |||
from .test_parser import ( | |||
TestLalrStandard, | |||
TestEarleyStandard, | |||
@@ -31,7 +34,7 @@ from .test_parser import ( | |||
TestParsers, | |||
) | |||
logging.basicConfig(level=logging.INFO) | |||
logger.setLevel(logging.INFO) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,65 @@ | |||
import logging | |||
from contextlib import contextmanager | |||
from lark import Lark, logger | |||
from unittest import TestCase, main | |||
try: | |||
from StringIO import StringIO | |||
except ImportError: | |||
from io import StringIO | |||
@contextmanager | |||
def capture_log(): | |||
stream = StringIO() | |||
orig_handler = logger.handlers[0] | |||
del logger.handlers[:] | |||
logger.addHandler(logging.StreamHandler(stream)) | |||
yield stream | |||
del logger.handlers[:] | |||
logger.addHandler(orig_handler) | |||
class Testlogger(TestCase): | |||
def test_debug(self): | |||
logger.setLevel(logging.DEBUG) | |||
collision_grammar = ''' | |||
start: as as | |||
as: a* | |||
a: "a" | |||
''' | |||
with capture_log() as log: | |||
Lark(collision_grammar, parser='lalr', debug=True) | |||
log = log.getvalue() | |||
# there is a Shift/Reduce conflict on terminal A, | |||
# so "A" should appear in the log message as a hint | |||
self.assertIn("A", log) | |||
def test_non_debug(self): | |||
logger.setLevel(logging.DEBUG) | |||
collision_grammar = ''' | |||
start: as as | |||
as: a* | |||
a: "a" | |||
''' | |||
with capture_log() as log: | |||
Lark(collision_grammar, parser='lalr', debug=False) | |||
log = log.getvalue() | |||
# no log message | |||
self.assertEqual(len(log), 0) | |||
def test_loglevel_higher(self): | |||
logger.setLevel(logging.ERROR) | |||
collision_grammar = ''' | |||
start: as as | |||
as: a* | |||
a: "a" | |||
''' | |||
with capture_log() as log: | |||
Lark(collision_grammar, parser='lalr', debug=True) | |||
log = log.getvalue() | |||
# no log message | |||
self.assertEqual(len(log), 0) | |||
if __name__ == '__main__': | |||
main() |
@@ -6,16 +6,17 @@ import logging | |||
import os | |||
import codecs | |||
logging.basicConfig(level=logging.INFO) | |||
from lark import logger | |||
from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main | |||
logger.setLevel(logging.INFO) | |||
TEST_PATH = os.path.abspath(os.path.dirname(__file__)) | |||
NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley') | |||
BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') | |||
if not os.path.exists(NEARLEY_PATH): | |||
logging.warn("Nearley not installed. Skipping Nearley tests!") | |||
logger.warn("Nearley not installed. Skipping Nearley tests!") | |||
raise ImportError("Skipping Nearley tests!") | |||
import js2py # Ensures that js2py exists, to avoid failing tests | |||
@@ -23,13 +23,13 @@ from io import ( | |||
open, | |||
) | |||
logging.basicConfig(level=logging.INFO) | |||
try: | |||
import regex | |||
except ImportError: | |||
regex = None | |||
from lark import logger | |||
from lark.lark import Lark | |||
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters | |||
from lark.tree import Tree | |||
@@ -37,6 +37,7 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args | |||
from lark.grammar import Rule | |||
from lark.lexer import TerminalDef, Lexer, TraditionalLexer | |||
logger.setLevel(logging.INFO) | |||
__path__ = os.path.dirname(__file__) | |||
@@ -721,7 +722,8 @@ def _make_parser_test(LEXER, PARSER): | |||
""") | |||
g.parse('\x01\x02\x03') | |||
@unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly") | |||
@unittest.skipIf(sys.version_info[0]==2 or sys.version_info[:2]==(3, 4), | |||
"bytes parser isn't perfect in Python2, exceptions don't work correctly") | |||
def test_bytes_utf8(self): | |||
g = r""" | |||
start: BOM? char+ | |||
@@ -1261,6 +1263,32 @@ def _make_parser_test(LEXER, PARSER): | |||
tree = l.parse('aA') | |||
self.assertEqual(tree.children, ['a', 'A']) | |||
def test_token_flags_verbose(self): | |||
g = _Lark(r"""start: NL | ABC | |||
ABC: / [a-z] /x | |||
NL: /\n/ | |||
""") | |||
x = g.parse('a') | |||
self.assertEqual(x.children, ['a']) | |||
def test_token_flags_verbose_multiline(self): | |||
g = _Lark(r"""start: ABC | |||
ABC: / a b c | |||
d | |||
e f | |||
/x | |||
""") | |||
x = g.parse('abcdef') | |||
self.assertEqual(x.children, ['abcdef']) | |||
def test_token_multiline_only_works_with_x_flag(self): | |||
g = r"""start: ABC | |||
ABC: / a b c | |||
d | |||
e f | |||
/i | |||
""" | |||
self.assertRaises( GrammarError, _Lark, g) | |||
@unittest.skipIf(PARSER == 'cyk', "No empty rules") | |||
def test_twice_empty(self): | |||
@@ -106,6 +106,33 @@ class TestStandalone(TestCase): | |||
x = l.parse('(\n)\n') | |||
self.assertEqual(x, Tree('start', [])) | |||
def test_transformer(self): | |||
grammar = r""" | |||
start: some_rule "(" SOME_TERMINAL ")" | |||
some_rule: SOME_TERMINAL | |||
SOME_TERMINAL: /[A-Za-z_][A-Za-z0-9_]*/ | |||
""" | |||
context = self._create_standalone(grammar) | |||
_Lark = context["Lark_StandAlone"] | |||
_Token = context["Token"] | |||
_Tree = context["Tree"] | |||
class MyTransformer(context["Transformer"]): | |||
def SOME_TERMINAL(self, token): | |||
return _Token("SOME_TERMINAL", "token is transformed") | |||
def some_rule(self, children): | |||
return _Tree("rule_is_transformed", []) | |||
parser = _Lark(transformer=MyTransformer()) | |||
self.assertEqual( | |||
parser.parse("FOO(BAR)"), | |||
_Tree("start", [ | |||
_Tree("rule_is_transformed", []), | |||
_Token("SOME_TERMINAL", "token is transformed") | |||
]) | |||
) | |||
if __name__ == '__main__': | |||