| @@ -33,7 +33,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h | |||
| ### Install Lark | |||
| $ pip install lark-parser | |||
| $ pip install lark-parser --upgrade | |||
| Lark has no dependencies. | |||
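A quick, hedged smoke test after installing (this mirrors Lark's hello-world style of usage; the inline grammar below is illustrative, not part of the install instructions):

```python
from lark import Lark

# Minimal sketch: build a parser from an inline grammar and print the tree.
parser = Lark('''start: WORD "," WORD "!"

                 %import common.WORD   // imports WORD from the common library
                 %ignore " "           // disregard spaces in the text
              ''')

print(parser.parse("Hello, World!").pretty())
```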
| @@ -77,12 +77,11 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt | |||
| ### Fruit flies like bananas | |||
| Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like bananas": | |||
| Lark is great at handling ambiguity. Here is the result of parsing the phrase "fruit flies like bananas": | |||
|  | |||
| See more [examples here](https://github.com/lark-parser/lark/tree/master/examples) | |||
| See the code and more [examples here](https://github.com/lark-parser/lark/tree/master/examples) | |||
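Since the parse-tree image for that phrase may not render in this diff view, here is a small, hedged sketch of explicit ambiguity handling (the toy grammar is an assumption made for illustration, not the README's English grammar):

```python
from lark import Lark

# With ambiguity='explicit', Lark's Earley parser keeps every derivation and
# marks ambiguous spots with an `_ambig` node instead of silently picking one.
parser = Lark(r'''
    start: ab
    ab: "a" "b"
      | a "b"
    a: "a"
''', ambiguity='explicit')

print(parser.parse("ab").pretty())   # the output contains an `_ambig` node
```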
| ## List of main features | |||
| @@ -156,6 +155,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail | |||
| - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language | |||
| - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer | |||
| - [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory | |||
| - [gersemi](https://github.com/BlankSpruce/gersemi) - A CMake code formatter | |||
| Using Lark? Send me a message and I'll add your project! | |||
| @@ -23,6 +23,7 @@ | |||
| ## Extra features | |||
| - Import rules and tokens from other Lark grammars, for code reuse and modularity. | |||
| - Support for external regex module ([see here](classes.md#using-unicode-character-classes-with-regex)) | |||
| - Import grammars from Nearley.js ([read more](nearley.md)) | |||
| - CYK parser | |||
| @@ -103,6 +103,8 @@ Terminals can be assigned priority only when using a lexer (future versions may | |||
| Priority can be either positive or negative. If not specified for a terminal, it defaults to 1. | |||
| Highest priority terminals are always matched first. | |||
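As a hedged illustration of the paragraph above (the grammar is invented for this sketch), the `.2` suffix raises SELECT's priority so it wins over the more general NAME terminal when both match the same text:

```python
from lark import Lark

# Terminal priority sketch: SELECT.2 outranks NAME, so "select" is lexed as
# SELECT even though NAME's regexp matches the same characters.
parser = Lark(r'''
    start: (SELECT | NAME)+
    SELECT.2: "select"
    NAME: /[a-z]+/
    %ignore " "
''')

print(parser.parse("select foo").pretty())
```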
| ### Regexp Flags | |||
| You can use flags on regexps and strings. For example: | |||
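A hedged sketch of such flags (the grammar is invented here; the multi-line `x` regexp mirrors the new tests added further down in this diff):

```python
from lark import Lark

# "i" makes the string literal case-insensitive; "x" (verbose) ignores
# whitespace in the regexp, which is what allows it to span several lines.
parser = Lark(r'''
    start: SELECT ABC
    SELECT: "select"i
    ABC: / a b c
          d
          e f
        /x
    %ignore " "
''')

print(parser.parse("SELECT abcdef").pretty())
```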
| @@ -30,12 +30,13 @@ Use the reference pages for more in-depth explanations. (links in the [main page | |||
| ## LALR usage | |||
| By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure `logging` framework beforehand. For example: | |||
| By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings, pass `debug=True`. To have the messages printed, you have to configure the `logger` beforehand. For example: | |||
| ```python | |||
| from lark import Lark | |||
| import logging | |||
| logging.basicConfig(level=logging.DEBUG) | |||
| from lark import Lark, logger | |||
| logger.setLevel(logging.DEBUG) | |||
| collision_grammar = ''' | |||
| start: as as | |||
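as: a*
a: "a"
'''

# Hedged continuation of the docs example (the same collision grammar appears
# in tests/test_logger.py further down this diff): constructing the parser with
# debug=True now emits the Shift/Reduce warnings through the `lark` logger.
p = Lark(collision_grammar, parser='lalr', debug=True)
```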
| @@ -8,6 +8,7 @@ Welcome to Lark's documentation! | |||
| .. toctree:: | |||
| :maxdepth: 2 | |||
| :caption: Overview | |||
| :hidden: | |||
| philosophy | |||
| @@ -22,7 +23,6 @@ Welcome to Lark's documentation! | |||
| json_tutorial | |||
| how_to_use | |||
| how_to_develop | |||
| nearley | |||
| recipes | |||
| @@ -35,6 +35,8 @@ Welcome to Lark's documentation! | |||
| tree_construction | |||
| visitors | |||
| classes | |||
| nearley | |||
| Lark is a modern parsing library for Python. Lark can parse any context-free grammar. | |||
| @@ -47,13 +49,15 @@ Lark provides: | |||
| - Fast unicode lexer with regexp support, and automatic line-counting | |||
| **Install Lark**: | |||
| Install Lark | |||
| -------------- | |||
| .. code:: bash | |||
| $ pip install lark-parser | |||
| **Syntax Highlighting**: | |||
| Syntax Highlighting | |||
| ------------------- | |||
| - `Sublime Text & TextMate`_ | |||
| - `Visual Studio Code`_ (Or install through the vscode plugin system) | |||
| @@ -61,4 +65,48 @@ Lark provides: | |||
| .. _Sublime Text & TextMate: https://github.com/lark-parser/lark_syntax | |||
| .. _Visual Studio Code: https://github.com/lark-parser/vscode-lark | |||
| .. _Intellij & PyCharm: https://github.com/lark-parser/intellij-syntax-highlighting | |||
| .. _Intellij & PyCharm: https://github.com/lark-parser/intellij-syntax-highlighting | |||
| Resources | |||
| --------- | |||
| - :doc:`philosophy` | |||
| - :doc:`features` | |||
| - `Examples`_ | |||
| - `Online IDE`_ | |||
| - Tutorials | |||
| - `How to write a DSL`_ - Implements a toy LOGO-like language with | |||
| an interpreter | |||
| - :doc:`json_tutorial` - Teaches you how to use Lark | |||
| - Unofficial | |||
| - `Program Synthesis is Possible`_ - Creates a DSL for Z3 | |||
| - Guides | |||
| - :doc:`how_to_use` | |||
| - :doc:`how_to_develop` | |||
| - Reference | |||
| - :doc:`grammar` | |||
| - :doc:`tree_construction` | |||
| - :doc:`visitors` | |||
| - :doc:`classes` | |||
| - :doc:`nearley` | |||
| - `Cheatsheet (PDF)`_ | |||
| - Discussion | |||
| - `Gitter`_ | |||
| - `Forum (Google Groups)`_ | |||
| .. _Examples: https://github.com/lark-parser/lark/tree/master/examples | |||
| .. _Online IDE: https://lark-parser.github.io/lark/ide/app.html | |||
| .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ | |||
| .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html | |||
| .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf | |||
| .. _Gitter: https://gitter.im/lark-parser/Lobby | |||
| .. _Forum (Google Groups): https://groups.google.com/forum/#!forum/lark-parser | |||
| @@ -52,7 +52,7 @@ def parse(json_text): | |||
| '[1,2,]', | |||
| '{"foo":1,}', | |||
| '{"foo":false,"bar":true,}'] | |||
| }) | |||
| }, use_accepts=True) | |||
| if not exc_class: | |||
| raise | |||
| raise exc_class(u.get_context(json_text), u.line, u.column) | |||
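For readers who want the surrounding pattern in one piece, here is a self-contained, hedged sketch of `match_examples` with `use_accepts=True` (the tiny grammar and the `UnbalancedParens` label are invented for illustration, not the tutorial's JSON grammar):

```python
from lark import Lark, UnexpectedInput

class UnbalancedParens(SyntaxError):
    pass

parser = Lark('start: "(" ")"', parser='lalr')

def parse(text):
    try:
        return parser.parse(text)
    except UnexpectedInput as u:
        # Map each custom error class to examples that trigger the same parser state.
        exc_class = u.match_examples(parser.parse, {
            UnbalancedParens: ['((', '(()'],
        }, use_accepts=True)
        if not exc_class:
            raise
        raise exc_class(u.get_context(text), u.line, u.column)

# parse('((') now raises UnbalancedParens instead of a bare UnexpectedToken
```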
| @@ -1,9 +1,9 @@ | |||
| # -*- coding: utf-8 -*- | |||
| from typing import Dict, Iterable, Callable, Union | |||
| from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set | |||
| from .tree import Tree | |||
| from .lexer import Token | |||
| from .parsers.lalr_puppet import ParserPuppet | |||
| class LarkError(Exception): | |||
| pass | |||
| @@ -21,27 +21,37 @@ class LexError(LarkError): | |||
| pass | |||
| T = TypeVar('T') | |||
| class UnexpectedInput(LarkError): | |||
| line: int | |||
| column: int | |||
| pos_in_stream: int | |||
| state: Any | |||
| def get_context(self, text: str, span: int = ...): | |||
| ... | |||
| def match_examples( | |||
| self, | |||
| parse_fn: Callable[[str], Tree], | |||
| examples: Dict[str, Iterable[str]] | |||
| ): | |||
| self, | |||
| parse_fn: Callable[[str], Tree], | |||
| examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], | |||
| token_type_match_fallback: bool = False, | |||
| use_accepts: bool = False, | |||
| ) -> T: | |||
| ... | |||
| class UnexpectedToken(ParseError, UnexpectedInput): | |||
| pass | |||
| expected: Set[str] | |||
| considered_rules: Set[str] | |||
| puppet: ParserPuppet | |||
| accepts: Set[str] | |||
| class UnexpectedCharacters(LexError, UnexpectedInput): | |||
| line: int | |||
| column: int | |||
| allowed: Set[str] | |||
| considered_tokens: Set[Any] | |||
| class VisitError(LarkError): | |||
| @@ -0,0 +1,22 @@ | |||
| from typing import Set, Dict, Any | |||
| from lark import Token, Tree | |||
| class ParserPuppet(object): | |||
| """ | |||
| Provides an interface to interactively step through the parser (LALR(1) only for now) | |||
| Accessible via `UnexpectedToken.puppet` (raised by the parser on token error) | |||
| """ | |||
| def feed_token(self, token: Token): ... | |||
| def copy(self) -> ParserPuppet: ... | |||
| def pretty(self) -> str: ... | |||
| def choices(self) -> Dict[str, Any]: ... | |||
| def accepts(self) -> Set[str]: ... | |||
| def resume_parse(self) -> Tree: ... | |||
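A hedged usage sketch of this interface (the three-token grammar below is an assumption, not part of the stub):

```python
from lark import Lark, UnexpectedToken

parser = Lark('start: "a" "b" "c"', parser='lalr', lexer='standard')

try:
    parser.parse('ac')
except UnexpectedToken as e:
    puppet = e.puppet
    print(puppet.pretty())     # current parser choices and stack size
    print(puppet.accepts())    # terminals the parser could take here, e.g. {'B'}
    # copy(), feed_token() and resume_parse() let you step the parser manually,
    # which is the basis for automatic error recovery.
```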
| @@ -1,3 +1,4 @@ | |||
| from .utils import logger | |||
| from .tree import Tree | |||
| from .visitors import Transformer, Visitor, v_args, Discard | |||
| from .visitors import InlineTransformer, inline_args # XXX Deprecated | |||
| @@ -17,9 +17,6 @@ class LexerConf(Serialize): | |||
| self.skip_validation = skip_validation | |||
| self.use_bytes = use_bytes | |||
| def _deserialize(self): | |||
| self.callbacks = {} # TODO | |||
| ###} | |||
| class ParserConf: | |||
| @@ -1,6 +1,8 @@ | |||
| from .utils import STRING_TYPE | |||
| from .utils import STRING_TYPE, logger | |||
| ###{standalone | |||
| class LarkError(Exception): | |||
| pass | |||
| @@ -37,34 +39,46 @@ class UnexpectedInput(LarkError): | |||
| after = text[pos:end].split(b'\n', 1)[0] | |||
| return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") | |||
| def match_examples(self, parse_fn, examples, token_type_match_fallback=False): | |||
| def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): | |||
| """ Given a parser instance and a dictionary mapping some label with | |||
| some malformed syntax examples, it'll return the label for the | |||
| example that bests matches the current error. | |||
| It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. | |||
| """ | |||
| assert self.state is not None, "Not supported for this exception" | |||
| if isinstance(examples, dict): | |||
| examples = examples.items() | |||
| candidate = (None, False) | |||
| for label, example in examples.items(): | |||
| for i, (label, example) in enumerate(examples): | |||
| assert not isinstance(example, STRING_TYPE) | |||
| for malformed in example: | |||
| for j, malformed in enumerate(example): | |||
| try: | |||
| parse_fn(malformed) | |||
| except UnexpectedInput as ut: | |||
| if ut.state == self.state: | |||
| if use_accepts and ut.accepts != self.accepts: | |||
| logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % | |||
| (self.state, self.accepts, ut.accepts, i, j)) | |||
| continue | |||
| try: | |||
| if ut.token == self.token: # Try exact match first | |||
| logger.debug("Exact Match at example [%s][%s]" % (i, j)) | |||
| return label | |||
| if token_type_match_fallback: | |||
| # Fallback to token types match | |||
| if (ut.token.type == self.token.type) and not candidate[-1]: | |||
| logger.debug("Token Type Fallback at example [%s][%s]" % (i, j)) | |||
| candidate = label, True | |||
| except AttributeError: | |||
| pass | |||
| if not candidate[0]: | |||
| logger.debug("Same State match at example [%s][%s]" % (i, j)) | |||
| candidate = label, False | |||
| return candidate[0] | |||
| @@ -72,19 +86,20 @@ class UnexpectedInput(LarkError): | |||
| class UnexpectedCharacters(LexError, UnexpectedInput): | |||
| def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): | |||
| if isinstance(seq, bytes): | |||
| message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) | |||
| else: | |||
| message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) | |||
| self.line = line | |||
| self.column = column | |||
| self.allowed = allowed | |||
| self.considered_tokens = considered_tokens | |||
| self.pos_in_stream = lex_pos | |||
| self.state = state | |||
| self.allowed = allowed | |||
| self.considered_tokens = considered_tokens | |||
| if isinstance(seq, bytes): | |||
| _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||
| else: | |||
| _s = seq[lex_pos] | |||
| message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) | |||
| message += '\n\n' + self.get_context(seq) | |||
| if allowed: | |||
| message += '\nExpecting: %s\n' % allowed | |||
| @@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput): | |||
| class UnexpectedToken(ParseError, UnexpectedInput): | |||
| def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): | |||
| self.token = token | |||
| self.expected = expected # XXX str shouldn't necessary | |||
| self.line = getattr(token, 'line', '?') | |||
| self.column = getattr(token, 'column', '?') | |||
| self.considered_rules = considered_rules | |||
| self.state = state | |||
| self.pos_in_stream = getattr(token, 'pos_in_stream', None) | |||
| self.state = state | |||
| self.token = token | |||
| self.expected = expected # XXX deprecate? `accepts` is better | |||
| self.considered_rules = considered_rules | |||
| self.puppet = puppet | |||
| # TODO Only calculate `accepts()` when we need to display it to the user | |||
| # This will improve performance when doing automatic error handling | |||
| self.accepts = puppet and puppet.accepts() | |||
| message = ("Unexpected token %r at line %s, column %s.\n" | |||
| "Expected one of: \n\t* %s\n" | |||
| % (token, self.line, self.column, '\n\t* '.join(self.expected))) | |||
| % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) | |||
| super(UnexpectedToken, self).__init__(message) | |||
| @@ -1,17 +1,17 @@ | |||
| from __future__ import absolute_import | |||
| import sys, os, pickle, hashlib, logging | |||
| import sys, os, pickle, hashlib | |||
| from io import open | |||
| from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii | |||
| from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger | |||
| from .load_grammar import load_grammar | |||
| from .tree import Tree | |||
| from .common import LexerConf, ParserConf | |||
| from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken | |||
| from .parse_tree_builder import ParseTreeBuilder | |||
| from .parser_frontends import get_frontend | |||
| from .parser_frontends import get_frontend, _get_lexer_callbacks | |||
| from .grammar import Rule | |||
| import re | |||
| @@ -214,7 +214,7 @@ class Lark(Serialize): | |||
| cache_fn = '.lark_cache_%s.tmp' % md5 | |||
| if FS.exists(cache_fn): | |||
| logging.debug('Loading grammar from cache: %s', cache_fn) | |||
| logger.debug('Loading grammar from cache: %s', cache_fn) | |||
| with FS.open(cache_fn, 'rb') as f: | |||
| self._load(f, self.options.transformer, self.options.postlex) | |||
| return | |||
| @@ -278,12 +278,10 @@ class Lark(Serialize): | |||
| rule.options.priority = None | |||
| # TODO Deprecate lexer_callbacks? | |||
| lexer_callbacks = dict(self.options.lexer_callbacks) | |||
| if self.options.transformer: | |||
| t = self.options.transformer | |||
| for term in self.terminals: | |||
| if hasattr(t, term.name): | |||
| lexer_callbacks[term.name] = getattr(t, term.name) | |||
| lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals) | |||
| if self.options.transformer | |||
| else {}) | |||
| lexer_callbacks.update(self.options.lexer_callbacks) | |||
| self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes) | |||
| @@ -293,7 +291,7 @@ class Lark(Serialize): | |||
| self.lexer = self._build_lexer() | |||
| if cache_fn: | |||
| logging.debug('Saving grammar to cache: %s', cache_fn) | |||
| logger.debug('Saving grammar to cache: %s', cache_fn) | |||
| with FS.open(cache_fn, 'wb') as f: | |||
| self.save(f) | |||
| @@ -344,7 +342,14 @@ class Lark(Serialize): | |||
| self.rules = [Rule.deserialize(r, memo) for r in data['rules']] | |||
| self.source = '<deserialized>' | |||
| self._prepare_callbacks() | |||
| self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) | |||
| self.parser = self.parser_class.deserialize( | |||
| data['parser'], | |||
| memo, | |||
| self._callbacks, | |||
| self.options.postlex, | |||
| self.options.transformer, | |||
| re_module | |||
| ) | |||
| return self | |||
| @classmethod | |||
| @@ -85,7 +85,7 @@ TERMINALS = { | |||
| 'RULE': '!?[_?]?[a-z][_a-z0-9]*', | |||
| 'TERMINAL': '_?[A-Z][_A-Z0-9]*', | |||
| 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', | |||
| 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, | |||
| 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, | |||
| '_NL': r'(\r?\n)+\s*', | |||
| 'WS': r'[ \t]+', | |||
| 'COMMENT': r'\s*//[^\n]*', | |||
| @@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||
| term_name = None | |||
| elif isinstance(p, PatternRE): | |||
| if p in self.term_reverse: # Kind of a wierd placement.name | |||
| if p in self.term_reverse: # Kind of a weird placement.name | |||
| term_name = self.term_reverse[p].name | |||
| else: | |||
| assert False, p | |||
| @@ -409,6 +409,13 @@ def _literal_to_pattern(literal): | |||
| flags = v[flag_start:] | |||
| assert all(f in _RE_FLAGS for f in flags), flags | |||
| if literal.type == 'STRING' and '\n' in v: | |||
| raise GrammarError('You cannot put newlines in string literals') | |||
| if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: | |||
| raise GrammarError('You can only use newlines in regular expressions ' | |||
| 'with the `x` (verbose) flag') | |||
| v = v[:flag_start] | |||
| assert v[0] == v[-1] and v[0] in '"/' | |||
| x = v[1:-1] | |||
| @@ -417,9 +424,11 @@ def _literal_to_pattern(literal): | |||
| if literal.type == 'STRING': | |||
| s = s.replace('\\\\', '\\') | |||
| return { 'STRING': PatternStr, | |||
| 'REGEXP': PatternRE }[literal.type](s, flags) | |||
| return PatternStr(s, flags) | |||
| elif literal.type == 'REGEXP': | |||
| return PatternRE(s, flags) | |||
| else: | |||
| assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' | |||
| @inline_args | |||
| @@ -841,7 +850,7 @@ class GrammarLoader: | |||
| if len(stmt.children) > 1: | |||
| path_node, arg1 = stmt.children | |||
| else: | |||
| path_node, = stmt.children | |||
| path_node ,= stmt.children | |||
| arg1 = None | |||
| if isinstance(arg1, Tree): # Multi import | |||
| @@ -1,6 +1,6 @@ | |||
| from .utils import get_regexp_width, Serialize | |||
| from .parsers.grammar_analysis import GrammarAnalyzer | |||
| from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token | |||
| from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef | |||
| from .parsers import earley, xearley, cyk | |||
| from .parsers.lalr_parser import LALR_Parser | |||
| from .grammar import Rule | |||
| @@ -58,6 +58,15 @@ class _ParserFrontend(Serialize): | |||
| return self.parser.parse(input, start, *args) | |||
| def _get_lexer_callbacks(transformer, terminals): | |||
| result = {} | |||
| for terminal in terminals: | |||
| callback = getattr(transformer, terminal.name, None) | |||
| if callback is not None: | |||
| result[terminal.name] = callback | |||
| return result | |||
| class WithLexer(_ParserFrontend): | |||
| lexer = None | |||
| parser = None | |||
| @@ -73,13 +82,18 @@ class WithLexer(_ParserFrontend): | |||
| self.postlex = lexer_conf.postlex | |||
| @classmethod | |||
| def deserialize(cls, data, memo, callbacks, postlex, re_module): | |||
| def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): | |||
| inst = super(WithLexer, cls).deserialize(data, memo) | |||
| inst.postlex = postlex | |||
| inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) | |||
| terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] | |||
| inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals) | |||
| inst.lexer_conf.re_module = re_module | |||
| inst.lexer_conf.skip_validation=True | |||
| inst.init_lexer() | |||
| return inst | |||
| def _serialize(self, data, memo): | |||
| @@ -229,4 +243,3 @@ class CYK(WithLexer): | |||
| def _apply_callback(self, tree): | |||
| return self.callbacks[tree.rule](tree.children) | |||
| @@ -10,11 +10,11 @@ is better documented here: | |||
| http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
| """ | |||
| import logging | |||
| from collections import deque | |||
| from ..visitors import Transformer_InPlace, v_args | |||
| from ..exceptions import UnexpectedEOF, UnexpectedToken | |||
| from ..utils import logger | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| from ..grammar import NonTerminal | |||
| from .earley_common import Item, TransitiveItem | |||
| @@ -301,7 +301,7 @@ class Parser: | |||
| try: | |||
| debug_walker = ForestToPyDotVisitor() | |||
| except ImportError: | |||
| logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image") | |||
| logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") | |||
| else: | |||
| debug_walker.visit(solutions[0], "sppf.png") | |||
| @@ -13,6 +13,7 @@ from collections import deque | |||
| from operator import attrgetter | |||
| from importlib import import_module | |||
| from ..utils import logger | |||
| from ..tree import Tree | |||
| from ..exceptions import ParseError | |||
| @@ -328,10 +329,17 @@ class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor): | |||
| self.output_stack[-1].children.append(node) | |||
| def visit_symbol_node_in(self, node): | |||
| if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority): | |||
| self.forest_sum_visitor.visit(node) | |||
| if not node.is_intermediate and node.is_ambiguous: | |||
| self.output_stack.append(Tree('_ambig', [])) | |||
| if node.is_ambiguous: | |||
| if self.forest_sum_visitor and isinf(node.priority): | |||
| self.forest_sum_visitor.visit(node) | |||
| if node.is_intermediate: | |||
| # TODO Support ambiguous intermediate nodes! | |||
| logger.warning("Ambiguous intermediate node in the SPPF: %s. " | |||
| "Lark does not currently process these ambiguities; resolving with the first derivation.", node) | |||
| return next(iter(node.children)) | |||
| else: | |||
| self.output_stack.append(Tree('_ambig', [])) | |||
| return iter(node.children) | |||
| def visit_symbol_node_out(self, node): | |||
| @@ -6,10 +6,9 @@ For now, shift/reduce conflicts are automatically resolved as shifts. | |||
| # Author: Erez Shinan (2017) | |||
| # Email : erezshin@gmail.com | |||
| import logging | |||
| from collections import defaultdict, deque | |||
| from collections import defaultdict | |||
| from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator | |||
| from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger | |||
| from ..exceptions import GrammarError | |||
| from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet | |||
| @@ -256,8 +255,8 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
| raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))) | |||
| if la in actions: | |||
| if self.debug: | |||
| logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) | |||
| logging.warning(' * %s', list(rules)[0]) | |||
| logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) | |||
| logger.warning(' * %s', list(rules)[0]) | |||
| else: | |||
| actions[la] = (Reduce, list(rules)[0]) | |||
| m[state] = { k.name: v for k, v in actions.items() } | |||
| @@ -59,10 +59,10 @@ class _Parser: | |||
| try: | |||
| return states[state][token.type] | |||
| except KeyError: | |||
| expected = [s for s in states[state].keys() if s.isupper()] | |||
| expected = {s for s in states[state].keys() if s.isupper()} | |||
| try: | |||
| puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) | |||
| except NameError: | |||
| except NameError: # For standalone parser | |||
| puppet = None | |||
| raise UnexpectedToken(token, expected, state=state, puppet=puppet) | |||
| @@ -3,8 +3,10 @@ | |||
| from copy import deepcopy | |||
| from .lalr_analysis import Shift, Reduce | |||
| from .. import Token | |||
| class ParserPuppet: | |||
| class ParserPuppet(object): | |||
| def __init__(self, parser, state_stack, value_stack, start, stream, set_state): | |||
| self.parser = parser | |||
| self._state_stack = state_stack | |||
| @@ -16,7 +18,7 @@ class ParserPuppet: | |||
| self.result = None | |||
| def feed_token(self, token): | |||
| """Advance the parser state, as if it just recieved `token` from the lexer | |||
| """Advance the parser state, as if it just received `token` from the lexer | |||
| """ | |||
| end_state = self.parser.parse_table.end_states[self._start] | |||
| @@ -66,14 +68,27 @@ class ParserPuppet: | |||
| self._set_state, | |||
| ) | |||
| def pretty(): | |||
| print("Puppet choices:") | |||
| for k, v in self.choices.items(): | |||
| print('\t-', k, '->', v) | |||
| print('stack size:', len(self._state_stack)) | |||
| def pretty(self): | |||
| out = ["Puppet choices:"] | |||
| for k, v in self.choices().items(): | |||
| out.append('\t- %s -> %s' % (k, v)) | |||
| out.append('stack size: %s' % len(self._state_stack)) | |||
| return '\n'.join(out) | |||
| def choices(self): | |||
| return self.parser.parse_table.states[self._state_stack[-1]] | |||
| def accepts(self): | |||
| accepts = set() | |||
| for t in self.choices(): | |||
| new_puppet = self.copy() | |||
| try: | |||
| new_puppet.feed_token(Token(t, '')) | |||
| except KeyError: | |||
| pass | |||
| else: | |||
| accepts.add(t) | |||
| return accepts | |||
| def resume_parse(self): | |||
| return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) | |||
| @@ -88,6 +88,8 @@ def main(fobj, start): | |||
| lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start) | |||
| print('# The file was automatically generated by Lark v%s' % lark.__version__) | |||
| print('__version__ = "%s"' % lark.__version__) | |||
| print() | |||
| for pyfile in EXTRACT_STANDALONE_FILES: | |||
| with open(os.path.join(_larkdir, pyfile)) as f: | |||
| @@ -4,10 +4,12 @@ except ImportError: | |||
| pass | |||
| from copy import deepcopy | |||
| from collections import OrderedDict | |||
| ###{standalone | |||
| from collections import OrderedDict | |||
| class Meta: | |||
| def __init__(self): | |||
| self.empty = True | |||
| @@ -4,51 +4,15 @@ from functools import reduce | |||
| from ast import literal_eval | |||
| from collections import deque | |||
| class fzset(frozenset): | |||
| def __repr__(self): | |||
| return '{%s}' % ', '.join(map(repr, self)) | |||
| def classify_bool(seq, pred): | |||
| true_elems = [] | |||
| false_elems = [] | |||
| for elem in seq: | |||
| if pred(elem): | |||
| true_elems.append(elem) | |||
| else: | |||
| false_elems.append(elem) | |||
| return true_elems, false_elems | |||
| def bfs(initial, expand): | |||
| open_q = deque(list(initial)) | |||
| visited = set(open_q) | |||
| while open_q: | |||
| node = open_q.popleft() | |||
| yield node | |||
| for next_node in expand(node): | |||
| if next_node not in visited: | |||
| visited.add(next_node) | |||
| open_q.append(next_node) | |||
| ###{standalone | |||
| import logging | |||
| logger = logging.getLogger("lark") | |||
| logger.addHandler(logging.StreamHandler()) | |||
| # Set to highest level, since we have some warnings amongst the code | |||
| # By default, we should not output any log messages | |||
| logger.setLevel(logging.CRITICAL) | |||
| def _serialize(value, memo): | |||
| if isinstance(value, Serialize): | |||
| return value.serialize(memo) | |||
| elif isinstance(value, list): | |||
| return [_serialize(elem, memo) for elem in value] | |||
| elif isinstance(value, frozenset): | |||
| return list(value) # TODO reversible? | |||
| elif isinstance(value, dict): | |||
| return {key:_serialize(elem, memo) for key, elem in value.items()} | |||
| return value | |||
| ###{standalone | |||
| def classify(seq, key=None, value=None): | |||
| d = {} | |||
| for item in seq: | |||
| @@ -302,13 +266,11 @@ def combine_alternatives(lists): | |||
| return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init) | |||
| class FS: | |||
| open = open | |||
| exists = os.path.exists | |||
| def isascii(s): | |||
| """ str.isascii only exists in python3.7+ """ | |||
| try: | |||
| @@ -318,4 +280,46 @@ def isascii(s): | |||
| s.encode('ascii') | |||
| return True | |||
| except (UnicodeDecodeError, UnicodeEncodeError): | |||
| return False | |||
| return False | |||
| class fzset(frozenset): | |||
| def __repr__(self): | |||
| return '{%s}' % ', '.join(map(repr, self)) | |||
| def classify_bool(seq, pred): | |||
| true_elems = [] | |||
| false_elems = [] | |||
| for elem in seq: | |||
| if pred(elem): | |||
| true_elems.append(elem) | |||
| else: | |||
| false_elems.append(elem) | |||
| return true_elems, false_elems | |||
| def bfs(initial, expand): | |||
| open_q = deque(list(initial)) | |||
| visited = set(open_q) | |||
| while open_q: | |||
| node = open_q.popleft() | |||
| yield node | |||
| for next_node in expand(node): | |||
| if next_node not in visited: | |||
| visited.add(next_node) | |||
| open_q.append(next_node) | |||
| def _serialize(value, memo): | |||
| if isinstance(value, Serialize): | |||
| return value.serialize(memo) | |||
| elif isinstance(value, list): | |||
| return [_serialize(elem, memo) for elem in value] | |||
| elif isinstance(value, frozenset): | |||
| return list(value) # TODO reversible? | |||
| elif isinstance(value, dict): | |||
| return {key:_serialize(elem, memo) for key, elem in value.items()} | |||
| return value | |||
| @@ -14,6 +14,8 @@ class Discard(Exception): | |||
| # Transformers | |||
| class _Decoratable: | |||
| "Provides support for decorating methods with @v_args" | |||
| @classmethod | |||
| def _apply_decorator(cls, decorator, **kwargs): | |||
| mro = getmro(cls) | |||
| @@ -13,3 +13,4 @@ pages: | |||
| - Classes Reference: classes.md | |||
| - Recipes: recipes.md | |||
| - Import grammars from Nearley: nearley.md | |||
| - Tutorial - JSON Parser: json_tutorial.md | |||
| @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function | |||
| import unittest | |||
| import logging | |||
| from lark import logger | |||
| from .test_trees import TestTrees | |||
| from .test_tools import TestStandalone | |||
| @@ -11,11 +12,13 @@ from .test_reconstructor import TestReconstructor | |||
| try: | |||
| from .test_nearley.test_nearley import TestNearley | |||
| except ImportError: | |||
| logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") | |||
| logger.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") | |||
| # from .test_selectors import TestSelectors | |||
| # from .test_grammars import TestPythonG, TestConfigG | |||
| from .test_logger import Testlogger | |||
| from .test_parser import ( | |||
| TestLalrStandard, | |||
| TestEarleyStandard, | |||
| @@ -31,7 +34,7 @@ from .test_parser import ( | |||
| TestParsers, | |||
| ) | |||
| logging.basicConfig(level=logging.INFO) | |||
| logger.setLevel(logging.INFO) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,65 @@ | |||
| import logging | |||
| from contextlib import contextmanager | |||
| from lark import Lark, logger | |||
| from unittest import TestCase, main | |||
| try: | |||
| from StringIO import StringIO | |||
| except ImportError: | |||
| from io import StringIO | |||
| @contextmanager | |||
| def capture_log(): | |||
| stream = StringIO() | |||
| orig_handler = logger.handlers[0] | |||
| del logger.handlers[:] | |||
| logger.addHandler(logging.StreamHandler(stream)) | |||
| yield stream | |||
| del logger.handlers[:] | |||
| logger.addHandler(orig_handler) | |||
| class Testlogger(TestCase): | |||
| def test_debug(self): | |||
| logger.setLevel(logging.DEBUG) | |||
| collision_grammar = ''' | |||
| start: as as | |||
| as: a* | |||
| a: "a" | |||
| ''' | |||
| with capture_log() as log: | |||
| Lark(collision_grammar, parser='lalr', debug=True) | |||
| log = log.getvalue() | |||
| # since there is a Shift/Reduce conflict involving the anonymous terminal A, | |||
| # the symbol A should appear in the log message as a hint | |||
| self.assertIn("A", log) | |||
| def test_non_debug(self): | |||
| logger.setLevel(logging.DEBUG) | |||
| collision_grammar = ''' | |||
| start: as as | |||
| as: a* | |||
| a: "a" | |||
| ''' | |||
| with capture_log() as log: | |||
| Lark(collision_grammar, parser='lalr', debug=False) | |||
| log = log.getvalue() | |||
| # no log message | |||
| self.assertEqual(len(log), 0) | |||
| def test_loglevel_higher(self): | |||
| logger.setLevel(logging.ERROR) | |||
| collision_grammar = ''' | |||
| start: as as | |||
| as: a* | |||
| a: "a" | |||
| ''' | |||
| with capture_log() as log: | |||
| Lark(collision_grammar, parser='lalr', debug=True) | |||
| log = log.getvalue() | |||
| # no log message | |||
| self.assertEqual(len(log), 0) | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -6,16 +6,17 @@ import logging | |||
| import os | |||
| import codecs | |||
| logging.basicConfig(level=logging.INFO) | |||
| from lark import logger | |||
| from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main | |||
| logger.setLevel(logging.INFO) | |||
| TEST_PATH = os.path.abspath(os.path.dirname(__file__)) | |||
| NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley') | |||
| BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') | |||
| if not os.path.exists(NEARLEY_PATH): | |||
| logging.warn("Nearley not installed. Skipping Nearley tests!") | |||
| logger.warn("Nearley not installed. Skipping Nearley tests!") | |||
| raise ImportError("Skipping Nearley tests!") | |||
| import js2py # Ensures that js2py exists, to avoid failing tests | |||
| @@ -23,13 +23,13 @@ from io import ( | |||
| open, | |||
| ) | |||
| logging.basicConfig(level=logging.INFO) | |||
| try: | |||
| import regex | |||
| except ImportError: | |||
| regex = None | |||
| from lark import logger | |||
| from lark.lark import Lark | |||
| from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters | |||
| from lark.tree import Tree | |||
| @@ -37,6 +37,7 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args | |||
| from lark.grammar import Rule | |||
| from lark.lexer import TerminalDef, Lexer, TraditionalLexer | |||
| logger.setLevel(logging.INFO) | |||
| __path__ = os.path.dirname(__file__) | |||
| @@ -721,7 +722,8 @@ def _make_parser_test(LEXER, PARSER): | |||
| """) | |||
| g.parse('\x01\x02\x03') | |||
| @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly") | |||
| @unittest.skipIf(sys.version_info[0]==2 or sys.version_info[:2]==(3, 4), | |||
| "bytes parser isn't perfect in Python2, exceptions don't work correctly") | |||
| def test_bytes_utf8(self): | |||
| g = r""" | |||
| start: BOM? char+ | |||
| @@ -1261,6 +1263,32 @@ def _make_parser_test(LEXER, PARSER): | |||
| tree = l.parse('aA') | |||
| self.assertEqual(tree.children, ['a', 'A']) | |||
| def test_token_flags_verbose(self): | |||
| g = _Lark(r"""start: NL | ABC | |||
| ABC: / [a-z] /x | |||
| NL: /\n/ | |||
| """) | |||
| x = g.parse('a') | |||
| self.assertEqual(x.children, ['a']) | |||
| def test_token_flags_verbose_multiline(self): | |||
| g = _Lark(r"""start: ABC | |||
| ABC: / a b c | |||
| d | |||
| e f | |||
| /x | |||
| """) | |||
| x = g.parse('abcdef') | |||
| self.assertEqual(x.children, ['abcdef']) | |||
| def test_token_multiline_only_works_with_x_flag(self): | |||
| g = r"""start: ABC | |||
| ABC: / a b c | |||
| d | |||
| e f | |||
| /i | |||
| """ | |||
| self.assertRaises( GrammarError, _Lark, g) | |||
| @unittest.skipIf(PARSER == 'cyk', "No empty rules") | |||
| def test_twice_empty(self): | |||
| @@ -106,6 +106,33 @@ class TestStandalone(TestCase): | |||
| x = l.parse('(\n)\n') | |||
| self.assertEqual(x, Tree('start', [])) | |||
| def test_transformer(self): | |||
| grammar = r""" | |||
| start: some_rule "(" SOME_TERMINAL ")" | |||
| some_rule: SOME_TERMINAL | |||
| SOME_TERMINAL: /[A-Za-z_][A-Za-z0-9_]*/ | |||
| """ | |||
| context = self._create_standalone(grammar) | |||
| _Lark = context["Lark_StandAlone"] | |||
| _Token = context["Token"] | |||
| _Tree = context["Tree"] | |||
| class MyTransformer(context["Transformer"]): | |||
| def SOME_TERMINAL(self, token): | |||
| return _Token("SOME_TERMINAL", "token is transformed") | |||
| def some_rule(self, children): | |||
| return _Tree("rule_is_transformed", []) | |||
| parser = _Lark(transformer=MyTransformer()) | |||
| self.assertEqual( | |||
| parser.parse("FOO(BAR)"), | |||
| _Tree("start", [ | |||
| _Tree("rule_is_transformed", []), | |||
| _Token("SOME_TERMINAL", "token is transformed") | |||
| ]) | |||
| ) | |||
| if __name__ == '__main__': | |||