@@ -12,6 +12,7 @@ Lark can:
- Build a parse-tree automagically, no construction code required
- Outperform all other Python libraries when using LALR(1) (Yes, including PLY)
- Run on every Python interpreter (it's pure-python)
- Generate a stand-alone parser (for LALR(1) grammars)

And many more features. Read ahead and find out.
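
For a quick illustration of the points above, here is a minimal sketch (not part of this changeset) that builds an LALR(1) parser from a made-up grammar and prints the automatically constructed tree:

```python
from lark import Lark

# A tiny, invented grammar, just to illustrate the features listed above.
parser = Lark(r"""
    start: "(" NUMBER ("," NUMBER)* ")"

    NUMBER: /\d+/
    %import common.WS
    %ignore WS
""", parser='lalr')          # LALR(1) mode; Earley is the default

tree = parser.parse("(1, 2, 3)")
print(tree.pretty())         # the parse tree was built automatically
```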
@@ -66,10 +67,11 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
- Builds a parse-tree (AST) automagically, based on the structure of the grammar
- **Earley** parser
    - Can parse *ALL* context-free grammars
    - Full support for ambiguity in grammar
    - Can parse all context-free grammars
    - Full support for ambiguous grammars
- **LALR(1)** parser
    - Competitive with PLY
    - Fast and light, competitive with PLY
    - Can generate a stand-alone parser
- **EBNF** grammar
- **Unicode** fully supported
- **Python 2 & 3** compatible
@@ -86,7 +88,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/
#### Performance comparison

Lower is better!
Lark is the fastest and lightest (lower is better)

![Run-time Comparison](docs/comparison_runtime.png)
@@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail
#### Feature comparison

| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? |
|:--------|:----------|:--------|:-------------|:--------------------|:----------------------|
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! |
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No |
| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* |
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* |
| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No |
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* |
| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone |
|:--------|:----------|:--------|:-------------|:--------------------|:----------------------|:---------------------|:-----------------------|
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) |
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No |
| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No |
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No |
| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No |
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No |

(\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*)
(\* *PEGs cannot handle non-deterministic grammars. Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*)

### Projects using Lark
@@ -0,0 +1 @@
python -m lark.tools.standalone json.g > json_parser.py
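
The command above writes a self-contained module that needs only the Python standard library at runtime. Assuming it was run against the JSON grammar shown below, a minimal sanity check of the generated module might look like this (the input string is made up):

```python
# Quick check of the generated module; lark itself does not need to be installed.
from json_parser import Lark_StandAlone

parser = Lark_StandAlone()            # no transformer: parse() returns a plain tree
tree = parser.parse('{"answer": 42}')
print(tree.pretty())
```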
@@ -0,0 +1,21 @@
?start: value

?value: object
      | array
      | string
      | SIGNED_NUMBER -> number
      | "true" -> true
      | "false" -> false
      | "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value

string : ESCAPED_STRING

%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
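
Before (or instead of) generating a stand-alone module, the same grammar can be exercised with the regular Lark API. A rough sketch, assuming the grammar is saved as `json.g` as in the generation command above; the sample input is invented:

```python
from lark import Lark

with open('json.g') as f:
    json_parser = Lark(f.read(), parser='lalr')

tree = json_parser.parse('{"items": [1, 2.5, null, true]}')
print(tree.pretty())   # rule aliases such as "number" and "null" appear as node names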
@@ -0,0 +1,794 @@ | |||||
# The file was automatically generated by Lark v0.5.2 | |||||
# | |||||
# | |||||
# Lark Stand-alone Generator Tool | |||||
# ---------------------------------- | |||||
# Generates a stand-alone LALR(1) parser with a standard lexer | |||||
# | |||||
# Git: https://github.com/erezsh/lark | |||||
# Author: Erez Shinan (erezshin@gmail.com) | |||||
# | |||||
# | |||||
# >>> LICENSE | |||||
# | |||||
# This tool and its generated code use a separate license from Lark. | |||||
# | |||||
# It is licensed under GPLv2 or above. | |||||
# | |||||
# If you wish to purchase a commercial license for this tool and its | |||||
# generated code, contact me via email. | |||||
# | |||||
# This program is free software: you can redistribute it and/or modify | |||||
# it under the terms of the GNU General Public License as published by | |||||
# the Free Software Foundation, either version 2 of the License, or | |||||
# (at your option) any later version. | |||||
# | |||||
# This program is distributed in the hope that it will be useful, | |||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
# GNU General Public License for more details. | |||||
# | |||||
# See <http://www.gnu.org/licenses/>. | |||||
# | |||||
# | |||||
import types | |||||
import functools | |||||
from contextlib import contextmanager | |||||
Str = type(u'') | |||||
def inline_args(f): | |||||
# print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) | |||||
if isinstance(f, types.FunctionType): | |||||
@functools.wraps(f) | |||||
def _f_func(self, args): | |||||
return f(self, *args) | |||||
return _f_func | |||||
elif isinstance(f, (type, types.BuiltinFunctionType)): | |||||
@functools.wraps(f) | |||||
def _f_builtin(_self, args): | |||||
return f(*args) | |||||
return _f_builtin | |||||
elif isinstance(f, types.MethodType): | |||||
@functools.wraps(f.__func__) | |||||
def _f(self, args): | |||||
return f.__func__(self, *args) | |||||
return _f | |||||
else: | |||||
@functools.wraps(f.__call__.__func__) | |||||
def _f(self, args): | |||||
return f.__call__.__func__(self, *args) | |||||
return _f | |||||
try: | |||||
from contextlib import suppress # Python 3 | |||||
except ImportError: | |||||
@contextmanager | |||||
def suppress(*excs): | |||||
'''Catch and dismiss the provided exception | |||||
>>> x = 'hello' | |||||
>>> with suppress(IndexError): | |||||
... x = x[10] | |||||
>>> x | |||||
'hello' | |||||
''' | |||||
try: | |||||
yield | |||||
except excs: | |||||
pass | |||||
def is_terminal(sym): | |||||
return sym.isupper() | |||||
class GrammarError(Exception): | |||||
pass | |||||
class ParseError(Exception): | |||||
pass | |||||
class UnexpectedToken(ParseError): | |||||
def __init__(self, token, expected, seq, index): | |||||
self.token = token | |||||
self.expected = expected | |||||
self.line = getattr(token, 'line', '?') | |||||
self.column = getattr(token, 'column', '?') | |||||
try: | |||||
context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) | |||||
except AttributeError: | |||||
context = seq[index:index+5] | |||||
except TypeError: | |||||
context = "<no context>" | |||||
message = ("Unexpected token %r at line %s, column %s.\n" | |||||
"Expected: %s\n" | |||||
"Context: %s" % (token, self.line, self.column, expected, context)) | |||||
super(UnexpectedToken, self).__init__(message) | |||||
class Tree(object): | |||||
def __init__(self, data, children): | |||||
self.data = data | |||||
self.children = list(children) | |||||
def __repr__(self): | |||||
return 'Tree(%s, %s)' % (self.data, self.children) | |||||
def _pretty_label(self): | |||||
return self.data | |||||
def _pretty(self, level, indent_str): | |||||
if len(self.children) == 1 and not isinstance(self.children[0], Tree): | |||||
return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n'] | |||||
l = [ indent_str*level, self._pretty_label(), '\n' ] | |||||
for n in self.children: | |||||
if isinstance(n, Tree): | |||||
l += n._pretty(level+1, indent_str) | |||||
else: | |||||
l += [ indent_str*(level+1), '%s' % n, '\n' ] | |||||
return l | |||||
def pretty(self, indent_str=' '): | |||||
return ''.join(self._pretty(0, indent_str)) | |||||
class Transformer(object): | |||||
def _get_func(self, name): | |||||
return getattr(self, name) | |||||
def transform(self, tree): | |||||
items = [] | |||||
for c in tree.children: | |||||
try: | |||||
items.append(self.transform(c) if isinstance(c, Tree) else c) | |||||
except Discard: | |||||
pass | |||||
try: | |||||
f = self._get_func(tree.data) | |||||
except AttributeError: | |||||
return self.__default__(tree.data, items) | |||||
else: | |||||
return f(items) | |||||
def __default__(self, data, children): | |||||
return Tree(data, children) | |||||
def __mul__(self, other): | |||||
return TransformerChain(self, other) | |||||
class Discard(Exception): | |||||
pass | |||||
class TransformerChain(object): | |||||
def __init__(self, *transformers): | |||||
self.transformers = transformers | |||||
def transform(self, tree): | |||||
for t in self.transformers: | |||||
tree = t.transform(tree) | |||||
return tree | |||||
def __mul__(self, other): | |||||
return TransformerChain(*self.transformers + (other,)) | |||||
class InlineTransformer(Transformer): | |||||
def _get_func(self, name): # use super()._get_func | |||||
return inline_args(getattr(self, name)).__get__(self) | |||||
class Visitor(object): | |||||
def visit(self, tree): | |||||
for child in tree.children: | |||||
if isinstance(child, Tree): | |||||
self.visit(child) | |||||
f = getattr(self, tree.data, self.__default__) | |||||
f(tree) | |||||
return tree | |||||
def __default__(self, tree): | |||||
pass | |||||
class Visitor_NoRecurse(Visitor): | |||||
def visit(self, tree): | |||||
subtrees = list(tree.iter_subtrees()) | |||||
for subtree in (subtrees): | |||||
getattr(self, subtree.data, self.__default__)(subtree) | |||||
return tree | |||||
class Transformer_NoRecurse(Transformer): | |||||
def transform(self, tree): | |||||
subtrees = list(tree.iter_subtrees()) | |||||
def _t(t): | |||||
# Assumes t is already transformed | |||||
try: | |||||
f = self._get_func(t.data) | |||||
except AttributeError: | |||||
return self.__default__(t) | |||||
else: | |||||
return f(t) | |||||
for subtree in subtrees: | |||||
children = [] | |||||
for c in subtree.children: | |||||
try: | |||||
children.append(_t(c) if isinstance(c, Tree) else c) | |||||
except Discard: | |||||
pass | |||||
subtree.children = children | |||||
return _t(tree) | |||||
def __default__(self, t): | |||||
return t | |||||
class Indenter: | |||||
def __init__(self): | |||||
self.paren_level = 0 | |||||
self.indent_level = [0] | |||||
def handle_NL(self, token): | |||||
if self.paren_level > 0: | |||||
return | |||||
yield token | |||||
indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces | |||||
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len | |||||
if indent > self.indent_level[-1]: | |||||
self.indent_level.append(indent) | |||||
yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) | |||||
else: | |||||
while indent < self.indent_level[-1]: | |||||
self.indent_level.pop() | |||||
yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) | |||||
assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | |||||
def process(self, stream): | |||||
for token in stream: | |||||
if token.type == self.NL_type: | |||||
for t in self.handle_NL(token): | |||||
yield t | |||||
else: | |||||
yield token | |||||
if token.type in self.OPEN_PAREN_types: | |||||
self.paren_level += 1 | |||||
elif token.type in self.CLOSE_PAREN_types: | |||||
self.paren_level -= 1 | |||||
assert self.paren_level >= 0 | |||||
while len(self.indent_level) > 1: | |||||
self.indent_level.pop() | |||||
yield Token(self.DEDENT_type, '') | |||||
assert self.indent_level == [0], self.indent_level | |||||
# XXX Hack for ContextualLexer. Maybe there's a more elegant solution? | |||||
@property | |||||
def always_accept(self): | |||||
return (self.NL_type,) | |||||
class LexError(Exception): | |||||
pass | |||||
class UnexpectedInput(LexError): | |||||
def __init__(self, seq, lex_pos, line, column, allowed=None): | |||||
context = seq[lex_pos:lex_pos+5] | |||||
message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) | |||||
super(UnexpectedInput, self).__init__(message) | |||||
self.line = line | |||||
self.column = column | |||||
self.context = context | |||||
self.allowed = allowed | |||||
class Token(Str): | |||||
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): | |||||
inst = Str.__new__(cls, value) | |||||
inst.type = type_ | |||||
inst.pos_in_stream = pos_in_stream | |||||
inst.value = value | |||||
inst.line = line | |||||
inst.column = column | |||||
return inst | |||||
@classmethod | |||||
def new_borrow_pos(cls, type_, value, borrow_t): | |||||
return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) | |||||
def __repr__(self): | |||||
return 'Token(%s, %r)' % (self.type, self.value) | |||||
def __deepcopy__(self, memo): | |||||
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) | |||||
def __eq__(self, other): | |||||
if isinstance(other, Token) and self.type != other.type: | |||||
return False | |||||
return Str.__eq__(self, other) | |||||
__hash__ = Str.__hash__ | |||||
class LineCounter: | |||||
def __init__(self): | |||||
self.newline_char = '\n' | |||||
self.char_pos = 0 | |||||
self.line = 1 | |||||
self.column = 0 | |||||
self.line_start_pos = 0 | |||||
def feed(self, token, test_newline=True): | |||||
"""Consume a token and calculate the new line & column. | |||||
        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
""" | |||||
if test_newline: | |||||
newlines = token.count(self.newline_char) | |||||
if newlines: | |||||
self.line += newlines | |||||
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 | |||||
self.char_pos += len(token) | |||||
self.column = self.char_pos - self.line_start_pos | |||||
class _Lex: | |||||
"Built to serve both Lexer and ContextualLexer" | |||||
def __init__(self, lexer): | |||||
self.lexer = lexer | |||||
def lex(self, stream, newline_types, ignore_types): | |||||
newline_types = list(newline_types) | |||||
        ignore_types = list(ignore_types)
line_ctr = LineCounter() | |||||
while True: | |||||
lexer = self.lexer | |||||
for mre, type_from_index in lexer.mres: | |||||
m = mre.match(stream, line_ctr.char_pos) | |||||
if m: | |||||
value = m.group(0) | |||||
type_ = type_from_index[m.lastindex] | |||||
if type_ not in ignore_types: | |||||
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||||
if t.type in lexer.callback: | |||||
t = lexer.callback[t.type](t) | |||||
lexer = yield t | |||||
line_ctr.feed(value, type_ in newline_types) | |||||
break | |||||
else: | |||||
if line_ctr.char_pos < len(stream): | |||||
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||||
break | |||||
class UnlessCallback: | |||||
def __init__(self, mres): | |||||
self.mres = mres | |||||
def __call__(self, t): | |||||
for mre, type_from_index in self.mres: | |||||
m = mre.match(t.value) | |||||
if m: | |||||
value = m.group(0) | |||||
t.type = type_from_index[m.lastindex] | |||||
break | |||||
return t | |||||
class NodeBuilder: | |||||
def __init__(self, tree_class, name): | |||||
self.tree_class = tree_class | |||||
self.name = name | |||||
def __call__(self, children): | |||||
return self.tree_class(self.name, children) | |||||
class Expand1: | |||||
def __init__(self, node_builder): | |||||
self.node_builder = node_builder | |||||
def __call__(self, children): | |||||
if len(children) == 1: | |||||
return children[0] | |||||
else: | |||||
return self.node_builder(children) | |||||
class Factory: | |||||
def __init__(self, cls, *args): | |||||
self.cls = cls | |||||
self.args = args | |||||
def __call__(self, node_builder): | |||||
return self.cls(node_builder, *self.args) | |||||
class TokenWrapper: | |||||
"Used for fixing the results of scanless parsing" | |||||
def __init__(self, node_builder, token_name): | |||||
self.node_builder = node_builder | |||||
self.token_name = token_name | |||||
def __call__(self, children): | |||||
return self.node_builder( [Token(self.token_name, ''.join(children))] ) | |||||
def identity(node_builder): | |||||
return node_builder | |||||
class ChildFilter: | |||||
def __init__(self, node_builder, to_include): | |||||
self.node_builder = node_builder | |||||
self.to_include = to_include | |||||
def __call__(self, children): | |||||
filtered = [] | |||||
for i, to_expand in self.to_include: | |||||
if to_expand: | |||||
filtered += children[i].children | |||||
else: | |||||
filtered.append(children[i]) | |||||
return self.node_builder(filtered) | |||||
def create_rule_handler(expansion, keep_all_tokens, filter_out): | |||||
# if not keep_all_tokens: | |||||
to_include = [(i, not is_terminal(sym) and sym.startswith('_')) | |||||
for i, sym in enumerate(expansion) | |||||
if keep_all_tokens | |||||
or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out) | |||||
] | |||||
if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): | |||||
return Factory(ChildFilter, to_include) | |||||
# else, if no filtering required.. | |||||
return identity | |||||
class PropagatePositions: | |||||
def __init__(self, node_builder): | |||||
self.node_builder = node_builder | |||||
def __call__(self, children): | |||||
res = self.node_builder(children) | |||||
if children: | |||||
for a in children: | |||||
with suppress(AttributeError): | |||||
res.line = a.line | |||||
res.column = a.column | |||||
break | |||||
for a in reversed(children): | |||||
with suppress(AttributeError): | |||||
res.end_line = a.end_line | |||||
res.end_col = a.end_col | |||||
break | |||||
return res | |||||
class Callback(object): | |||||
pass | |||||
class ParseTreeBuilder: | |||||
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): | |||||
self.tree_class = tree_class | |||||
self.propagate_positions = propagate_positions | |||||
self.always_keep_all_tokens = keep_all_tokens | |||||
self.rule_builders = list(self._init_builders(rules)) | |||||
self.user_aliases = {} | |||||
def _init_builders(self, rules): | |||||
filter_out = set() | |||||
for rule in rules: | |||||
if rule.options and rule.options.filter_out: | |||||
assert rule.origin.startswith('_') # Just to make sure | |||||
filter_out.add(rule.origin) | |||||
for rule in rules: | |||||
options = rule.options | |||||
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) | |||||
expand1 = options.expand1 if options else False | |||||
create_token = options.create_token if options else False | |||||
wrapper_chain = filter(None, [ | |||||
(expand1 and not rule.alias) and Expand1, | |||||
create_token and Factory(TokenWrapper, create_token), | |||||
create_rule_handler(rule.expansion, keep_all_tokens, filter_out), | |||||
self.propagate_positions and PropagatePositions, | |||||
]) | |||||
yield rule, wrapper_chain | |||||
def create_callback(self, transformer=None): | |||||
callback = Callback() | |||||
for rule, wrapper_chain in self.rule_builders: | |||||
internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) | |||||
user_callback_name = rule.alias or rule.origin | |||||
try: | |||||
f = transformer._get_func(user_callback_name) | |||||
except AttributeError: | |||||
f = NodeBuilder(self.tree_class, user_callback_name) | |||||
self.user_aliases[rule] = rule.alias | |||||
rule.alias = internal_callback_name | |||||
for w in wrapper_chain: | |||||
f = w(f) | |||||
if hasattr(callback, internal_callback_name): | |||||
raise GrammarError("Rule '%s' already exists" % (rule,)) | |||||
setattr(callback, internal_callback_name, f) | |||||
return callback | |||||
class _Parser: | |||||
def __init__(self, parse_table, callbacks): | |||||
self.states = parse_table.states | |||||
self.start_state = parse_table.start_state | |||||
self.end_state = parse_table.end_state | |||||
self.callbacks = callbacks | |||||
def parse(self, seq, set_state=None): | |||||
i = 0 | |||||
token = None | |||||
stream = iter(seq) | |||||
states = self.states | |||||
state_stack = [self.start_state] | |||||
value_stack = [] | |||||
if set_state: set_state(self.start_state) | |||||
def get_action(key): | |||||
state = state_stack[-1] | |||||
try: | |||||
return states[state][key] | |||||
except KeyError: | |||||
expected = states[state].keys() | |||||
raise UnexpectedToken(token, expected, seq, i) | |||||
def reduce(rule): | |||||
size = len(rule.expansion) | |||||
if size: | |||||
s = value_stack[-size:] | |||||
del state_stack[-size:] | |||||
del value_stack[-size:] | |||||
else: | |||||
s = [] | |||||
value = self.callbacks[rule](s) | |||||
_action, new_state = get_action(rule.origin) | |||||
assert _action is Shift | |||||
state_stack.append(new_state) | |||||
value_stack.append(value) | |||||
# Main LALR-parser loop | |||||
try: | |||||
token = next(stream) | |||||
i += 1 | |||||
while True: | |||||
action, arg = get_action(token.type) | |||||
assert arg != self.end_state | |||||
if action is Shift: | |||||
state_stack.append(arg) | |||||
value_stack.append(token) | |||||
if set_state: set_state(arg) | |||||
token = next(stream) | |||||
i += 1 | |||||
else: | |||||
reduce(arg) | |||||
except StopIteration: | |||||
pass | |||||
while True: | |||||
_action, arg = get_action('$END') | |||||
if _action is Shift: | |||||
assert arg == self.end_state | |||||
val ,= value_stack | |||||
return val | |||||
else: | |||||
reduce(arg) | |||||
class Rule(object): | |||||
""" | |||||
origin : a symbol | |||||
expansion : a list of symbols | |||||
""" | |||||
def __init__(self, origin, expansion, alias=None, options=None): | |||||
self.origin = origin | |||||
self.expansion = expansion | |||||
self.alias = alias | |||||
self.options = options | |||||
def __str__(self): | |||||
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||||
def __repr__(self): | |||||
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) | |||||
class RuleOptions: | |||||
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||||
self.keep_all_tokens = keep_all_tokens | |||||
self.expand1 = expand1 | |||||
self.create_token = create_token # used for scanless postprocessing | |||||
self.priority = priority | |||||
self.filter_out = filter_out # remove this rule from the tree | |||||
# used for "token"-rules in scanless | |||||
def __repr__(self): | |||||
return 'RuleOptions(%r, %r, %r, %r, %r)' % ( | |||||
self.keep_all_tokens, | |||||
self.expand1, | |||||
self.create_token, | |||||
self.priority, | |||||
self.filter_out | |||||
) | |||||
Shift = 0 | |||||
Reduce = 1 | |||||
import re | |||||
MRES = ( | |||||
[('(?P<SIGNED_NUMBER>(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P<ESCAPED_STRING>\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P<WS>(?:[ \t\x0c' | |||||
'\r\n' | |||||
'])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', | |||||
{1: 'SIGNED_NUMBER', | |||||
2: 'ESCAPED_STRING', | |||||
3: 'WS', | |||||
4: '__FALSE1', | |||||
5: '__NULL2', | |||||
6: '__TRUE0', | |||||
7: '__COLON', | |||||
8: '__COMMA', | |||||
9: '__LBRACE', | |||||
10: '__LSQB', | |||||
11: '__RBRACE', | |||||
12: '__RSQB'})] | |||||
) | |||||
LEXER_CALLBACK = ( | |||||
{} | |||||
) | |||||
NEWLINE_TYPES = ['WS'] | |||||
IGNORE_TYPES = ['WS'] | |||||
class LexerRegexps: pass | |||||
lexer_regexps = LexerRegexps() | |||||
lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] | |||||
lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) | |||||
for n, mres in LEXER_CALLBACK.items()} | |||||
lexer = _Lex(lexer_regexps) | |||||
def lex(stream): | |||||
return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) | |||||
RULES = { | |||||
0: Rule('start', ['value'], None, RuleOptions(False, True, None, None, False)), | |||||
1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)), | |||||
2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)), | |||||
3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)), | |||||
4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)), | |||||
5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)), | |||||
6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)), | |||||
7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)), | |||||
8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), | |||||
9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), | |||||
10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), | |||||
11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), | |||||
12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), | |||||
13: Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), | |||||
14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)), | |||||
15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), | |||||
16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None), | |||||
17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None), | |||||
18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None), | |||||
19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None), | |||||
} | |||||
parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) | |||||
class ParseTable: pass | |||||
parse_table = ParseTable() | |||||
STATES = { | |||||
0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)}, | |||||
1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)}, | |||||
2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)}, | |||||
3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)}, | |||||
4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)}, | |||||
5: {12: (0, 16)}, | |||||
6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)}, | |||||
7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)}, | |||||
8: {12: (1, 0)}, | |||||
9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)}, | |||||
10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)}, | |||||
11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)}, | |||||
12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)}, | |||||
13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)}, | |||||
14: {14: (0, 19), 13: (0, 20), 18: (0, 21)}, | |||||
15: {17: (0, 22)}, | |||||
16: {}, | |||||
17: {19: (0, 23), 15: (0, 24), 13: (0, 25)}, | |||||
18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)}, | |||||
19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)}, | |||||
20: {9: (0, 10), 11: (0, 15), 16: (0, 26)}, | |||||
21: {14: (0, 27), 13: (0, 28)}, | |||||
22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)}, | |||||
23: {15: (0, 30), 13: (0, 31)}, | |||||
24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)}, | |||||
25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, | |||||
26: {13: (1, 18), 14: (1, 18)}, | |||||
27: {13: (1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)}, | |||||
28: {16: (0, 33), 9: (0, 10), 11: (0, 15)}, | |||||
29: {13: (1, 14), 14: (1, 14)}, | |||||
30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)}, | |||||
31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, | |||||
32: {15: (1, 16), 13: (1, 16)}, | |||||
33: {13: (1, 19), 14: (1, 19)}, | |||||
34: {15: (1, 17), 13: (1, 17)}, | |||||
} | |||||
TOKEN_TYPES = ( | |||||
{0: '__TRUE0', | |||||
1: '__LBRACE', | |||||
2: 'array', | |||||
3: 'object', | |||||
4: 'start', | |||||
5: '__LSQB', | |||||
6: 'SIGNED_NUMBER', | |||||
7: 'value', | |||||
8: '__NULL2', | |||||
9: 'ESCAPED_STRING', | |||||
10: '__FALSE1', | |||||
11: 'string', | |||||
12: '$END', | |||||
13: '__COMMA', | |||||
14: '__RBRACE', | |||||
15: '__RSQB', | |||||
16: 'pair', | |||||
17: '__COLON', | |||||
18: '__anon_star_1', | |||||
19: '__anon_star_0'} | |||||
) | |||||
parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()} | |||||
for s, acts in STATES.items()} | |||||
parse_table.start_state = 0 | |||||
parse_table.end_state = 16 | |||||
class Lark_StandAlone: | |||||
def __init__(self, transformer=None, postlex=None): | |||||
callback = parse_tree_builder.create_callback(transformer=transformer) | |||||
callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()} | |||||
self.parser = _Parser(parse_table, callbacks) | |||||
self.postlex = postlex | |||||
def parse(self, stream): | |||||
tokens = lex(stream) | |||||
if self.postlex: tokens = self.postlex.process(tokens) | |||||
return self.parser.parse(tokens) |
@@ -0,0 +1,25 @@
import sys

from json_parser import Lark_StandAlone, Transformer, inline_args

class TreeToJson(Transformer):
    @inline_args
    def string(self, s):
        return s[1:-1].replace('\\"', '"')

    array = list
    pair = tuple
    object = dict
    number = inline_args(float)

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

parser = Lark_StandAlone(transformer=TreeToJson())

if __name__ == '__main__':
    with open(sys.argv[1]) as f:
        print(parser.parse(f.read()))
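
Reusing the `parser` object defined above, the transformer turns the parse tree directly into plain Python objects; a small, made-up example:

```python
# Sample input is invented; numbers come back as floats because number = inline_args(float).
data = parser.parse('{"name": "lark", "version": [0, 5, 2], "stable": true}')
print(data)         # roughly: {'name': 'lark', 'version': [0.0, 5.0, 2.0], 'stable': True}
print(type(data))   # a plain dict, no Tree objects left
```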
@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError
from .lark import Lark
from .utils import inline_args

__version__ = "0.5.1"
__version__ = "0.5.2"
@@ -1,16 +1,21 @@ | |||||
import re | import re | ||||
import sre_parse | |||||
import sys | import sys | ||||
from .utils import get_regexp_width | |||||
Py36 = (sys.version_info[:2] >= (3, 6)) | Py36 = (sys.version_info[:2] >= (3, 6)) | ||||
###{standalone | |||||
def is_terminal(sym): | |||||
return sym.isupper() | |||||
class GrammarError(Exception): | class GrammarError(Exception): | ||||
pass | pass | ||||
class ParseError(Exception): | class ParseError(Exception): | ||||
pass | pass | ||||
class UnexpectedToken(ParseError): | class UnexpectedToken(ParseError): | ||||
def __init__(self, token, expected, seq, index): | def __init__(self, token, expected, seq, index): | ||||
self.token = token | self.token = token | ||||
@@ -31,9 +36,8 @@ class UnexpectedToken(ParseError): | |||||
super(UnexpectedToken, self).__init__(message) | super(UnexpectedToken, self).__init__(message) | ||||
###} | |||||
def is_terminal(sym): | |||||
return isinstance(sym, Terminal) or sym.isupper() or sym == '$end' | |||||
class LexerConf: | class LexerConf: | ||||
@@ -44,7 +48,6 @@ class LexerConf: | |||||
class ParserConf: | class ParserConf: | ||||
def __init__(self, rules, callback, start): | def __init__(self, rules, callback, start): | ||||
assert all(len(r) == 4 for r in rules) | |||||
self.rules = rules | self.rules = rules | ||||
self.callback = callback | self.callback = callback | ||||
self.start = start | self.start = start | ||||
@@ -93,10 +96,10 @@ class PatternRE(Pattern): | |||||
@property | @property | ||||
def min_width(self): | def min_width(self): | ||||
return sre_parse.parse(self.to_regexp()).getwidth()[0] | |||||
return get_regexp_width(self.to_regexp())[0] | |||||
@property | @property | ||||
def max_width(self): | def max_width(self): | ||||
return sre_parse.parse(self.to_regexp()).getwidth()[1] | |||||
return get_regexp_width(self.to_regexp())[1] | |||||
class TokenDef(object): | class TokenDef(object): | ||||
def __init__(self, name, pattern, priority=1): | def __init__(self, name, pattern, priority=1): | ||||
@@ -108,27 +111,3 @@ class TokenDef(object): | |||||
def __repr__(self): | def __repr__(self): | ||||
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | ||||
class Terminal: | |||||
def __init__(self, data): | |||||
self.data = data | |||||
def __repr__(self): | |||||
return '%r' % self.data | |||||
def __eq__(self, other): | |||||
return isinstance(other, type(self)) and self.data == other.data | |||||
def __hash__(self): | |||||
return hash(self.data) | |||||
class Terminal_Regexp(Terminal): | |||||
def __init__(self, name, regexp): | |||||
Terminal.__init__(self, regexp) | |||||
self.name = name | |||||
self.match = re.compile(regexp).match | |||||
class Terminal_Token(Terminal): | |||||
def match(self, other): | |||||
return self.data == other.type | |||||
@@ -0,0 +1,37 @@ | |||||
class Rule(object): | |||||
""" | |||||
origin : a symbol | |||||
expansion : a list of symbols | |||||
""" | |||||
def __init__(self, origin, expansion, alias=None, options=None): | |||||
self.origin = origin | |||||
self.expansion = expansion | |||||
self.alias = alias | |||||
self.options = options | |||||
def __str__(self): | |||||
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||||
def __repr__(self): | |||||
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) | |||||
class RuleOptions: | |||||
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||||
self.keep_all_tokens = keep_all_tokens | |||||
self.expand1 = expand1 | |||||
self.create_token = create_token # used for scanless postprocessing | |||||
self.priority = priority | |||||
self.filter_out = filter_out # remove this rule from the tree | |||||
# used for "token"-rules in scanless | |||||
def __repr__(self): | |||||
return 'RuleOptions(%r, %r, %r, %r, %r)' % ( | |||||
self.keep_all_tokens, | |||||
self.expand1, | |||||
self.create_token, | |||||
self.priority, | |||||
self.filter_out | |||||
) |
@@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] FLOAT

NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER
@@ -2,6 +2,7 @@
from .lexer import Token

###{standalone
class Indenter:
    def __init__(self):
        self.paren_level = 0
@@ -50,3 +51,5 @@ class Indenter:
    @property
    def always_accept(self):
        return (self.NL_type,)

###}
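
Since the `###{standalone ... ###}` markers pull `Indenter` into generated parsers, a postlex indenter can also be used with a stand-alone module. A concrete subclass only has to supply token names and a tab width; the names below are placeholders and must match whatever the grammar actually declares:

```python
from lark.indenter import Indenter  # also embedded in generated stand-alone modules

class MyIndenter(Indenter):
    NL_type = '_NEWLINE'          # terminal carrying the newline + leading whitespace
    INDENT_type = '_INDENT'       # synthetic tokens emitted by process()
    DEDENT_type = '_DEDENT'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']    # suppress NL handling inside brackets
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    tab_len = 8

# e.g. parser = Lark(grammar, parser='lalr', postlex=MyIndenter())
#  or  parser = Lark_StandAlone(postlex=MyIndenter())
```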
@@ -169,13 +169,15 @@ class Lark:
    def _build_parser(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
        self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
        rules, callback = self.parse_tree_builder.apply(self.options.transformer)
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
        callback = self._parse_tree_builder.create_callback(self.options.transformer)
        if self.profiler:
            for f in dir(callback):
                if not (f.startswith('__') and f.endswith('__')):
                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
        parser_conf = ParserConf(rules, callback, self.options.start)
        parser_conf = ParserConf(self.rules, callback, self.options.start)
        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
@@ -5,6 +5,7 @@ import re | |||||
from .utils import Str, classify | from .utils import Str, classify | ||||
from .common import is_terminal, PatternStr, PatternRE, TokenDef | from .common import is_terminal, PatternStr, PatternRE, TokenDef | ||||
###{standalone | |||||
class LexError(Exception): | class LexError(Exception): | ||||
pass | pass | ||||
@@ -48,27 +49,75 @@ class Token(Str): | |||||
__hash__ = Str.__hash__ | __hash__ = Str.__hash__ | ||||
class Regex: | |||||
def __init__(self, pattern, flags=()): | |||||
self.pattern = pattern | |||||
self.flags = flags | |||||
def _regexp_has_newline(r): | |||||
return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | |||||
class LineCounter: | |||||
def __init__(self): | |||||
self.newline_char = '\n' | |||||
self.char_pos = 0 | |||||
self.line = 1 | |||||
self.column = 0 | |||||
self.line_start_pos = 0 | |||||
def feed(self, token, test_newline=True): | |||||
"""Consume a token and calculate the new line & column. | |||||
        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
""" | |||||
if test_newline: | |||||
newlines = token.count(self.newline_char) | |||||
if newlines: | |||||
self.line += newlines | |||||
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 | |||||
self.char_pos += len(token) | |||||
self.column = self.char_pos - self.line_start_pos | |||||
class _Lex: | |||||
"Built to serve both Lexer and ContextualLexer" | |||||
def __init__(self, lexer): | |||||
self.lexer = lexer | |||||
def lex(self, stream, newline_types, ignore_types): | |||||
newline_types = list(newline_types) | |||||
ignore_types = list(ignore_types) | |||||
line_ctr = LineCounter() | |||||
def _create_unless_callback(strs): | |||||
mres = build_mres(strs, match_whole=True) | |||||
def unless_callback(t): | |||||
# if t in strs: | |||||
# t.type = strs[t] | |||||
for mre, type_from_index in mres: | |||||
while True: | |||||
lexer = self.lexer | |||||
for mre, type_from_index in lexer.mres: | |||||
m = mre.match(stream, line_ctr.char_pos) | |||||
if m: | |||||
value = m.group(0) | |||||
type_ = type_from_index[m.lastindex] | |||||
if type_ not in ignore_types: | |||||
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||||
if t.type in lexer.callback: | |||||
t = lexer.callback[t.type](t) | |||||
yield t | |||||
line_ctr.feed(value, type_ in newline_types) | |||||
break | |||||
else: | |||||
if line_ctr.char_pos < len(stream): | |||||
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||||
break | |||||
class UnlessCallback: | |||||
def __init__(self, mres): | |||||
self.mres = mres | |||||
def __call__(self, t): | |||||
for mre, type_from_index in self.mres: | |||||
m = mre.match(t.value) | m = mre.match(t.value) | ||||
if m: | if m: | ||||
value = m.group(0) | value = m.group(0) | ||||
t.type = type_from_index[m.lastindex] | t.type = type_from_index[m.lastindex] | ||||
break | break | ||||
return t | return t | ||||
return unless_callback | |||||
###} | |||||
def _create_unless(tokens): | def _create_unless(tokens): | ||||
tokens_by_type = classify(tokens, lambda t: type(t.pattern)) | tokens_by_type = classify(tokens, lambda t: type(t.pattern)) | ||||
@@ -85,7 +134,7 @@ def _create_unless(tokens): | |||||
if strtok.pattern.flags <= retok.pattern.flags: | if strtok.pattern.flags <= retok.pattern.flags: | ||||
embedded_strs.add(strtok) | embedded_strs.add(strtok) | ||||
if unless: | if unless: | ||||
callback[retok.name] = _create_unless_callback(unless) | |||||
callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) | |||||
tokens = [t for t in tokens if t not in embedded_strs] | tokens = [t for t in tokens if t not in embedded_strs] | ||||
return tokens, callback | return tokens, callback | ||||
@@ -110,13 +159,13 @@ def _build_mres(tokens, max_size, match_whole): | |||||
def build_mres(tokens, match_whole=False): | def build_mres(tokens, match_whole=False): | ||||
return _build_mres(tokens, len(tokens), match_whole) | return _build_mres(tokens, len(tokens), match_whole) | ||||
def _regexp_has_newline(r): | |||||
return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | |||||
class Lexer(object): | |||||
class Lexer: | |||||
def __init__(self, tokens, ignore=()): | def __init__(self, tokens, ignore=()): | ||||
assert all(isinstance(t, TokenDef) for t in tokens), tokens | assert all(isinstance(t, TokenDef) for t in tokens), tokens | ||||
self.ignore = ignore | |||||
self.newline_char = '\n' | |||||
tokens = list(tokens) | tokens = list(tokens) | ||||
# Sanitization | # Sanitization | ||||
@@ -129,14 +178,11 @@ class Lexer(object): | |||||
if t.pattern.min_width == 0: | if t.pattern.min_width == 0: | ||||
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) | raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) | ||||
token_names = {t.name for t in tokens} | |||||
for t in ignore: | |||||
if t not in token_names: | |||||
raise LexError("Token '%s' was marked to ignore but it is not defined!" % t) | |||||
assert set(ignore) <= {t.name for t in tokens} | |||||
# Init | # Init | ||||
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] | self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] | ||||
self.ignore_types = [t for t in ignore] | |||||
self.ignore_types = list(ignore) | |||||
tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) | tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) | ||||
@@ -147,46 +193,8 @@ class Lexer(object): | |||||
self.mres = build_mres(tokens) | self.mres = build_mres(tokens) | ||||
def lex(self, stream): | def lex(self, stream): | ||||
lex_pos = 0 | |||||
line = 1 | |||||
col_start_pos = 0 | |||||
newline_types = list(self.newline_types) | |||||
ignore_types = list(self.ignore_types) | |||||
while True: | |||||
for mre, type_from_index in self.mres: | |||||
m = mre.match(stream, lex_pos) | |||||
if m: | |||||
value = m.group(0) | |||||
type_ = type_from_index[m.lastindex] | |||||
to_yield = type_ not in ignore_types | |||||
if to_yield: | |||||
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) | |||||
end_col = t.column + len(value) | |||||
if t.type in self.callback: | |||||
t = self.callback[t.type](t) | |||||
if type_ in newline_types: | |||||
newlines = value.count(self.newline_char) | |||||
if newlines: | |||||
line += newlines | |||||
last_newline_index = value.rindex(self.newline_char) + 1 | |||||
col_start_pos = lex_pos + last_newline_index | |||||
end_col = len(value) - last_newline_index | |||||
if to_yield: | |||||
t.end_line = line | |||||
t.end_col = end_col | |||||
yield t | |||||
lex_pos += len(value) | |||||
break | |||||
else: | |||||
if lex_pos < len(stream): | |||||
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) | |||||
break | |||||
return _Lex(self).lex(stream, self.newline_types, self.ignore_types) | |||||
class ContextualLexer: | class ContextualLexer: | ||||
@@ -204,7 +212,7 @@ class ContextualLexer: | |||||
lexer = lexer_by_tokens[key] | lexer = lexer_by_tokens[key] | ||||
except KeyError: | except KeyError: | ||||
accepts = set(accepts) | set(ignore) | set(always_accept) | accepts = set(accepts) | set(ignore) | set(always_accept) | ||||
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] | |||||
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] | |||||
lexer = Lexer(state_tokens, ignore=ignore) | lexer = Lexer(state_tokens, ignore=ignore) | ||||
lexer_by_tokens[key] = lexer | lexer_by_tokens[key] = lexer | ||||
@@ -218,33 +226,9 @@ class ContextualLexer: | |||||
self.parser_state = state | self.parser_state = state | ||||
def lex(self, stream): | def lex(self, stream): | ||||
lex_pos = 0 | |||||
line = 1 | |||||
col_start_pos = 0 | |||||
newline_types = list(self.root_lexer.newline_types) | |||||
ignore_types = list(self.root_lexer.ignore_types) | |||||
while True: | |||||
lexer = self.lexers[self.parser_state] | |||||
for mre, type_from_index in lexer.mres: | |||||
m = mre.match(stream, lex_pos) | |||||
if m: | |||||
value = m.group(0) | |||||
type_ = type_from_index[m.lastindex] | |||||
if type_ not in ignore_types: | |||||
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) | |||||
if t.type in lexer.callback: | |||||
t = lexer.callback[t.type](t) | |||||
yield t | |||||
l = _Lex(self.lexers[self.parser_state]) | |||||
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): | |||||
yield x | |||||
l.lexer = self.lexers[self.parser_state] | |||||
if type_ in newline_types: | |||||
newlines = value.count(lexer.newline_char) | |||||
if newlines: | |||||
line += newlines | |||||
col_start_pos = lex_pos + value.rindex(lexer.newline_char) | |||||
lex_pos += len(value) | |||||
break | |||||
else: | |||||
if lex_pos < len(stream): | |||||
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens) | |||||
break | |||||
@@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder | |||||
from .parser_frontends import LALR | from .parser_frontends import LALR | ||||
from .parsers.lalr_parser import UnexpectedToken | from .parsers.lalr_parser import UnexpectedToken | ||||
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef | from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef | ||||
from .grammar import RuleOptions, Rule | |||||
from .tree import Tree as T, Transformer, InlineTransformer, Visitor | from .tree import Tree as T, Transformer, InlineTransformer, Visitor | ||||
@@ -127,7 +128,7 @@ RULES = { | |||||
class EBNF_to_BNF(InlineTransformer): | class EBNF_to_BNF(InlineTransformer): | ||||
def __init__(self): | def __init__(self): | ||||
self.new_rules = {} | |||||
self.new_rules = [] | |||||
self.rules_by_expr = {} | self.rules_by_expr = {} | ||||
self.prefix = 'anon' | self.prefix = 'anon' | ||||
self.i = 0 | self.i = 0 | ||||
@@ -140,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer): | |||||
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | ||||
self.i += 1 | self.i += 1 | ||||
t = Token('RULE', new_name, -1) | t = Token('RULE', new_name, -1) | ||||
self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options | |||||
tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) | |||||
self.new_rules.append((new_name, tree, self.rule_options)) | |||||
self.rules_by_expr[expr] = t | self.rules_by_expr[expr] = t | ||||
return t | return t | ||||
@@ -174,7 +176,6 @@ class SimplifyRule_Visitor(Visitor): | |||||
break | break | ||||
tree.expand_kids_by_index(*to_expand) | tree.expand_kids_by_index(*to_expand) | ||||
def expansion(self, tree): | def expansion(self, tree): | ||||
# rules_list unpacking | # rules_list unpacking | ||||
# a : b (c|d) e | # a : b (c|d) e | ||||
@@ -194,7 +195,7 @@ class SimplifyRule_Visitor(Visitor): | |||||
tree.data = 'expansions' | tree.data = 'expansions' | ||||
tree.children = [self.visit(T('expansion', [option if i==j else other | tree.children = [self.visit(T('expansion', [option if i==j else other | ||||
for j, other in enumerate(tree.children)])) | for j, other in enumerate(tree.children)])) | ||||
for option in child.children] | |||||
for option in set(child.children)] | |||||
break | break | ||||
else: | else: | ||||
break | break | ||||
@@ -208,7 +209,10 @@ class SimplifyRule_Visitor(Visitor): | |||||
tree.data = 'expansions' | tree.data = 'expansions' | ||||
tree.children = aliases | tree.children = aliases | ||||
expansions = _flatten | |||||
def expansions(self, tree): | |||||
self._flatten(tree) | |||||
tree.children = list(set(tree.children)) | |||||
class RuleTreeToText(Transformer): | class RuleTreeToText(Transformer): | ||||
def expansions(self, x): | def expansions(self, x): | ||||
@@ -389,12 +393,6 @@ def _interleave(l, item): | |||||
def _choice_of_rules(rules): | def _choice_of_rules(rules): | ||||
return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) | return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) | ||||
def dict_update_safe(d1, d2): | |||||
for k, v in d2.items(): | |||||
assert k not in d1 | |||||
d1[k] = v | |||||
class Grammar: | class Grammar: | ||||
def __init__(self, rule_defs, token_defs, ignore): | def __init__(self, rule_defs, token_defs, ignore): | ||||
self.token_defs = token_defs | self.token_defs = token_defs | ||||
@@ -411,6 +409,7 @@ class Grammar: | |||||
terms_to_ignore = {name:'__'+name for name in self.ignore} | terms_to_ignore = {name:'__'+name for name in self.ignore} | ||||
if terms_to_ignore: | if terms_to_ignore: | ||||
assert set(terms_to_ignore) <= {name for name, _t in term_defs} | assert set(terms_to_ignore) <= {name for name, _t in term_defs} | ||||
term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] | term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] | ||||
expr = Token('RULE', '__ignore') | expr = Token('RULE', '__ignore') | ||||
for r, tree, _o in rule_defs: | for r, tree, _o in rule_defs: | ||||
@@ -466,57 +465,41 @@ class Grammar: | |||||
# ================= | # ================= | ||||
# Compile Rules | # Compile Rules | ||||
# ================= | # ================= | ||||
ebnf_to_bnf = EBNF_to_BNF() | |||||
simplify_rule = SimplifyRule_Visitor() | |||||
# 1. Pre-process terminals | |||||
transformer = PrepareLiterals() | transformer = PrepareLiterals() | ||||
if not lexer: | if not lexer: | ||||
transformer *= SplitLiterals() | transformer *= SplitLiterals() | ||||
transformer *= ExtractAnonTokens(tokens) # Adds to tokens | transformer *= ExtractAnonTokens(tokens) # Adds to tokens | ||||
rules = {} | |||||
# 2. Convert EBNF to BNF (and apply step 1) | |||||
ebnf_to_bnf = EBNF_to_BNF() | |||||
rules = [] | |||||
for name, rule_tree, options in rule_defs: | for name, rule_tree, options in rule_defs: | ||||
assert name not in rules, name | |||||
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None | ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None | ||||
tree = transformer.transform(rule_tree) | tree = transformer.transform(rule_tree) | ||||
rules[name] = ebnf_to_bnf.transform(tree), options | |||||
rules.append((name, ebnf_to_bnf.transform(tree), options)) | |||||
rules += ebnf_to_bnf.new_rules | |||||
dict_update_safe(rules, ebnf_to_bnf.new_rules) | |||||
for tree, _o in rules.values(): | |||||
simplify_rule.visit(tree) | |||||
assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" | |||||
# 3. Compile tree to Rule objects | |||||
rule_tree_to_text = RuleTreeToText() | rule_tree_to_text = RuleTreeToText() | ||||
rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()} | |||||
return tokens, rules, self.ignore | |||||
simplify_rule = SimplifyRule_Visitor() | |||||
compiled_rules = [] | |||||
for name, tree, options in rules: | |||||
simplify_rule.visit(tree) | |||||
expansions = rule_tree_to_text.transform(tree) | |||||
for expansion, alias in expansions: | |||||
if alias and name.startswith('_'): | |||||
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) | |||||
class RuleOptions: | |||||
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||||
self.keep_all_tokens = keep_all_tokens | |||||
self.expand1 = expand1 | |||||
self.create_token = create_token # used for scanless postprocessing | |||||
self.priority = priority | |||||
self.filter_out = filter_out # remove this rule from the tree | |||||
# used for "token"-rules in scanless | |||||
@classmethod | |||||
def from_rule(cls, name, *x): | |||||
if len(x) > 1: | |||||
priority, expansions = x | |||||
priority = int(priority) | |||||
else: | |||||
expansions ,= x | |||||
priority = None | |||||
keep_all_tokens = name.startswith('!') | |||||
name = name.lstrip('!') | |||||
expand1 = name.startswith('?') | |||||
name = name.lstrip('?') | |||||
rule = Rule(name, expansion, alias, options) | |||||
compiled_rules.append(rule) | |||||
return name, expansions, cls(keep_all_tokens, expand1, priority=priority) | |||||
return tokens, compiled_rules, self.ignore | |||||
@@ -553,15 +536,30 @@ def resolve_token_references(token_defs): | |||||
if not changed: | if not changed: | ||||
break | break | ||||
def options_from_rule(name, *x): | |||||
if len(x) > 1: | |||||
priority, expansions = x | |||||
priority = int(priority) | |||||
else: | |||||
expansions ,= x | |||||
priority = None | |||||
keep_all_tokens = name.startswith('!') | |||||
name = name.lstrip('!') | |||||
expand1 = name.startswith('?') | |||||
name = name.lstrip('?') | |||||
return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) | |||||
class GrammarLoader: | class GrammarLoader: | ||||
def __init__(self): | def __init__(self): | ||||
tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] | tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] | ||||
rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()] | |||||
d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules} | |||||
rules, callback = ParseTreeBuilder(d, T).apply() | |||||
rules = [options_from_rule(name, x) for name, x in RULES.items()] | |||||
rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] | |||||
callback = ParseTreeBuilder(rules, T).create_callback() | |||||
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) | lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) | ||||
parser_conf = ParserConf(rules, callback, 'start') | parser_conf = ParserConf(rules, callback, 'start') | ||||
self.parser = LALR(lexer_conf, parser_conf) | self.parser = LALR(lexer_conf, parser_conf) | ||||
@@ -636,7 +634,6 @@ class GrammarLoader: | |||||
ignore_names.append(name) | ignore_names.append(name) | ||||
token_defs.append((name, (t, 0))) | token_defs.append((name, (t, 0))) | ||||
# Verify correctness 2 | # Verify correctness 2 | ||||
token_names = set() | token_names = set() | ||||
for name, _ in token_defs: | for name, _ in token_defs: | ||||
@@ -644,10 +641,13 @@ class GrammarLoader: | |||||
raise GrammarError("Token '%s' defined more than once" % name) | raise GrammarError("Token '%s' defined more than once" % name) | ||||
token_names.add(name) | token_names.add(name) | ||||
if set(ignore_names) - token_names:
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names)) | |||||
# Resolve token references | # Resolve token references | ||||
resolve_token_references(token_defs) | resolve_token_references(token_defs) | ||||
rules = [RuleOptions.from_rule(*x) for x in rule_defs] | |||||
rules = [options_from_rule(*x) for x in rule_defs] | |||||
rule_names = set() | rule_names = set() | ||||
for name, _x, _o in rules: | for name, _x, _o in rules: | ||||
@@ -1,6 +1,9 @@ | |||||
from .common import is_terminal, GrammarError | from .common import is_terminal, GrammarError | ||||
from .utils import suppress | from .utils import suppress | ||||
from .lexer import Token | from .lexer import Token | ||||
from .grammar import Rule | |||||
###{standalone | |||||
class NodeBuilder: | class NodeBuilder: | ||||
def __init__(self, tree_class, name): | def __init__(self, tree_class, name): | ||||
@@ -27,7 +30,7 @@ class Factory: | |||||
def __call__(self, node_builder): | def __call__(self, node_builder): | ||||
return self.cls(node_builder, *self.args) | return self.cls(node_builder, *self.args) | ||||
class TokenWrapper: | class TokenWrapper: | ||||
"Used for fixing the results of scanless parsing" | "Used for fixing the results of scanless parsing" | ||||
@@ -106,51 +109,53 @@ class ParseTreeBuilder: | |||||
self.rule_builders = list(self._init_builders(rules)) | self.rule_builders = list(self._init_builders(rules)) | ||||
self.user_aliases = {} | |||||
def _init_builders(self, rules): | def _init_builders(self, rules): | ||||
filter_out = set() | filter_out = set() | ||||
for origin, (expansions, options) in rules.items(): | |||||
if options and options.filter_out: | |||||
assert origin.startswith('_') # Just to make sure | |||||
filter_out.add(origin) | |||||
for rule in rules: | |||||
if rule.options and rule.options.filter_out: | |||||
assert rule.origin.startswith('_') # Just to make sure | |||||
filter_out.add(rule.origin) | |||||
for origin, (expansions, options) in rules.items(): | |||||
for rule in rules: | |||||
options = rule.options | |||||
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
expand1 = options.expand1 if options else False
create_token = options.create_token if options else False
for expansion, alias in expansions: | |||||
if alias and origin.startswith('_'): | |||||
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) | |||||
wrapper_chain = filter(None, [ | |||||
(expand1 and not rule.alias) and Expand1, | |||||
create_token and Factory(TokenWrapper, create_token), | |||||
create_rule_handler(rule.expansion, keep_all_tokens, filter_out), | |||||
self.propagate_positions and PropagatePositions, | |||||
]) | |||||
wrapper_chain = filter(None, [ | |||||
(expand1 and not alias) and Expand1, | |||||
create_token and Factory(TokenWrapper, create_token), | |||||
create_rule_handler(expansion, keep_all_tokens, filter_out), | |||||
self.propagate_positions and PropagatePositions, | |||||
]) | |||||
yield rule, wrapper_chain | |||||
yield origin, expansion, options, alias or origin, wrapper_chain | |||||
def apply(self, transformer=None): | |||||
def create_callback(self, transformer=None): | |||||
callback = Callback() | callback = Callback() | ||||
new_rules = [] | |||||
for origin, expansion, options, alias, wrapper_chain in self.rule_builders: | |||||
callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion)) | |||||
for rule, wrapper_chain in self.rule_builders: | |||||
internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) | |||||
user_callback_name = rule.alias or rule.origin | |||||
try: | try: | ||||
f = transformer._get_func(alias) | |||||
f = transformer._get_func(user_callback_name) | |||||
except AttributeError: | except AttributeError: | ||||
f = NodeBuilder(self.tree_class, alias) | |||||
f = NodeBuilder(self.tree_class, user_callback_name) | |||||
self.user_aliases[rule] = rule.alias | |||||
rule.alias = internal_callback_name | |||||
for w in wrapper_chain: | for w in wrapper_chain: | ||||
f = w(f) | f = w(f) | ||||
if hasattr(callback, callback_name): | |||||
raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) | |||||
setattr(callback, callback_name, f) | |||||
if hasattr(callback, internal_callback_name): | |||||
raise GrammarError("Rule '%s' already exists" % (rule,)) | |||||
setattr(callback, internal_callback_name, f) | |||||
new_rules.append(( origin, expansion, callback_name, options )) | |||||
return callback | |||||
return new_rules, callback | |||||
###} |
@@ -1,5 +1,5 @@ | |||||
import re | import re | ||||
import sre_parse | |||||
from .utils import get_regexp_width | |||||
from parsers.grammar_analysis import GrammarAnalyzer | from parsers.grammar_analysis import GrammarAnalyzer | ||||
from .lexer import Lexer, ContextualLexer, Token | from .lexer import Lexer, ContextualLexer, Token | ||||
@@ -9,10 +9,16 @@ from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk | |||||
from .tree import Tree | from .tree import Tree | ||||
class WithLexer: | class WithLexer: | ||||
def __init__(self, lexer_conf): | |||||
def init_traditional_lexer(self, lexer_conf): | |||||
self.lexer_conf = lexer_conf | self.lexer_conf = lexer_conf | ||||
self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) | self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) | ||||
def init_contextual_lexer(self, lexer_conf, parser_conf): | |||||
self.lexer_conf = lexer_conf | |||||
d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} | |||||
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | |||||
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) | |||||
def lex(self, text): | def lex(self, text): | ||||
stream = self.lexer.lex(text) | stream = self.lexer.lex(text) | ||||
if self.lexer_conf.postlex: | if self.lexer_conf.postlex: | ||||
@@ -23,32 +29,22 @@ class WithLexer: | |||||
class LALR(WithLexer): | class LALR(WithLexer): | ||||
def __init__(self, lexer_conf, parser_conf, options=None): | def __init__(self, lexer_conf, parser_conf, options=None): | ||||
WithLexer.__init__(self, lexer_conf) | |||||
self.parser_conf = parser_conf | |||||
self.parser = lalr_parser.Parser(parser_conf) | self.parser = lalr_parser.Parser(parser_conf) | ||||
self.init_traditional_lexer(lexer_conf) | |||||
def parse(self, text): | def parse(self, text): | ||||
tokens = self.lex(text) | |||||
return self.parser.parse(tokens) | |||||
token_stream = self.lex(text) | |||||
return self.parser.parse(token_stream) | |||||
class LALR_ContextualLexer: | |||||
class LALR_ContextualLexer(WithLexer): | |||||
def __init__(self, lexer_conf, parser_conf, options=None): | def __init__(self, lexer_conf, parser_conf, options=None): | ||||
self.lexer_conf = lexer_conf | |||||
self.parser_conf = parser_conf | |||||
self.parser = lalr_parser.Parser(parser_conf) | self.parser = lalr_parser.Parser(parser_conf) | ||||
d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()} | |||||
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | |||||
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) | |||||
self.init_contextual_lexer(lexer_conf, parser_conf) | |||||
def parse(self, text): | def parse(self, text): | ||||
tokens = self.lexer.lex(text) | |||||
if self.lexer_conf.postlex: | |||||
tokens = self.lexer_conf.postlex.process(tokens) | |||||
return self.parser.parse(tokens, self.lexer.set_parser_state) | |||||
token_stream = self.lex(text) | |||||
return self.parser.parse(token_stream, self.lexer.set_parser_state) | |||||
def get_ambiguity_resolver(options): | def get_ambiguity_resolver(options): | ||||
if not options or options.ambiguity == 'resolve': | if not options or options.ambiguity == 'resolve': | ||||
@@ -60,55 +56,47 @@ def get_ambiguity_resolver(options): | |||||
raise ValueError(options) | raise ValueError(options) | ||||
def tokenize_text(text): | def tokenize_text(text): | ||||
new_text = [] | |||||
line = 1 | line = 1 | ||||
col_start_pos = 0 | col_start_pos = 0 | ||||
for i, ch in enumerate(text): | for i, ch in enumerate(text): | ||||
if '\n' in ch: | if '\n' in ch: | ||||
line += ch.count('\n') | line += ch.count('\n') | ||||
col_start_pos = i + ch.rindex('\n') | col_start_pos = i + ch.rindex('\n') | ||||
new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos)) | |||||
return new_text | |||||
yield Token('CHAR', ch, line=line, column=i - col_start_pos) | |||||
class Earley_NoLex: | class Earley_NoLex: | ||||
def __init__(self, lexer_conf, parser_conf, options=None): | def __init__(self, lexer_conf, parser_conf, options=None): | ||||
self.token_by_name = {t.name:t for t in lexer_conf.tokens} | |||||
self._prepare_match(lexer_conf) | |||||
rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | |||||
self.parser = earley.Parser(rules, | |||||
parser_conf.start, | |||||
parser_conf.callback, | |||||
self.parser = earley.Parser(parser_conf, self.match, | |||||
resolve_ambiguity=get_ambiguity_resolver(options)) | resolve_ambiguity=get_ambiguity_resolver(options)) | ||||
def _prepare_expansion(self, expansion): | |||||
for sym in expansion: | |||||
if is_terminal(sym): | |||||
regexp = self.token_by_name[sym].pattern.to_regexp() | |||||
width = sre_parse.parse(regexp).getwidth() | |||||
if width != (1,1): | |||||
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) | |||||
yield Terminal_Regexp(sym, regexp) | |||||
else: | |||||
yield sym | |||||
def match(self, term, text, index=0): | |||||
return self.regexps[term].match(text, index) | |||||
def _prepare_match(self, lexer_conf): | |||||
self.regexps = {} | |||||
for t in lexer_conf.tokens: | |||||
regexp = t.pattern.to_regexp() | |||||
width = get_regexp_width(regexp) | |||||
if width != (1,1): | |||||
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width))
self.regexps[t.name] = re.compile(regexp) | |||||
def parse(self, text): | def parse(self, text): | ||||
new_text = tokenize_text(text) | |||||
return self.parser.parse(new_text) | |||||
token_stream = tokenize_text(text) | |||||
return self.parser.parse(token_stream) | |||||
class Earley(WithLexer): | class Earley(WithLexer): | ||||
def __init__(self, lexer_conf, parser_conf, options=None): | def __init__(self, lexer_conf, parser_conf, options=None): | ||||
WithLexer.__init__(self, lexer_conf) | |||||
rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] | |||||
self.init_traditional_lexer(lexer_conf) | |||||
self.parser = earley.Parser(rules, | |||||
parser_conf.start, | |||||
parser_conf.callback, | |||||
self.parser = earley.Parser(parser_conf, self.match, | |||||
resolve_ambiguity=get_ambiguity_resolver(options)) | resolve_ambiguity=get_ambiguity_resolver(options)) | ||||
def _prepare_expansion(self, expansion): | |||||
return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] | |||||
def match(self, term, token): | |||||
return term == token.type | |||||
def parse(self, text): | def parse(self, text): | ||||
tokens = self.lex(text) | tokens = self.lex(text) | ||||
@@ -119,27 +107,31 @@ class XEarley: | |||||
def __init__(self, lexer_conf, parser_conf, options=None): | def __init__(self, lexer_conf, parser_conf, options=None): | ||||
self.token_by_name = {t.name:t for t in lexer_conf.tokens} | self.token_by_name = {t.name:t for t in lexer_conf.tokens} | ||||
rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | |||||
self._prepare_match(lexer_conf) | |||||
ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] | |||||
self.parser = xearley.Parser(rules, | |||||
parser_conf.start, | |||||
parser_conf.callback, | |||||
self.parser = xearley.Parser(parser_conf, | |||||
self.match, | |||||
resolve_ambiguity=get_ambiguity_resolver(options), | resolve_ambiguity=get_ambiguity_resolver(options), | ||||
ignore=ignore, | |||||
ignore=lexer_conf.ignore, | |||||
predict_all=options.earley__predict_all | predict_all=options.earley__predict_all | ||||
) | ) | ||||
def _prepare_expansion(self, expansion): | |||||
for sym in expansion: | |||||
if is_terminal(sym): | |||||
regexp = self.token_by_name[sym].pattern.to_regexp() | |||||
width = sre_parse.parse(regexp).getwidth() | |||||
assert width | |||||
yield Terminal_Regexp(sym, regexp) | |||||
def match(self, term, text, index=0): | |||||
return self.regexps[term].match(text, index) | |||||
def _prepare_match(self, lexer_conf): | |||||
self.regexps = {} | |||||
for t in lexer_conf.tokens: | |||||
regexp = t.pattern.to_regexp() | |||||
try: | |||||
width = get_regexp_width(regexp)[0] | |||||
except ValueError: | |||||
raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) | |||||
else: | else: | ||||
yield sym | |||||
if width == 0: | |||||
raise ValueError("Dynamic Earley doesn't allow zero-width regexps") | |||||
self.regexps[t.name] = re.compile(regexp) | |||||
def parse(self, text): | def parse(self, text): | ||||
return self.parser.parse(text) | return self.parser.parse(text) | ||||
@@ -13,14 +13,11 @@ | |||||
# Author: Erez Shinan (2017) | # Author: Erez Shinan (2017) | ||||
# Email : erezshin@gmail.com | # Email : erezshin@gmail.com | ||||
from ..common import ParseError, UnexpectedToken, Terminal | |||||
from ..common import ParseError, UnexpectedToken, is_terminal | |||||
from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse | from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse | ||||
from .grammar_analysis import GrammarAnalyzer | from .grammar_analysis import GrammarAnalyzer | ||||
class EndToken: | |||||
type = '$end' | |||||
class Derivation(Tree): | class Derivation(Tree): | ||||
_hash = None | _hash = None | ||||
@@ -35,8 +32,6 @@ class Derivation(Tree): | |||||
self._hash = Tree.__hash__(self) | self._hash = Tree.__hash__(self) | ||||
return self._hash | return self._hash | ||||
END_TOKEN = EndToken() | |||||
class Item(object): | class Item(object): | ||||
"An Earley Item, the atom of the algorithm." | "An Earley Item, the atom of the algorithm." | ||||
@@ -59,11 +54,8 @@ class Item(object): | |||||
new_tree = Derivation(self.rule, self.tree.children + [tree]) | new_tree = Derivation(self.rule, self.tree.children + [tree]) | ||||
return self.__class__(self.rule, self.ptr+1, self.start, new_tree) | return self.__class__(self.rule, self.ptr+1, self.start, new_tree) | ||||
def similar(self, other): | |||||
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||||
def __eq__(self, other): | def __eq__(self, other): | ||||
return self.similar(other) #and (self.tree == other.tree) | |||||
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||||
def __hash__(self): | def __hash__(self): | ||||
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ | return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ | ||||
@@ -134,7 +126,7 @@ class Column: | |||||
self.completed[item_key] = item | self.completed[item_key] = item | ||||
self.to_reduce.append(item) | self.to_reduce.append(item) | ||||
else: | else: | ||||
if isinstance(item.expect, Terminal): | |||||
if is_terminal(item.expect): | |||||
self.to_scan.append(item) | self.to_scan.append(item) | ||||
else: | else: | ||||
k = item_key if self.predict_all else item | k = item_key if self.predict_all else item | ||||
@@ -151,31 +143,30 @@ class Column: | |||||
__nonzero__ = __bool__ # Py2 backwards-compatibility | __nonzero__ = __bool__ # Py2 backwards-compatibility | ||||
class Parser: | class Parser: | ||||
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): | |||||
self.analysis = GrammarAnalyzer(rules, start_symbol) | |||||
self.start_symbol = start_symbol | |||||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): | |||||
self.analysis = GrammarAnalyzer(parser_conf) | |||||
self.parser_conf = parser_conf | |||||
self.resolve_ambiguity = resolve_ambiguity | self.resolve_ambiguity = resolve_ambiguity | ||||
self.FIRST = self.analysis.FIRST | |||||
self.postprocess = {} | self.postprocess = {} | ||||
self.predictions = {} | self.predictions = {} | ||||
self.FIRST = {} | |||||
for rule in self.analysis.rules: | |||||
if rule.origin != '$root': # XXX kinda ugly | |||||
a = rule.alias | |||||
self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) | |||||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||||
for rule in parser_conf.rules: | |||||
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||||
self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] | |||||
self.term_matcher = term_matcher | |||||
def parse(self, stream, start_symbol=None): | def parse(self, stream, start_symbol=None): | ||||
# Define parser functions | # Define parser functions | ||||
start_symbol = start_symbol or self.start_symbol | |||||
start_symbol = start_symbol or self.parser_conf.start | |||||
_Item = Item | _Item = Item | ||||
match = self.term_matcher | |||||
def predict(nonterm, column): | def predict(nonterm, column): | ||||
assert not isinstance(nonterm, Terminal), nonterm | |||||
assert not is_terminal(nonterm), nonterm | |||||
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | ||||
def complete(item): | def complete(item): | ||||
@@ -195,14 +186,13 @@ class Parser: | |||||
for item in to_reduce: | for item in to_reduce: | ||||
new_items = list(complete(item)) | new_items = list(complete(item)) | ||||
for new_item in new_items: | |||||
if new_item.similar(item): | |||||
raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) | |||||
if item in new_items: | |||||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||||
column.add(new_items) | column.add(new_items) | ||||
def scan(i, token, column): | def scan(i, token, column): | ||||
next_set = Column(i, self.FIRST) | next_set = Column(i, self.FIRST) | ||||
next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token)) | |||||
next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) | |||||
if not next_set: | if not next_set: | ||||
expect = {i.expect for i in column.to_scan} | expect = {i.expect for i in column.to_scan} | ||||
@@ -249,24 +239,3 @@ class ApplyCallbacks(Transformer_NoRecurse): | |||||
return callback(children) | return callback(children) | ||||
else: | else: | ||||
return Tree(rule.origin, children) | return Tree(rule.origin, children) | ||||
# RULES = [ | |||||
# ('a', ['d']), | |||||
# ('d', ['b']), | |||||
# ('b', ['C']), | |||||
# ('b', ['b', 'C']), | |||||
# ('b', ['C', 'b']), | |||||
# ] | |||||
# p = Parser(RULES, 'a') | |||||
# for x in p.parse('CC'): | |||||
# print x.pretty() | |||||
#--------------- | |||||
# RULES = [ | |||||
# ('s', ['a', 'a']), | |||||
# ('a', ['b', 'b']), | |||||
# ('b', ['C'], lambda (x,): x), | |||||
# ('b', ['b', 'C']), | |||||
# ] | |||||
# p = Parser(RULES, 's', {}) | |||||
# print p.parse('CCCCC').pretty() |
@@ -1,20 +1,8 @@ | |||||
from ..utils import bfs, fzset | from ..utils import bfs, fzset | ||||
from ..common import GrammarError, is_terminal | from ..common import GrammarError, is_terminal | ||||
from ..grammar import Rule | |||||
class Rule(object): | |||||
""" | |||||
origin : a symbol | |||||
expansion : a list of symbols | |||||
""" | |||||
def __init__(self, origin, expansion, alias=None, options=None): | |||||
self.origin = origin | |||||
self.expansion = expansion | |||||
self.alias = alias | |||||
self.options = options | |||||
def __repr__(self): | |||||
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||||
class RulePtr(object): | class RulePtr(object): | ||||
def __init__(self, rule, index): | def __init__(self, rule, index): | ||||
@@ -106,28 +94,30 @@ def calculate_sets(rules): | |||||
class GrammarAnalyzer(object): | class GrammarAnalyzer(object): | ||||
def __init__(self, rule_tuples, start_symbol, debug=False): | |||||
self.start_symbol = start_symbol | |||||
def __init__(self, parser_conf, debug=False): | |||||
rules = parser_conf.rules | |||||
assert len(rules) == len(set(rules)) | |||||
self.start_symbol = parser_conf.start | |||||
self.debug = debug | self.debug = debug | ||||
rule_tuples = list(rule_tuples) | |||||
rule_tuples.append(('$root', [start_symbol, '$end'])) | |||||
rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples] | |||||
self.rules = set() | |||||
self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples} | |||||
for origin, exp, alias, options in rule_tuples: | |||||
r = Rule( origin, exp, alias, options ) | |||||
self.rules.add(r) | |||||
self.rules_by_origin[origin].append(r) | |||||
for r in self.rules: | |||||
root_rule = Rule('$root', [self.start_symbol, '$END']) | |||||
self.rules_by_origin = {r.origin: [] for r in rules} | |||||
for r in rules: | |||||
self.rules_by_origin[r.origin].append(r) | |||||
self.rules_by_origin[root_rule.origin] = [root_rule] | |||||
for r in rules: | |||||
for sym in r.expansion: | for sym in r.expansion: | ||||
if not (is_terminal(sym) or sym in self.rules_by_origin): | if not (is_terminal(sym) or sym in self.rules_by_origin): | ||||
raise GrammarError("Using an undefined rule: %s" % sym) | raise GrammarError("Using an undefined rule: %s" % sym) | ||||
self.init_state = self.expand_rule('$root') | |||||
self.start_state = self.expand_rule('$root') | |||||
self.rules = rules | |||||
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) | |||||
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule]) | |||||
def expand_rule(self, rule): | def expand_rule(self, rule): | ||||
"Returns all init_ptrs accessible by rule (recursive)" | "Returns all init_ptrs accessible by rule (recursive)" | ||||
@@ -14,7 +14,43 @@ from ..common import GrammarError, is_terminal | |||||
from .grammar_analysis import GrammarAnalyzer | from .grammar_analysis import GrammarAnalyzer | ||||
ACTION_SHIFT = 0 | |||||
class Action: | |||||
def __init__(self, name): | |||||
self.name = name | |||||
def __str__(self): | |||||
return self.name | |||||
def __repr__(self): | |||||
return str(self) | |||||
Shift = Action('Shift') | |||||
Reduce = Action('Reduce') | |||||
class ParseTable: | |||||
def __init__(self, states, start_state, end_state): | |||||
self.states = states | |||||
self.start_state = start_state | |||||
self.end_state = end_state | |||||
class IntParseTable(ParseTable): | |||||
@classmethod | |||||
def from_ParseTable(cls, parse_table): | |||||
enum = list(parse_table.states) | |||||
state_to_idx = {s:i for i,s in enumerate(enum)} | |||||
int_states = {} | |||||
for s, la in parse_table.states.items(): | |||||
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v | |||||
for k,v in la.items()} | |||||
int_states[ state_to_idx[s] ] = la | |||||
start_state = state_to_idx[parse_table.start_state] | |||||
end_state = state_to_idx[parse_table.end_state] | |||||
return cls(int_states, start_state, end_state) | |||||
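To make the renumbering concrete, here is a small hedged illustration of what IntParseTable.from_ParseTable produces (the state names and the rule object are invented, not taken from a real table):

    # states as built by the analyzer, keyed by frozensets of rule pointers:
    #     {S0: {'A': (Shift, S1)}, S1: {'$END': (Reduce, rule_x)}}
    # after from_ParseTable, the same actions keyed by integers; only Shift
    # targets are renumbered, Reduce entries are passed through unchanged:
    #     {0: {'A': (Shift, 1)}, 1: {'$END': (Reduce, rule_x)}}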
class LALR_Analyzer(GrammarAnalyzer): | class LALR_Analyzer(GrammarAnalyzer): | ||||
@@ -27,7 +63,7 @@ class LALR_Analyzer(GrammarAnalyzer): | |||||
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) | sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) | ||||
for rp in sat: | for rp in sat: | ||||
for term in self.FOLLOW.get(rp.rule.origin, ()): | for term in self.FOLLOW.get(rp.rule.origin, ()): | ||||
lookahead[term].append(('reduce', rp.rule)) | |||||
lookahead[term].append((Reduce, rp.rule)) | |||||
d = classify(unsat, lambda rp: rp.next) | d = classify(unsat, lambda rp: rp.next) | ||||
for sym, rps in d.items(): | for sym, rps in d.items(): | ||||
@@ -38,8 +74,8 @@ class LALR_Analyzer(GrammarAnalyzer): | |||||
rps |= self.expand_rule(rp.next) | rps |= self.expand_rule(rp.next) | ||||
new_state = fzset(rps) | new_state = fzset(rps) | ||||
lookahead[sym].append(('shift', new_state)) | |||||
if sym == '$end': | |||||
lookahead[sym].append((Shift, new_state)) | |||||
if sym == '$END': | |||||
self.end_states.append( new_state ) | self.end_states.append( new_state ) | ||||
yield fzset(rps) | yield fzset(rps) | ||||
@@ -50,7 +86,7 @@ class LALR_Analyzer(GrammarAnalyzer): | |||||
for x in v: | for x in v: | ||||
# XXX resolving shift/reduce into shift, like PLY | # XXX resolving shift/reduce into shift, like PLY | ||||
# Give a proper warning | # Give a proper warning | ||||
if x[0] == 'shift': | |||||
if x[0] is Shift: | |||||
lookahead[k] = [x] | lookahead[k] = [x] | ||||
for k, v in lookahead.items(): | for k, v in lookahead.items(): | ||||
@@ -59,22 +95,15 @@ class LALR_Analyzer(GrammarAnalyzer): | |||||
self.states[state] = {k:v[0] for k, v in lookahead.items()} | self.states[state] = {k:v[0] for k, v in lookahead.items()} | ||||
for _ in bfs([self.init_state], step): | |||||
for _ in bfs([self.start_state], step): | |||||
pass | pass | ||||
self.end_state ,= self.end_states | self.end_state ,= self.end_states | ||||
# -- | |||||
self.enum = list(self.states) | |||||
self.enum_rev = {s:i for i,s in enumerate(self.enum)} | |||||
self.states_idx = {} | |||||
for s, la in self.states.items(): | |||||
la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' | |||||
else (v[0], (v[1], len(v[1].expansion))) # Reduce | |||||
for k,v in la.items()} | |||||
self.states_idx[ self.enum_rev[s] ] = la | |||||
self._parse_table = ParseTable(self.states, self.start_state, self.end_state) | |||||
if self.debug: | |||||
self.parse_table = self._parse_table | |||||
else: | |||||
self.parse_table = IntParseTable.from_ParseTable(self._parse_table) | |||||
self.init_state_idx = self.enum_rev[self.init_state] | |||||
self.end_state_idx = self.enum_rev[self.end_state] |
@@ -3,30 +3,30 @@ | |||||
# Author: Erez Shinan (2017) | # Author: Erez Shinan (2017) | ||||
# Email : erezshin@gmail.com | # Email : erezshin@gmail.com | ||||
from ..common import ParseError, UnexpectedToken | |||||
from ..common import UnexpectedToken | |||||
from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT | |||||
class FinalReduce: | |||||
def __init__(self, value): | |||||
self.value = value | |||||
from .lalr_analysis import LALR_Analyzer, Shift | |||||
class Parser: | class Parser: | ||||
def __init__(self, parser_conf): | def __init__(self, parser_conf): | ||||
assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" | |||||
self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) | |||||
assert all(r.options is None or r.options.priority is None | |||||
for r in parser_conf.rules), "LALR doesn't yet support prioritization" | |||||
self.analysis = analysis = LALR_Analyzer(parser_conf) | |||||
analysis.compute_lookahead() | analysis.compute_lookahead() | ||||
callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
             for rule in analysis.rules}
self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks) | |||||
self.parser_conf = parser_conf | |||||
self.parser = _Parser(analysis.parse_table, callbacks) | |||||
self.parse = self.parser.parse | self.parse = self.parser.parse | ||||
###{standalone | |||||
class _Parser: | class _Parser: | ||||
def __init__(self, states, init_state, end_state, callbacks): | |||||
self.states = states | |||||
self.init_state = init_state | |||||
self.end_state = end_state | |||||
def __init__(self, parse_table, callbacks): | |||||
self.states = parse_table.states | |||||
self.start_state = parse_table.start_state | |||||
self.end_state = parse_table.end_state | |||||
self.callbacks = callbacks | self.callbacks = callbacks | ||||
def parse(self, seq, set_state=None): | def parse(self, seq, set_state=None): | ||||
@@ -35,10 +35,10 @@ class _Parser: | |||||
stream = iter(seq) | stream = iter(seq) | ||||
states = self.states | states = self.states | ||||
state_stack = [self.init_state] | |||||
state_stack = [self.start_state] | |||||
value_stack = [] | value_stack = [] | ||||
if set_state: set_state(self.init_state) | |||||
if set_state: set_state(self.start_state) | |||||
def get_action(key): | def get_action(key): | ||||
state = state_stack[-1] | state = state_stack[-1] | ||||
@@ -49,7 +49,8 @@ class _Parser: | |||||
raise UnexpectedToken(token, expected, seq, i) | raise UnexpectedToken(token, expected, seq, i) | ||||
def reduce(rule, size): | |||||
def reduce(rule): | |||||
size = len(rule.expansion) | |||||
if size: | if size: | ||||
s = value_stack[-size:] | s = value_stack[-size:] | ||||
del state_stack[-size:] | del state_stack[-size:] | ||||
@@ -60,7 +61,7 @@ class _Parser: | |||||
value = self.callbacks[rule](s) | value = self.callbacks[rule](s) | ||||
_action, new_state = get_action(rule.origin) | _action, new_state = get_action(rule.origin) | ||||
assert _action == ACTION_SHIFT | |||||
assert _action is Shift | |||||
state_stack.append(new_state) | state_stack.append(new_state) | ||||
value_stack.append(value) | value_stack.append(value) | ||||
@@ -72,22 +73,24 @@ class _Parser: | |||||
action, arg = get_action(token.type) | action, arg = get_action(token.type) | ||||
assert arg != self.end_state | assert arg != self.end_state | ||||
if action == ACTION_SHIFT: | |||||
if action is Shift: | |||||
state_stack.append(arg) | state_stack.append(arg) | ||||
value_stack.append(token) | value_stack.append(token) | ||||
if set_state: set_state(arg) | if set_state: set_state(arg) | ||||
token = next(stream) | token = next(stream) | ||||
i += 1 | i += 1 | ||||
else: | else: | ||||
reduce(*arg) | |||||
reduce(arg) | |||||
except StopIteration: | except StopIteration: | ||||
pass | pass | ||||
while True: | while True: | ||||
_action, arg = get_action('$end') | |||||
if _action == ACTION_SHIFT: | |||||
_action, arg = get_action('$END') | |||||
if _action is Shift: | |||||
assert arg == self.end_state | assert arg == self.end_state | ||||
val ,= value_stack | val ,= value_stack | ||||
return val | return val | ||||
else: | else: | ||||
reduce(*arg) | |||||
reduce(arg) | |||||
###} |
@@ -20,7 +20,7 @@ | |||||
from collections import defaultdict | from collections import defaultdict | ||||
from ..common import ParseError, UnexpectedToken, Terminal | |||||
from ..common import ParseError, UnexpectedToken, is_terminal | |||||
from ..lexer import Token, UnexpectedInput | from ..lexer import Token, UnexpectedInput | ||||
from ..tree import Tree | from ..tree import Tree | ||||
from .grammar_analysis import GrammarAnalyzer | from .grammar_analysis import GrammarAnalyzer | ||||
@@ -28,37 +28,34 @@ from .grammar_analysis import GrammarAnalyzer | |||||
from .earley import ApplyCallbacks, Item, Column | from .earley import ApplyCallbacks, Item, Column | ||||
class Parser: | class Parser: | ||||
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): | |||||
self.analysis = GrammarAnalyzer(rules, start_symbol) | |||||
self.start_symbol = start_symbol | |||||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): | |||||
self.analysis = GrammarAnalyzer(parser_conf) | |||||
self.parser_conf = parser_conf | |||||
self.resolve_ambiguity = resolve_ambiguity | self.resolve_ambiguity = resolve_ambiguity | ||||
self.ignore = list(ignore) | self.ignore = list(ignore) | ||||
self.predict_all = predict_all | self.predict_all = predict_all | ||||
self.FIRST = self.analysis.FIRST | |||||
self.postprocess = {} | self.postprocess = {} | ||||
self.predictions = {} | self.predictions = {} | ||||
self.FIRST = {} | |||||
for rule in self.analysis.rules: | |||||
if rule.origin != '$root': # XXX kinda ugly | |||||
a = rule.alias | |||||
self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) | |||||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||||
for rule in parser_conf.rules: | |||||
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||||
self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] | |||||
self.term_matcher = term_matcher | |||||
def parse(self, stream, start_symbol=None): | def parse(self, stream, start_symbol=None): | ||||
# Define parser functions | # Define parser functions | ||||
start_symbol = start_symbol or self.start_symbol | |||||
start_symbol = start_symbol or self.parser_conf.start | |||||
delayed_matches = defaultdict(list) | delayed_matches = defaultdict(list) | ||||
match = self.term_matcher | |||||
text_line = 1 | text_line = 1 | ||||
text_column = 0 | text_column = 0 | ||||
def predict(nonterm, column): | def predict(nonterm, column): | ||||
assert not isinstance(nonterm, Terminal), nonterm | |||||
assert not is_terminal(nonterm), nonterm | |||||
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | ||||
def complete(item): | def complete(item): | ||||
@@ -77,16 +74,15 @@ class Parser: | |||||
column.add( predict(nonterm, column) ) | column.add( predict(nonterm, column) ) | ||||
for item in to_reduce: | for item in to_reduce: | ||||
new_items = list(complete(item)) | new_items = list(complete(item)) | ||||
for new_item in new_items: | |||||
if new_item.similar(item): | |||||
raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) | |||||
if item in new_items: | |||||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||||
column.add(new_items) | column.add(new_items) | ||||
def scan(i, token, column): | def scan(i, token, column): | ||||
to_scan = column.to_scan | to_scan = column.to_scan | ||||
for x in self.ignore: | for x in self.ignore: | ||||
m = x.match(stream, i) | |||||
m = match(x, stream, i) | |||||
if m: | if m: | ||||
delayed_matches[m.end()] += set(to_scan) | delayed_matches[m.end()] += set(to_scan) | ||||
delayed_matches[m.end()] += set(column.to_reduce) | delayed_matches[m.end()] += set(column.to_reduce) | ||||
@@ -99,16 +95,16 @@ class Parser: | |||||
# delayed_matches[m.end()] += to_scan | # delayed_matches[m.end()] += to_scan | ||||
for item in to_scan: | for item in to_scan: | ||||
m = item.expect.match(stream, i) | |||||
m = match(item.expect, stream, i) | |||||
if m: | if m: | ||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||||
t = Token(item.expect, m.group(0), i, text_line, text_column) | |||||
delayed_matches[m.end()].append(item.advance(t)) | delayed_matches[m.end()].append(item.advance(t)) | ||||
s = m.group(0) | s = m.group(0) | ||||
for j in range(1, len(s)): | for j in range(1, len(s)): | ||||
m = item.expect.match(s[:-j]) | |||||
m = match(item.expect, s[:-j]) | |||||
if m: | if m: | ||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||||
t = Token(item.expect, m.group(0), i, text_line, text_column) | |||||
delayed_matches[i+m.end()].append(item.advance(t)) | delayed_matches[i+m.end()].append(item.advance(t)) | ||||
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | ||||
@@ -131,7 +127,7 @@ class Parser: | |||||
if token == '\n': | if token == '\n': | ||||
text_line += 1 | text_line += 1 | ||||
text_column = 1 | |||||
text_column = 0 | |||||
else: | else: | ||||
text_column += 1 | text_column += 1 | ||||
@@ -143,7 +139,7 @@ class Parser: | |||||
if n.rule.origin==start_symbol and n.start is column0] | if n.rule.origin==start_symbol and n.start is column0] | ||||
if not solutions: | if not solutions: | ||||
expected_tokens = [t.expect.name for t in column.to_scan] | |||||
expected_tokens = [t.expect for t in column.to_scan] | |||||
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
elif len(solutions) == 1: | elif len(solutions) == 1: | ||||
@@ -0,0 +1,203 @@ | |||||
###{standalone | |||||
# | |||||
# | |||||
# Lark Stand-alone Generator Tool | |||||
# ---------------------------------- | |||||
# Generates a stand-alone LALR(1) parser with a standard lexer | |||||
# | |||||
# Git: https://github.com/erezsh/lark | |||||
# Author: Erez Shinan (erezshin@gmail.com) | |||||
# | |||||
# | |||||
# >>> LICENSE | |||||
# | |||||
# This tool and its generated code use a separate license from Lark. | |||||
# | |||||
# It is licensed under GPLv2 or above. | |||||
# | |||||
# If you wish to purchase a commercial license for this tool and its | |||||
# generated code, contact me via email. | |||||
# | |||||
# This program is free software: you can redistribute it and/or modify | |||||
# it under the terms of the GNU General Public License as published by | |||||
# the Free Software Foundation, either version 2 of the License, or | |||||
# (at your option) any later version. | |||||
# | |||||
# This program is distributed in the hope that it will be useful, | |||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
# GNU General Public License for more details. | |||||
# | |||||
# See <http://www.gnu.org/licenses/>. | |||||
# | |||||
# | |||||
###} | |||||
import codecs | |||||
import sys | |||||
import os | |||||
from pprint import pprint | |||||
from os import path | |||||
from collections import defaultdict | |||||
import lark | |||||
from lark import Lark | |||||
from lark.parsers.lalr_analysis import Shift, Reduce | |||||
from ..grammar import Rule | |||||
__dir__ = path.dirname(__file__) | |||||
__larkdir__ = path.join(__dir__, path.pardir) | |||||
EXTRACT_STANDALONE_FILES = [ | |||||
'tools/standalone.py', | |||||
'utils.py', | |||||
'common.py', | |||||
'tree.py', | |||||
'indenter.py', | |||||
'lexer.py', | |||||
'parse_tree_builder.py', | |||||
'parsers/lalr_parser.py', | |||||
] | |||||
def extract_sections(lines): | |||||
section = None | |||||
text = [] | |||||
sections = defaultdict(list) | |||||
for l in lines: | |||||
if l.startswith('###'): | |||||
if l[3] == '{': | |||||
section = l[4:].strip() | |||||
elif l[3] == '}': | |||||
sections[section] += text | |||||
section = None | |||||
text = [] | |||||
else: | |||||
raise ValueError(l) | |||||
elif section: | |||||
text.append(l) | |||||
return {name:''.join(text) for name, text in sections.items()} | |||||
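As a hedged illustration (the input lines below are invented, not taken from the real sources), extract_sections turns the ###{name ... ###} markers used throughout this change into a dict of section bodies:

    # lines = [
    #     "###{standalone\n",
    #     "def f():\n",
    #     "    return 1\n",
    #     "###}\n",
    #     "print('not extracted')\n",   # outside any section, so dropped
    # ]
    # extract_sections(lines) == {'standalone': 'def f():\n    return 1\n'}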
class LexerAtoms: | |||||
def __init__(self, lexer): | |||||
self.mres = [(p.pattern,d) for p,d in lexer.mres] | |||||
self.newline_types = lexer.newline_types | |||||
self.ignore_types = lexer.ignore_types | |||||
self.callback = {name:[(p.pattern,d) for p,d in c.mres] | |||||
for name, c in lexer.callback.items()} | |||||
def print_python(self): | |||||
print('import re') | |||||
print('MRES = (') | |||||
pprint(self.mres) | |||||
print(')') | |||||
print('LEXER_CALLBACK = (') | |||||
pprint(self.callback) | |||||
print(')') | |||||
print('NEWLINE_TYPES = %s' % self.newline_types) | |||||
print('IGNORE_TYPES = %s' % self.ignore_types) | |||||
print('class LexerRegexps: pass') | |||||
print('lexer_regexps = LexerRegexps()') | |||||
print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') | |||||
print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') | |||||
print(' for n, mres in LEXER_CALLBACK.items()}') | |||||
print('lexer = _Lex(lexer_regexps)') | |||||
print('def lex(stream):') | |||||
print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') | |||||
class GetRule: | |||||
def __init__(self, rule_id): | |||||
self.rule_id = rule_id | |||||
def __repr__(self): | |||||
return 'RULES[%d]' % self.rule_id | |||||
rule_ids = {} | |||||
token_types = {} | |||||
def _get_token_type(token_type): | |||||
if token_type not in token_types: | |||||
token_types[token_type] = len(token_types) | |||||
return token_types[token_type] | |||||
class ParserAtoms: | |||||
def __init__(self, parser): | |||||
self.parse_table = parser.analysis.parse_table | |||||
def print_python(self): | |||||
print('class ParseTable: pass') | |||||
print('parse_table = ParseTable()') | |||||
print('STATES = {') | |||||
for state, actions in self.parse_table.states.items(): | |||||
print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg)) | |||||
for token, (action, arg) in actions.items()})) | |||||
print('}') | |||||
print('TOKEN_TYPES = (') | |||||
pprint({v:k for k, v in token_types.items()}) | |||||
print(')') | |||||
print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}') | |||||
print(' for s, acts in STATES.items()}') | |||||
print('parse_table.start_state = %s' % self.parse_table.start_state) | |||||
print('parse_table.end_state = %s' % self.parse_table.end_state) | |||||
print('class Lark_StandAlone:') | |||||
print(' def __init__(self, transformer=None, postlex=None):') | |||||
print(' callback = parse_tree_builder.create_callback(transformer=transformer)') | |||||
print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}') | |||||
print(' self.parser = _Parser(parse_table, callbacks)') | |||||
print(' self.postlex = postlex') | |||||
print(' def parse(self, stream):') | |||||
print(' tokens = lex(stream)') | |||||
print(' if self.postlex: tokens = self.postlex.process(tokens)') | |||||
print(' return self.parser.parse(tokens)') | |||||
class TreeBuilderAtoms: | |||||
def __init__(self, lark): | |||||
self.rules = lark.rules | |||||
self.ptb = lark._parse_tree_builder | |||||
def print_python(self): | |||||
print('RULES = {') | |||||
for i, r in enumerate(self.rules): | |||||
rule_ids[r] = i | |||||
print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) | |||||
print('}') | |||||
print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') | |||||
def main(fn, start): | |||||
with codecs.open(fn, encoding='utf8') as f: | |||||
lark_inst = Lark(f, parser="lalr", start=start) | |||||
lexer_atoms = LexerAtoms(lark_inst.parser.lexer) | |||||
parser_atoms = ParserAtoms(lark_inst.parser.parser) | |||||
tree_builder_atoms = TreeBuilderAtoms(lark_inst) | |||||
print('# The file was automatically generated by Lark v%s' % lark.__version__) | |||||
for pyfile in EXTRACT_STANDALONE_FILES: | |||||
print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) | |||||
print(open(os.path.join(__larkdir__, 'grammar.py')).read()) | |||||
print('Shift = 0') | |||||
print('Reduce = 1') | |||||
lexer_atoms.print_python() | |||||
tree_builder_atoms.print_python() | |||||
parser_atoms.print_python() | |||||
if __name__ == '__main__': | |||||
if len(sys.argv) < 2: | |||||
print("Lark Stand-alone Generator Tool") | |||||
print("Usage: python -m lark.tools.standalone <grammar-file> [<start>]") | |||||
sys.exit(1) | |||||
if len(sys.argv) == 3: | |||||
fn, start = sys.argv[1:] | |||||
elif len(sys.argv) == 2: | |||||
fn, start = sys.argv[1], 'start' | |||||
else: | |||||
assert False, sys.argv | |||||
main(fn, start) |
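A minimal usage sketch for the generator (the file names are hypothetical; main() above simply prints the generated module to stdout, so the output has to be redirected to a file):

    # python -m lark.tools.standalone my_grammar.lark start > my_parser.py

    from my_parser import Lark_StandAlone   # class emitted by ParserAtoms.print_python

    parser = Lark_StandAlone()               # optionally pass transformer= and/or postlex=
    tree = parser.parse('text matching the grammar')
    print(tree.pretty())                     # Tree.pretty() is included in the standalone sections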
@@ -7,6 +7,7 @@ from copy import deepcopy | |||||
from .utils import inline_args | from .utils import inline_args | ||||
###{standalone | |||||
class Tree(object): | class Tree(object): | ||||
def __init__(self, data, children, rule=None): | def __init__(self, data, children, rule=None): | ||||
self.data = data | self.data = data | ||||
@@ -34,6 +35,7 @@ class Tree(object): | |||||
def pretty(self, indent_str=' '): | def pretty(self, indent_str=' '): | ||||
return ''.join(self._pretty(0, indent_str)) | return ''.join(self._pretty(0, indent_str)) | ||||
###} | |||||
def expand_kids_by_index(self, *indices): | def expand_kids_by_index(self, *indices): | ||||
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices | for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices | ||||
@@ -100,6 +102,7 @@ class Tree(object): | |||||
###{standalone | |||||
class Transformer(object): | class Transformer(object): | ||||
def _get_func(self, name): | def _get_func(self, name): | ||||
return getattr(self, name) | return getattr(self, name) | ||||
@@ -139,7 +142,7 @@ class TransformerChain(object): | |||||
def __mul__(self, other): | def __mul__(self, other): | ||||
return TransformerChain(*self.transformers + (other,)) | return TransformerChain(*self.transformers + (other,)) | ||||
class InlineTransformer(Transformer): | class InlineTransformer(Transformer): | ||||
@@ -196,6 +199,7 @@ class Transformer_NoRecurse(Transformer): | |||||
def __default__(self, t): | def __default__(self, t): | ||||
return t | return t | ||||
###} | |||||
def pydot__tree_to_png(tree, filename): | def pydot__tree_to_png(tree, filename): | ||||
@@ -1,7 +1,4 @@ | |||||
import functools | |||||
import types | |||||
from collections import deque | from collections import deque | ||||
from contextlib import contextmanager | |||||
class fzset(frozenset): | class fzset(frozenset): | ||||
def __repr__(self): | def __repr__(self): | ||||
@@ -49,8 +46,13 @@ try: | |||||
except NameError: # Python 3 | except NameError: # Python 3 | ||||
STRING_TYPE = str | STRING_TYPE = str | ||||
Str = type(u'') | |||||
###{standalone | |||||
import types | |||||
import functools | |||||
from contextlib import contextmanager | |||||
Str = type(u'') | |||||
def inline_args(f): | def inline_args(f): | ||||
# print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) | # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) | ||||
@@ -76,19 +78,6 @@ def inline_args(f): | |||||
return _f | return _f | ||||
try: | |||||
compare = cmp | |||||
except NameError: | |||||
def compare(a, b): | |||||
if a == b: | |||||
return 0 | |||||
elif a > b: | |||||
return 1 | |||||
else: | |||||
return -1 | |||||
try: | try: | ||||
from contextlib import suppress # Python 3 | from contextlib import suppress # Python 3 | ||||
except ImportError: | except ImportError: | ||||
@@ -107,6 +96,26 @@ except ImportError: | |||||
except excs: | except excs: | ||||
pass | pass | ||||
###} | |||||
try: | |||||
compare = cmp | |||||
except NameError: | |||||
def compare(a, b): | |||||
if a == b: | |||||
return 0 | |||||
elif a > b: | |||||
return 1 | |||||
else: | |||||
return -1 | |||||
import sre_parse | |||||
import sre_constants | |||||
def get_regexp_width(regexp): | |||||
try: | |||||
return sre_parse.parse(regexp).getwidth() | |||||
except sre_constants.error: | |||||
raise ValueError(regexp) |
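For reference, a few hedged examples of get_regexp_width, based on sre_parse.getwidth:

    # get_regexp_width('a')    == (1, 1)    exactly one character
    # get_regexp_width('ab?')  == (1, 2)    between one and two characters
    # get_regexp_width('(')    raises ValueError (sre_parse rejects the pattern)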
@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase): | |||||
r = T().transform(g.parse("x")) | r = T().transform(g.parse("x")) | ||||
self.assertEqual( r.children, ["<b>"] ) | self.assertEqual( r.children, ["<b>"] ) | ||||
g = Lark("""start: a | g = Lark("""start: a | ||||
?a : b | ?a : b | ||||
b : "x" | b : "x" | ||||
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase): | |||||
r = T().transform(g.parse("xx")) | r = T().transform(g.parse("xx")) | ||||
self.assertEqual( r.children, ["<c>"] ) | self.assertEqual( r.children, ["<c>"] ) | ||||
g = Lark("""start: a | g = Lark("""start: a | ||||
?a : b b -> c | ?a : b b -> c | ||||
b : "x" | b : "x" | ||||
""", parser='lalr', transformer=T()) | """, parser='lalr', transformer=T()) | ||||
r = g.parse("xx") | r = g.parse("xx") | ||||
self.assertEqual( r.children, ["<c>"] ) | self.assertEqual( r.children, ["<c>"] ) | ||||
@@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER): | |||||
# Fails an Earley implementation without special handling for empty rules, | # Fails an Earley implementation without special handling for empty rules, | ||||
# or re-processing of already completed rules. | # or re-processing of already completed rules. | ||||
g = Lark(r"""start: B | g = Lark(r"""start: B | ||||
B: ("ab"|/[^b]/)* | |||||
B: ("ab"|/[^b]/)+ | |||||
""", lexer=LEXER) | """, lexer=LEXER) | ||||
self.assertEqual( g.parse('abc').children[0], 'abc') | self.assertEqual( g.parse('abc').children[0], 'abc') | ||||
@@ -796,6 +796,49 @@ def _make_parser_test(LEXER, PARSER): | |||||
self.assertEqual(tree.children, ['a', 'A']) | self.assertEqual(tree.children, ['a', 'A']) | ||||
def test_twice_empty(self): | |||||
g = """!start: [["A"]] | |||||
""" | |||||
l = _Lark(g) | |||||
tree = l.parse('A') | |||||
self.assertEqual(tree.children, ['A']) | |||||
tree = l.parse('') | |||||
self.assertEqual(tree.children, []) | |||||
def test_undefined_ignore(self): | |||||
g = """!start: "A" | |||||
%ignore B | |||||
""" | |||||
self.assertRaises( GrammarError, _Lark, g) | |||||
@unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO | |||||
def test_line_and_column(self): | |||||
g = r"""!start: "A" bc "D" | |||||
!bc: "B\nC" | |||||
""" | |||||
l = _Lark(g) | |||||
a, bc, d = l.parse("AB\nCD").children | |||||
self.assertEqual(a.line, 1) | |||||
self.assertEqual(a.column, 0) | |||||
bc ,= bc.children | |||||
self.assertEqual(bc.line, 1) | |||||
self.assertEqual(bc.column, 1) | |||||
self.assertEqual(d.line, 2) | |||||
self.assertEqual(d.column, 1) | |||||
# self.assertEqual(a.end_line, 1) | |||||
# self.assertEqual(a.end_col, 1) | |||||
# self.assertEqual(bc.end_line, 2) | |||||
# self.assertEqual(bc.end_col, 1) | |||||
# self.assertEqual(d.end_line, 2) | |||||
# self.assertEqual(d.end_col, 2) | |||||
def test_reduce_cycle(self): | def test_reduce_cycle(self): | ||||
"""Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. | """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. | ||||
It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. | It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. | ||||
@@ -969,7 +1012,7 @@ def _make_parser_test(LEXER, PARSER): | |||||
parser = _Lark(grammar) | parser = _Lark(grammar) | ||||
tree = parser.parse("int 1 ! This is a comment\n") | |||||
tree = parser.parse("int 1 ! This is a comment\n") | |||||
self.assertEqual(tree.children, ['1']) | self.assertEqual(tree.children, ['1']) | ||||
tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! | tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! | ||||
@@ -983,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER): | |||||
self.assertEqual(tree.children, []) | self.assertEqual(tree.children, []) | ||||
@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") | @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") | ||||
def test_regex_escaping(self): | def test_regex_escaping(self): | ||||
g = _Lark("start: /[ab]/") | g = _Lark("start: /[ab]/") | ||||