@@ -12,6 +12,7 @@ Lark can: | |||
- Build a parse-tree automagically, no construction code required | |||
- Outperform all other Python libraries when using LALR(1) (Yes, including PLY) | |||
- Run on every Python interpreter (it's pure-python) | |||
- Generate a stand-alone parser (for LALR(1) grammars) | |||
And many more features. Read ahead and find out. | |||
@@ -66,10 +67,11 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) | |||
- Builds a parse-tree (AST) automagically, based on the structure of the grammar | |||
- **Earley** parser | |||
- Can parse *ALL* context-free grammars | |||
- Full support for ambiguity in grammar | |||
- Can parse all context-free grammars | |||
- Full support for ambiguous grammars | |||
- **LALR(1)** parser | |||
- Competitive with PLY | |||
- Fast and light, competitive with PLY | |||
- Can generate a stand-alone parser | |||
- **EBNF** grammar | |||
- **Unicode** fully supported | |||
- **Python 2 & 3** compatible | |||
@@ -86,7 +88,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/ | |||
#### Performance comparison | |||
Lower is better! | |||
Lark is the fastest and lightest (lower is better) | |||
 | |||
@@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail | |||
#### Feature comparison | |||
| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | |||
|:--------|:----------|:----|:--------|:------------|:------------ | |||
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | | |||
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | | |||
| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | | |||
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | | |||
| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | | |||
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | | |||
| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone | |||
|:--------|:----------|:----|:--------|:------------|:------------|:----------|:---------- | |||
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) | | |||
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No | | |||
| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No | | |||
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No | | |||
| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No | | |||
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No | | |||
(\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) | |||
(\* *PEGs cannot handle non-deterministic grammars. Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) | |||
### Projects using Lark | |||
@@ -0,0 +1 @@ | |||
python -m lark.tools.standalone json.g > json_parser.py |
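For context, a minimal sketch of how the generated module can be used on its own (assuming the command above wrote `json_parser.py` into the working directory; with no transformer passed, `parse()` returns a `Tree`):

```python
from json_parser import Lark_StandAlone

parser = Lark_StandAlone()              # no transformer: build a plain parse tree
tree = parser.parse('{"answer": 42}')
print(tree.pretty())                    # e.g. object / pair / string + number
```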
@@ -0,0 +1,21 @@ | |||
?start: value | |||
?value: object | |||
| array | |||
| string | |||
| SIGNED_NUMBER -> number | |||
| "true" -> true | |||
| "false" -> false | |||
| "null" -> null | |||
array : "[" [value ("," value)*] "]" | |||
object : "{" [pair ("," pair)*] "}" | |||
pair : string ":" value | |||
string : ESCAPED_STRING | |||
%import common.ESCAPED_STRING | |||
%import common.SIGNED_NUMBER | |||
%import common.WS | |||
%ignore WS |
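The same grammar file also works with the regular, non-standalone API; a quick sketch for comparison (assuming `json.g` is readable from the working directory and the `lark` package is installed):

```python
from lark import Lark

json_parser = Lark(open('json.g').read(), parser='lalr')  # LALR(1), matching the standalone tool
tree = json_parser.parse('{"items": [1, 2, 3]}')
print(tree.pretty())
```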
@@ -0,0 +1,794 @@ | |||
# The file was automatically generated by Lark v0.5.2 | |||
# | |||
# | |||
# Lark Stand-alone Generator Tool | |||
# ---------------------------------- | |||
# Generates a stand-alone LALR(1) parser with a standard lexer | |||
# | |||
# Git: https://github.com/erezsh/lark | |||
# Author: Erez Shinan (erezshin@gmail.com) | |||
# | |||
# | |||
# >>> LICENSE | |||
# | |||
# This tool and its generated code use a separate license from Lark. | |||
# | |||
# It is licensed under GPLv2 or above. | |||
# | |||
# If you wish to purchase a commercial license for this tool and its | |||
# generated code, contact me via email. | |||
# | |||
# This program is free software: you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation, either version 2 of the License, or | |||
# (at your option) any later version. | |||
# | |||
# This program is distributed in the hope that it will be useful, | |||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
# GNU General Public License for more details. | |||
# | |||
# See <http://www.gnu.org/licenses/>. | |||
# | |||
# | |||
import types | |||
import functools | |||
from contextlib import contextmanager | |||
Str = type(u'') | |||
def inline_args(f): | |||
# print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) | |||
if isinstance(f, types.FunctionType): | |||
@functools.wraps(f) | |||
def _f_func(self, args): | |||
return f(self, *args) | |||
return _f_func | |||
elif isinstance(f, (type, types.BuiltinFunctionType)): | |||
@functools.wraps(f) | |||
def _f_builtin(_self, args): | |||
return f(*args) | |||
return _f_builtin | |||
elif isinstance(f, types.MethodType): | |||
@functools.wraps(f.__func__) | |||
def _f(self, args): | |||
return f.__func__(self, *args) | |||
return _f | |||
else: | |||
@functools.wraps(f.__call__.__func__) | |||
def _f(self, args): | |||
return f.__call__.__func__(self, *args) | |||
return _f | |||
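# Illustration (not part of the generated file): inline_args adapts a callback
# that would receive a single list of children into one that receives them as
# separate positional arguments. For builtins/types the `self` argument is
# dropped; for plain functions and methods it is forwarded.
#
#     to_number = inline_args(float)      # builtin branch
#     to_number(None, ['3.14'])           # -> 3.14
#
# Transformer subclasses use it the same way, e.g. `number = inline_args(float)`.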
try: | |||
from contextlib import suppress # Python 3 | |||
except ImportError: | |||
@contextmanager | |||
def suppress(*excs): | |||
'''Catch and dismiss the provided exception | |||
>>> x = 'hello' | |||
>>> with suppress(IndexError): | |||
... x = x[10] | |||
>>> x | |||
'hello' | |||
''' | |||
try: | |||
yield | |||
except excs: | |||
pass | |||
def is_terminal(sym): | |||
return sym.isupper() | |||
class GrammarError(Exception): | |||
pass | |||
class ParseError(Exception): | |||
pass | |||
class UnexpectedToken(ParseError): | |||
def __init__(self, token, expected, seq, index): | |||
self.token = token | |||
self.expected = expected | |||
self.line = getattr(token, 'line', '?') | |||
self.column = getattr(token, 'column', '?') | |||
try: | |||
context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) | |||
except AttributeError: | |||
context = seq[index:index+5] | |||
except TypeError: | |||
context = "<no context>" | |||
message = ("Unexpected token %r at line %s, column %s.\n" | |||
"Expected: %s\n" | |||
"Context: %s" % (token, self.line, self.column, expected, context)) | |||
super(UnexpectedToken, self).__init__(message) | |||
class Tree(object): | |||
def __init__(self, data, children): | |||
self.data = data | |||
self.children = list(children) | |||
def __repr__(self): | |||
return 'Tree(%s, %s)' % (self.data, self.children) | |||
def _pretty_label(self): | |||
return self.data | |||
def _pretty(self, level, indent_str): | |||
if len(self.children) == 1 and not isinstance(self.children[0], Tree): | |||
return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n'] | |||
l = [ indent_str*level, self._pretty_label(), '\n' ] | |||
for n in self.children: | |||
if isinstance(n, Tree): | |||
l += n._pretty(level+1, indent_str) | |||
else: | |||
l += [ indent_str*(level+1), '%s' % n, '\n' ] | |||
return l | |||
def pretty(self, indent_str=' '): | |||
return ''.join(self._pretty(0, indent_str)) | |||
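# Illustration (not part of the generated file): a Tree is just a label plus a
# list of children, and pretty() renders it one node per line, e.g.
#
#     t = Tree('pair', [Tree('string', ['"a"']), Tree('number', ['1'])])
#     print(t.pretty())
#     # pair
#     #   string   "a"
#     #   number   1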
class Transformer(object): | |||
def _get_func(self, name): | |||
return getattr(self, name) | |||
def transform(self, tree): | |||
items = [] | |||
for c in tree.children: | |||
try: | |||
items.append(self.transform(c) if isinstance(c, Tree) else c) | |||
except Discard: | |||
pass | |||
try: | |||
f = self._get_func(tree.data) | |||
except AttributeError: | |||
return self.__default__(tree.data, items) | |||
else: | |||
return f(items) | |||
def __default__(self, data, children): | |||
return Tree(data, children) | |||
def __mul__(self, other): | |||
return TransformerChain(self, other) | |||
class Discard(Exception): | |||
pass | |||
class TransformerChain(object): | |||
def __init__(self, *transformers): | |||
self.transformers = transformers | |||
def transform(self, tree): | |||
for t in self.transformers: | |||
tree = t.transform(tree) | |||
return tree | |||
def __mul__(self, other): | |||
return TransformerChain(*self.transformers + (other,)) | |||
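# Illustration (not part of the generated file): transformers compose with `*`,
# producing a TransformerChain that applies them left to right. The class names
# here are hypothetical:
#
#     pipeline = ResolveNames() * FoldConstants()   # TransformerChain
#     result = pipeline.transform(tree)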
class InlineTransformer(Transformer): | |||
def _get_func(self, name): # use super()._get_func | |||
return inline_args(getattr(self, name)).__get__(self) | |||
class Visitor(object): | |||
def visit(self, tree): | |||
for child in tree.children: | |||
if isinstance(child, Tree): | |||
self.visit(child) | |||
f = getattr(self, tree.data, self.__default__) | |||
f(tree) | |||
return tree | |||
def __default__(self, tree): | |||
pass | |||
class Visitor_NoRecurse(Visitor): | |||
def visit(self, tree): | |||
subtrees = list(tree.iter_subtrees()) | |||
for subtree in subtrees: | |||
getattr(self, subtree.data, self.__default__)(subtree) | |||
return tree | |||
class Transformer_NoRecurse(Transformer): | |||
def transform(self, tree): | |||
subtrees = list(tree.iter_subtrees()) | |||
def _t(t): | |||
# Assumes t is already transformed | |||
try: | |||
f = self._get_func(t.data) | |||
except AttributeError: | |||
return self.__default__(t) | |||
else: | |||
return f(t) | |||
for subtree in subtrees: | |||
children = [] | |||
for c in subtree.children: | |||
try: | |||
children.append(_t(c) if isinstance(c, Tree) else c) | |||
except Discard: | |||
pass | |||
subtree.children = children | |||
return _t(tree) | |||
def __default__(self, t): | |||
return t | |||
class Indenter: | |||
def __init__(self): | |||
self.paren_level = 0 | |||
self.indent_level = [0] | |||
def handle_NL(self, token): | |||
if self.paren_level > 0: | |||
return | |||
yield token | |||
indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces | |||
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len | |||
if indent > self.indent_level[-1]: | |||
self.indent_level.append(indent) | |||
yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) | |||
else: | |||
while indent < self.indent_level[-1]: | |||
self.indent_level.pop() | |||
yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) | |||
assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | |||
def process(self, stream): | |||
for token in stream: | |||
if token.type == self.NL_type: | |||
for t in self.handle_NL(token): | |||
yield t | |||
else: | |||
yield token | |||
if token.type in self.OPEN_PAREN_types: | |||
self.paren_level += 1 | |||
elif token.type in self.CLOSE_PAREN_types: | |||
self.paren_level -= 1 | |||
assert self.paren_level >= 0 | |||
while len(self.indent_level) > 1: | |||
self.indent_level.pop() | |||
yield Token(self.DEDENT_type, '') | |||
assert self.indent_level == [0], self.indent_level | |||
# XXX Hack for ContextualLexer. Maybe there's a more elegant solution? | |||
@property | |||
def always_accept(self): | |||
return (self.NL_type,) | |||
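# Illustration (not part of the generated file): Indenter is a post-lexing step
# for indentation-sensitive grammars. A concrete subclass supplies the token
# names and tab width used above (the values below are hypothetical) and is
# passed to the parser via the `postlex` argument:
#
#     class MyIndenter(Indenter):
#         NL_type = '_NEWLINE'
#         OPEN_PAREN_types = ['__LPAR', '__LSQB', '__LBRACE']
#         CLOSE_PAREN_types = ['__RPAR', '__RSQB', '__RBRACE']
#         INDENT_type = '_INDENT'
#         DEDENT_type = '_DEDENT'
#         tab_len = 8
#
#     parser = Lark_StandAlone(postlex=MyIndenter())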
class LexError(Exception): | |||
pass | |||
class UnexpectedInput(LexError): | |||
def __init__(self, seq, lex_pos, line, column, allowed=None): | |||
context = seq[lex_pos:lex_pos+5] | |||
message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) | |||
super(UnexpectedInput, self).__init__(message) | |||
self.line = line | |||
self.column = column | |||
self.context = context | |||
self.allowed = allowed | |||
class Token(Str): | |||
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): | |||
inst = Str.__new__(cls, value) | |||
inst.type = type_ | |||
inst.pos_in_stream = pos_in_stream | |||
inst.value = value | |||
inst.line = line | |||
inst.column = column | |||
return inst | |||
@classmethod | |||
def new_borrow_pos(cls, type_, value, borrow_t): | |||
return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) | |||
def __repr__(self): | |||
return 'Token(%s, %r)' % (self.type, self.value) | |||
def __deepcopy__(self, memo): | |||
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) | |||
def __eq__(self, other): | |||
if isinstance(other, Token) and self.type != other.type: | |||
return False | |||
return Str.__eq__(self, other) | |||
__hash__ = Str.__hash__ | |||
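# Illustration (not part of the generated file): a Token behaves like the string
# it matched, with lexing metadata attached:
#
#     tok = Token('SIGNED_NUMBER', '42', pos_in_stream=0, line=1, column=4)
#     tok == '42'                     # True  (compares as a string)
#     tok.type                        # 'SIGNED_NUMBER'
#     Token('__COMMA', '42') == tok   # False (same text, different type)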
class LineCounter: | |||
def __init__(self): | |||
self.newline_char = '\n' | |||
self.char_pos = 0 | |||
self.line = 1 | |||
self.column = 0 | |||
self.line_start_pos = 0 | |||
def feed(self, token, test_newline=True): | |||
"""Consume a token and calculate the new line & column. | |||
As an optional optimization, set test_newline=False if the token doesn't contain a newline. | |||
""" | |||
if test_newline: | |||
newlines = token.count(self.newline_char) | |||
if newlines: | |||
self.line += newlines | |||
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 | |||
self.char_pos += len(token) | |||
self.column = self.char_pos - self.line_start_pos | |||
class _Lex: | |||
"Built to serve both Lexer and ContextualLexer" | |||
def __init__(self, lexer): | |||
self.lexer = lexer | |||
def lex(self, stream, newline_types, ignore_types): | |||
newline_types = list(newline_types) | |||
ignore_types = list(ignore_types) | |||
line_ctr = LineCounter() | |||
while True: | |||
lexer = self.lexer | |||
for mre, type_from_index in lexer.mres: | |||
m = mre.match(stream, line_ctr.char_pos) | |||
if m: | |||
value = m.group(0) | |||
type_ = type_from_index[m.lastindex] | |||
if type_ not in ignore_types: | |||
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||
if t.type in lexer.callback: | |||
t = lexer.callback[t.type](t) | |||
lexer = yield t | |||
line_ctr.feed(value, type_ in newline_types) | |||
break | |||
else: | |||
if line_ctr.char_pos < len(stream): | |||
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||
break | |||
class UnlessCallback: | |||
def __init__(self, mres): | |||
self.mres = mres | |||
def __call__(self, t): | |||
for mre, type_from_index in self.mres: | |||
m = mre.match(t.value) | |||
if m: | |||
value = m.group(0) | |||
t.type = type_from_index[m.lastindex] | |||
break | |||
return t | |||
class NodeBuilder: | |||
def __init__(self, tree_class, name): | |||
self.tree_class = tree_class | |||
self.name = name | |||
def __call__(self, children): | |||
return self.tree_class(self.name, children) | |||
class Expand1: | |||
def __init__(self, node_builder): | |||
self.node_builder = node_builder | |||
def __call__(self, children): | |||
if len(children) == 1: | |||
return children[0] | |||
else: | |||
return self.node_builder(children) | |||
class Factory: | |||
def __init__(self, cls, *args): | |||
self.cls = cls | |||
self.args = args | |||
def __call__(self, node_builder): | |||
return self.cls(node_builder, *self.args) | |||
class TokenWrapper: | |||
"Used for fixing the results of scanless parsing" | |||
def __init__(self, node_builder, token_name): | |||
self.node_builder = node_builder | |||
self.token_name = token_name | |||
def __call__(self, children): | |||
return self.node_builder( [Token(self.token_name, ''.join(children))] ) | |||
def identity(node_builder): | |||
return node_builder | |||
class ChildFilter: | |||
def __init__(self, node_builder, to_include): | |||
self.node_builder = node_builder | |||
self.to_include = to_include | |||
def __call__(self, children): | |||
filtered = [] | |||
for i, to_expand in self.to_include: | |||
if to_expand: | |||
filtered += children[i].children | |||
else: | |||
filtered.append(children[i]) | |||
return self.node_builder(filtered) | |||
def create_rule_handler(expansion, keep_all_tokens, filter_out): | |||
# if not keep_all_tokens: | |||
to_include = [(i, not is_terminal(sym) and sym.startswith('_')) | |||
for i, sym in enumerate(expansion) | |||
if keep_all_tokens | |||
or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out) | |||
] | |||
if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): | |||
return Factory(ChildFilter, to_include) | |||
# else, if no filtering required.. | |||
return identity | |||
class PropagatePositions: | |||
def __init__(self, node_builder): | |||
self.node_builder = node_builder | |||
def __call__(self, children): | |||
res = self.node_builder(children) | |||
if children: | |||
for a in children: | |||
with suppress(AttributeError): | |||
res.line = a.line | |||
res.column = a.column | |||
break | |||
for a in reversed(children): | |||
with suppress(AttributeError): | |||
res.end_line = a.end_line | |||
res.end_col = a.end_col | |||
break | |||
return res | |||
class Callback(object): | |||
pass | |||
class ParseTreeBuilder: | |||
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): | |||
self.tree_class = tree_class | |||
self.propagate_positions = propagate_positions | |||
self.always_keep_all_tokens = keep_all_tokens | |||
self.rule_builders = list(self._init_builders(rules)) | |||
self.user_aliases = {} | |||
def _init_builders(self, rules): | |||
filter_out = set() | |||
for rule in rules: | |||
if rule.options and rule.options.filter_out: | |||
assert rule.origin.startswith('_') # Just to make sure | |||
filter_out.add(rule.origin) | |||
for rule in rules: | |||
options = rule.options | |||
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) | |||
expand1 = options.expand1 if options else False | |||
create_token = options.create_token if options else False | |||
wrapper_chain = filter(None, [ | |||
(expand1 and not rule.alias) and Expand1, | |||
create_token and Factory(TokenWrapper, create_token), | |||
create_rule_handler(rule.expansion, keep_all_tokens, filter_out), | |||
self.propagate_positions and PropagatePositions, | |||
]) | |||
yield rule, wrapper_chain | |||
def create_callback(self, transformer=None): | |||
callback = Callback() | |||
for rule, wrapper_chain in self.rule_builders: | |||
internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) | |||
user_callback_name = rule.alias or rule.origin | |||
try: | |||
f = transformer._get_func(user_callback_name) | |||
except AttributeError: | |||
f = NodeBuilder(self.tree_class, user_callback_name) | |||
self.user_aliases[rule] = rule.alias | |||
rule.alias = internal_callback_name | |||
for w in wrapper_chain: | |||
f = w(f) | |||
if hasattr(callback, internal_callback_name): | |||
raise GrammarError("Rule '%s' already exists" % (rule,)) | |||
setattr(callback, internal_callback_name, f) | |||
return callback | |||
class _Parser: | |||
def __init__(self, parse_table, callbacks): | |||
self.states = parse_table.states | |||
self.start_state = parse_table.start_state | |||
self.end_state = parse_table.end_state | |||
self.callbacks = callbacks | |||
def parse(self, seq, set_state=None): | |||
i = 0 | |||
token = None | |||
stream = iter(seq) | |||
states = self.states | |||
state_stack = [self.start_state] | |||
value_stack = [] | |||
if set_state: set_state(self.start_state) | |||
def get_action(key): | |||
state = state_stack[-1] | |||
try: | |||
return states[state][key] | |||
except KeyError: | |||
expected = states[state].keys() | |||
raise UnexpectedToken(token, expected, seq, i) | |||
def reduce(rule): | |||
size = len(rule.expansion) | |||
if size: | |||
s = value_stack[-size:] | |||
del state_stack[-size:] | |||
del value_stack[-size:] | |||
else: | |||
s = [] | |||
value = self.callbacks[rule](s) | |||
_action, new_state = get_action(rule.origin) | |||
assert _action is Shift | |||
state_stack.append(new_state) | |||
value_stack.append(value) | |||
# Main LALR-parser loop | |||
try: | |||
token = next(stream) | |||
i += 1 | |||
while True: | |||
action, arg = get_action(token.type) | |||
assert arg != self.end_state | |||
if action is Shift: | |||
state_stack.append(arg) | |||
value_stack.append(token) | |||
if set_state: set_state(arg) | |||
token = next(stream) | |||
i += 1 | |||
else: | |||
reduce(arg) | |||
except StopIteration: | |||
pass | |||
while True: | |||
_action, arg = get_action('$END') | |||
if _action is Shift: | |||
assert arg == self.end_state | |||
val ,= value_stack | |||
return val | |||
else: | |||
reduce(arg) | |||
class Rule(object): | |||
""" | |||
origin : a symbol | |||
expansion : a list of symbols | |||
""" | |||
def __init__(self, origin, expansion, alias=None, options=None): | |||
self.origin = origin | |||
self.expansion = expansion | |||
self.alias = alias | |||
self.options = options | |||
def __str__(self): | |||
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||
def __repr__(self): | |||
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) | |||
class RuleOptions: | |||
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||
self.keep_all_tokens = keep_all_tokens | |||
self.expand1 = expand1 | |||
self.create_token = create_token # used for scanless postprocessing | |||
self.priority = priority | |||
self.filter_out = filter_out # remove this rule from the tree | |||
# used for "token"-rules in scanless | |||
def __repr__(self): | |||
return 'RuleOptions(%r, %r, %r, %r, %r)' % ( | |||
self.keep_all_tokens, | |||
self.expand1, | |||
self.create_token, | |||
self.priority, | |||
self.filter_out | |||
) | |||
Shift = 0 | |||
Reduce = 1 | |||
import re | |||
MRES = ( | |||
[('(?P<SIGNED_NUMBER>(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P<ESCAPED_STRING>\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P<WS>(?:[ \t\x0c' | |||
'\r\n' | |||
'])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', | |||
{1: 'SIGNED_NUMBER', | |||
2: 'ESCAPED_STRING', | |||
3: 'WS', | |||
4: '__FALSE1', | |||
5: '__NULL2', | |||
6: '__TRUE0', | |||
7: '__COLON', | |||
8: '__COMMA', | |||
9: '__LBRACE', | |||
10: '__LSQB', | |||
11: '__RBRACE', | |||
12: '__RSQB'})] | |||
) | |||
LEXER_CALLBACK = ( | |||
{} | |||
) | |||
NEWLINE_TYPES = ['WS'] | |||
IGNORE_TYPES = ['WS'] | |||
class LexerRegexps: pass | |||
lexer_regexps = LexerRegexps() | |||
lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] | |||
lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) | |||
for n, mres in LEXER_CALLBACK.items()} | |||
lexer = _Lex(lexer_regexps) | |||
def lex(stream): | |||
return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) | |||
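# Illustration (not part of the generated file): lex() yields Token objects,
# with WS dropped because it is listed in IGNORE_TYPES above:
#
#     for tok in lex('{"size": 10}'):
#         print(tok.type, repr(tok.value))
#     # __LBRACE '{'
#     # ESCAPED_STRING '"size"'
#     # __COLON ':'
#     # SIGNED_NUMBER '10'
#     # __RBRACE '}'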
RULES = { | |||
0: Rule('start', ['value'], None, RuleOptions(False, True, None, None, False)), | |||
1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)), | |||
2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)), | |||
3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)), | |||
4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)), | |||
5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)), | |||
6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)), | |||
7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)), | |||
8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), | |||
9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), | |||
10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), | |||
11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), | |||
12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), | |||
13: Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), | |||
14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)), | |||
15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), | |||
16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None), | |||
17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None), | |||
18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None), | |||
19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None), | |||
} | |||
parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) | |||
class ParseTable: pass | |||
parse_table = ParseTable() | |||
STATES = { | |||
0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)}, | |||
1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)}, | |||
2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)}, | |||
3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)}, | |||
4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)}, | |||
5: {12: (0, 16)}, | |||
6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)}, | |||
7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)}, | |||
8: {12: (1, 0)}, | |||
9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)}, | |||
10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)}, | |||
11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)}, | |||
12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)}, | |||
13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)}, | |||
14: {14: (0, 19), 13: (0, 20), 18: (0, 21)}, | |||
15: {17: (0, 22)}, | |||
16: {}, | |||
17: {19: (0, 23), 15: (0, 24), 13: (0, 25)}, | |||
18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)}, | |||
19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)}, | |||
20: {9: (0, 10), 11: (0, 15), 16: (0, 26)}, | |||
21: {14: (0, 27), 13: (0, 28)}, | |||
22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)}, | |||
23: {15: (0, 30), 13: (0, 31)}, | |||
24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)}, | |||
25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, | |||
26: {13: (1, 18), 14: (1, 18)}, | |||
27: {13: (1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)}, | |||
28: {16: (0, 33), 9: (0, 10), 11: (0, 15)}, | |||
29: {13: (1, 14), 14: (1, 14)}, | |||
30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)}, | |||
31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, | |||
32: {15: (1, 16), 13: (1, 16)}, | |||
33: {13: (1, 19), 14: (1, 19)}, | |||
34: {15: (1, 17), 13: (1, 17)}, | |||
} | |||
TOKEN_TYPES = ( | |||
{0: '__TRUE0', | |||
1: '__LBRACE', | |||
2: 'array', | |||
3: 'object', | |||
4: 'start', | |||
5: '__LSQB', | |||
6: 'SIGNED_NUMBER', | |||
7: 'value', | |||
8: '__NULL2', | |||
9: 'ESCAPED_STRING', | |||
10: '__FALSE1', | |||
11: 'string', | |||
12: '$END', | |||
13: '__COMMA', | |||
14: '__RBRACE', | |||
15: '__RSQB', | |||
16: 'pair', | |||
17: '__COLON', | |||
18: '__anon_star_1', | |||
19: '__anon_star_0'} | |||
) | |||
parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()} | |||
for s, acts in STATES.items()} | |||
parse_table.start_state = 0 | |||
parse_table.end_state = 16 | |||
class Lark_StandAlone: | |||
def __init__(self, transformer=None, postlex=None): | |||
callback = parse_tree_builder.create_callback(transformer=transformer) | |||
callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()} | |||
self.parser = _Parser(parse_table, callbacks) | |||
self.postlex = postlex | |||
def parse(self, stream): | |||
tokens = lex(stream) | |||
if self.postlex: tokens = self.postlex.process(tokens) | |||
return self.parser.parse(tokens) |
@@ -0,0 +1,25 @@ | |||
import sys | |||
from json_parser import Lark_StandAlone, Transformer, inline_args | |||
class TreeToJson(Transformer): | |||
@inline_args | |||
def string(self, s): | |||
return s[1:-1].replace('\\"', '"') | |||
array = list | |||
pair = tuple | |||
object = dict | |||
number = inline_args(float) | |||
null = lambda self, _: None | |||
true = lambda self, _: True | |||
false = lambda self, _: False | |||
parser = Lark_StandAlone(transformer=TreeToJson()) | |||
if __name__ == '__main__': | |||
with open(sys.argv[1]) as f: | |||
print(parser.parse(f.read())) | |||
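Because the transformer is wired in as the parser's callbacks, `parse()` here returns plain Python objects rather than a `Tree`; a small sanity check of the expected behaviour (the input is a made-up snippet):

```python
data = parser.parse('{"size": 10, "tags": ["a", "b"], "ok": true}')
assert data == {'size': 10.0, 'tags': ['a', 'b'], 'ok': True}   # numbers come back as floats
```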
@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError | |||
from .lark import Lark | |||
from .utils import inline_args | |||
__version__ = "0.5.1" | |||
__version__ = "0.5.2" |
@@ -1,16 +1,21 @@ | |||
import re | |||
import sre_parse | |||
import sys | |||
from .utils import get_regexp_width | |||
Py36 = (sys.version_info[:2] >= (3, 6)) | |||
###{standalone | |||
def is_terminal(sym): | |||
return sym.isupper() | |||
class GrammarError(Exception): | |||
pass | |||
class ParseError(Exception): | |||
pass | |||
class UnexpectedToken(ParseError): | |||
def __init__(self, token, expected, seq, index): | |||
self.token = token | |||
@@ -31,9 +36,8 @@ class UnexpectedToken(ParseError): | |||
super(UnexpectedToken, self).__init__(message) | |||
###} | |||
def is_terminal(sym): | |||
return isinstance(sym, Terminal) or sym.isupper() or sym == '$end' | |||
class LexerConf: | |||
@@ -44,7 +48,6 @@ class LexerConf: | |||
class ParserConf: | |||
def __init__(self, rules, callback, start): | |||
assert all(len(r) == 4 for r in rules) | |||
self.rules = rules | |||
self.callback = callback | |||
self.start = start | |||
@@ -93,10 +96,10 @@ class PatternRE(Pattern): | |||
@property | |||
def min_width(self): | |||
return sre_parse.parse(self.to_regexp()).getwidth()[0] | |||
return get_regexp_width(self.to_regexp())[0] | |||
@property | |||
def max_width(self): | |||
return sre_parse.parse(self.to_regexp()).getwidth()[1] | |||
return get_regexp_width(self.to_regexp())[1] | |||
class TokenDef(object): | |||
def __init__(self, name, pattern, priority=1): | |||
@@ -108,27 +111,3 @@ class TokenDef(object): | |||
def __repr__(self): | |||
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | |||
class Terminal: | |||
def __init__(self, data): | |||
self.data = data | |||
def __repr__(self): | |||
return '%r' % self.data | |||
def __eq__(self, other): | |||
return isinstance(other, type(self)) and self.data == other.data | |||
def __hash__(self): | |||
return hash(self.data) | |||
class Terminal_Regexp(Terminal): | |||
def __init__(self, name, regexp): | |||
Terminal.__init__(self, regexp) | |||
self.name = name | |||
self.match = re.compile(regexp).match | |||
class Terminal_Token(Terminal): | |||
def match(self, other): | |||
return self.data == other.type | |||
@@ -0,0 +1,37 @@ | |||
class Rule(object): | |||
""" | |||
origin : a symbol | |||
expansion : a list of symbols | |||
""" | |||
def __init__(self, origin, expansion, alias=None, options=None): | |||
self.origin = origin | |||
self.expansion = expansion | |||
self.alias = alias | |||
self.options = options | |||
def __str__(self): | |||
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||
def __repr__(self): | |||
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) | |||
class RuleOptions: | |||
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||
self.keep_all_tokens = keep_all_tokens | |||
self.expand1 = expand1 | |||
self.create_token = create_token # used for scanless postprocessing | |||
self.priority = priority | |||
self.filter_out = filter_out # remove this rule from the tree | |||
# used for "token"-rules in scanless | |||
def __repr__(self): | |||
return 'RuleOptions(%r, %r, %r, %r, %r)' % ( | |||
self.keep_all_tokens, | |||
self.expand1, | |||
self.create_token, | |||
self.priority, | |||
self.filter_out | |||
) |
@@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT | |||
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | |||
_EXP: ("e"|"E") SIGNED_INT | |||
FLOAT: INT _EXP | DECIMAL _EXP? | |||
SIGNED_FLOAT: ["+"|"-"] FLOAT | |||
NUMBER: FLOAT | INT | |||
SIGNED_NUMBER: ["+"|"-"] NUMBER | |||
@@ -2,6 +2,7 @@ | |||
from .lexer import Token | |||
###{standalone | |||
class Indenter: | |||
def __init__(self): | |||
self.paren_level = 0 | |||
@@ -50,3 +51,5 @@ class Indenter: | |||
@property | |||
def always_accept(self): | |||
return (self.NL_type,) | |||
###} |
@@ -169,13 +169,15 @@ class Lark: | |||
def _build_parser(self): | |||
self.parser_class = get_frontend(self.options.parser, self.options.lexer) | |||
self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) | |||
rules, callback = self.parse_tree_builder.apply(self.options.transformer) | |||
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) | |||
callback = self._parse_tree_builder.create_callback(self.options.transformer) | |||
if self.profiler: | |||
for f in dir(callback): | |||
if not (f.startswith('__') and f.endswith('__')): | |||
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) | |||
parser_conf = ParserConf(rules, callback, self.options.start) | |||
parser_conf = ParserConf(self.rules, callback, self.options.start) | |||
return self.parser_class(self.lexer_conf, parser_conf, options=self.options) | |||
@@ -5,6 +5,7 @@ import re | |||
from .utils import Str, classify | |||
from .common import is_terminal, PatternStr, PatternRE, TokenDef | |||
###{standalone | |||
class LexError(Exception): | |||
pass | |||
@@ -48,27 +49,75 @@ class Token(Str): | |||
__hash__ = Str.__hash__ | |||
class Regex: | |||
def __init__(self, pattern, flags=()): | |||
self.pattern = pattern | |||
self.flags = flags | |||
def _regexp_has_newline(r): | |||
return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | |||
class LineCounter: | |||
def __init__(self): | |||
self.newline_char = '\n' | |||
self.char_pos = 0 | |||
self.line = 1 | |||
self.column = 0 | |||
self.line_start_pos = 0 | |||
def feed(self, token, test_newline=True): | |||
"""Consume a token and calculate the new line & column. | |||
As an optional optimization, set test_newline=False if the token doesn't contain a newline. | |||
""" | |||
if test_newline: | |||
newlines = token.count(self.newline_char) | |||
if newlines: | |||
self.line += newlines | |||
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 | |||
self.char_pos += len(token) | |||
self.column = self.char_pos - self.line_start_pos | |||
class _Lex: | |||
"Built to serve both Lexer and ContextualLexer" | |||
def __init__(self, lexer): | |||
self.lexer = lexer | |||
def lex(self, stream, newline_types, ignore_types): | |||
newline_types = list(newline_types) | |||
ignore_types = list(ignore_types) | |||
line_ctr = LineCounter() | |||
def _create_unless_callback(strs): | |||
mres = build_mres(strs, match_whole=True) | |||
def unless_callback(t): | |||
# if t in strs: | |||
# t.type = strs[t] | |||
for mre, type_from_index in mres: | |||
while True: | |||
lexer = self.lexer | |||
for mre, type_from_index in lexer.mres: | |||
m = mre.match(stream, line_ctr.char_pos) | |||
if m: | |||
value = m.group(0) | |||
type_ = type_from_index[m.lastindex] | |||
if type_ not in ignore_types: | |||
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||
if t.type in lexer.callback: | |||
t = lexer.callback[t.type](t) | |||
yield t | |||
line_ctr.feed(value, type_ in newline_types) | |||
break | |||
else: | |||
if line_ctr.char_pos < len(stream): | |||
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||
break | |||
class UnlessCallback: | |||
def __init__(self, mres): | |||
self.mres = mres | |||
def __call__(self, t): | |||
for mre, type_from_index in self.mres: | |||
m = mre.match(t.value) | |||
if m: | |||
value = m.group(0) | |||
t.type = type_from_index[m.lastindex] | |||
break | |||
return t | |||
return unless_callback | |||
###} | |||
def _create_unless(tokens): | |||
tokens_by_type = classify(tokens, lambda t: type(t.pattern)) | |||
@@ -85,7 +134,7 @@ def _create_unless(tokens): | |||
if strtok.pattern.flags <= retok.pattern.flags: | |||
embedded_strs.add(strtok) | |||
if unless: | |||
callback[retok.name] = _create_unless_callback(unless) | |||
callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) | |||
tokens = [t for t in tokens if t not in embedded_strs] | |||
return tokens, callback | |||
@@ -110,13 +159,13 @@ def _build_mres(tokens, max_size, match_whole): | |||
def build_mres(tokens, match_whole=False): | |||
return _build_mres(tokens, len(tokens), match_whole) | |||
def _regexp_has_newline(r): | |||
return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | |||
class Lexer(object): | |||
class Lexer: | |||
def __init__(self, tokens, ignore=()): | |||
assert all(isinstance(t, TokenDef) for t in tokens), tokens | |||
self.ignore = ignore | |||
self.newline_char = '\n' | |||
tokens = list(tokens) | |||
# Sanitization | |||
@@ -129,14 +178,11 @@ class Lexer(object): | |||
if t.pattern.min_width == 0: | |||
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) | |||
token_names = {t.name for t in tokens} | |||
for t in ignore: | |||
if t not in token_names: | |||
raise LexError("Token '%s' was marked to ignore but it is not defined!" % t) | |||
assert set(ignore) <= {t.name for t in tokens} | |||
# Init | |||
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] | |||
self.ignore_types = [t for t in ignore] | |||
self.ignore_types = list(ignore) | |||
tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) | |||
@@ -147,46 +193,8 @@ class Lexer(object): | |||
self.mres = build_mres(tokens) | |||
def lex(self, stream): | |||
lex_pos = 0 | |||
line = 1 | |||
col_start_pos = 0 | |||
newline_types = list(self.newline_types) | |||
ignore_types = list(self.ignore_types) | |||
while True: | |||
for mre, type_from_index in self.mres: | |||
m = mre.match(stream, lex_pos) | |||
if m: | |||
value = m.group(0) | |||
type_ = type_from_index[m.lastindex] | |||
to_yield = type_ not in ignore_types | |||
if to_yield: | |||
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) | |||
end_col = t.column + len(value) | |||
if t.type in self.callback: | |||
t = self.callback[t.type](t) | |||
if type_ in newline_types: | |||
newlines = value.count(self.newline_char) | |||
if newlines: | |||
line += newlines | |||
last_newline_index = value.rindex(self.newline_char) + 1 | |||
col_start_pos = lex_pos + last_newline_index | |||
end_col = len(value) - last_newline_index | |||
if to_yield: | |||
t.end_line = line | |||
t.end_col = end_col | |||
yield t | |||
lex_pos += len(value) | |||
break | |||
else: | |||
if lex_pos < len(stream): | |||
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) | |||
break | |||
return _Lex(self).lex(stream, self.newline_types, self.ignore_types) | |||
class ContextualLexer: | |||
@@ -204,7 +212,7 @@ class ContextualLexer: | |||
lexer = lexer_by_tokens[key] | |||
except KeyError: | |||
accepts = set(accepts) | set(ignore) | set(always_accept) | |||
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] | |||
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] | |||
lexer = Lexer(state_tokens, ignore=ignore) | |||
lexer_by_tokens[key] = lexer | |||
@@ -218,33 +226,9 @@ class ContextualLexer: | |||
self.parser_state = state | |||
def lex(self, stream): | |||
lex_pos = 0 | |||
line = 1 | |||
col_start_pos = 0 | |||
newline_types = list(self.root_lexer.newline_types) | |||
ignore_types = list(self.root_lexer.ignore_types) | |||
while True: | |||
lexer = self.lexers[self.parser_state] | |||
for mre, type_from_index in lexer.mres: | |||
m = mre.match(stream, lex_pos) | |||
if m: | |||
value = m.group(0) | |||
type_ = type_from_index[m.lastindex] | |||
if type_ not in ignore_types: | |||
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) | |||
if t.type in lexer.callback: | |||
t = lexer.callback[t.type](t) | |||
yield t | |||
l = _Lex(self.lexers[self.parser_state]) | |||
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): | |||
yield x | |||
l.lexer = self.lexers[self.parser_state] | |||
if type_ in newline_types: | |||
newlines = value.count(lexer.newline_char) | |||
if newlines: | |||
line += newlines | |||
col_start_pos = lex_pos + value.rindex(lexer.newline_char) | |||
lex_pos += len(value) | |||
break | |||
else: | |||
if lex_pos < len(stream): | |||
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens) | |||
break | |||
@@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder | |||
from .parser_frontends import LALR | |||
from .parsers.lalr_parser import UnexpectedToken | |||
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef | |||
from .grammar import RuleOptions, Rule | |||
from .tree import Tree as T, Transformer, InlineTransformer, Visitor | |||
@@ -127,7 +128,7 @@ RULES = { | |||
class EBNF_to_BNF(InlineTransformer): | |||
def __init__(self): | |||
self.new_rules = {} | |||
self.new_rules = [] | |||
self.rules_by_expr = {} | |||
self.prefix = 'anon' | |||
self.i = 0 | |||
@@ -140,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer): | |||
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||
self.i += 1 | |||
t = Token('RULE', new_name, -1) | |||
self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options | |||
tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) | |||
self.new_rules.append((new_name, tree, self.rule_options)) | |||
self.rules_by_expr[expr] = t | |||
return t | |||
@@ -174,7 +176,6 @@ class SimplifyRule_Visitor(Visitor): | |||
break | |||
tree.expand_kids_by_index(*to_expand) | |||
def expansion(self, tree): | |||
# rules_list unpacking | |||
# a : b (c|d) e | |||
@@ -194,7 +195,7 @@ class SimplifyRule_Visitor(Visitor): | |||
tree.data = 'expansions' | |||
tree.children = [self.visit(T('expansion', [option if i==j else other | |||
for j, other in enumerate(tree.children)])) | |||
for option in child.children] | |||
for option in set(child.children)] | |||
break | |||
else: | |||
break | |||
@@ -208,7 +209,10 @@ class SimplifyRule_Visitor(Visitor): | |||
tree.data = 'expansions' | |||
tree.children = aliases | |||
expansions = _flatten | |||
def expansions(self, tree): | |||
self._flatten(tree) | |||
tree.children = list(set(tree.children)) | |||
class RuleTreeToText(Transformer): | |||
def expansions(self, x): | |||
@@ -389,12 +393,6 @@ def _interleave(l, item): | |||
def _choice_of_rules(rules): | |||
return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) | |||
def dict_update_safe(d1, d2): | |||
for k, v in d2.items(): | |||
assert k not in d1 | |||
d1[k] = v | |||
class Grammar: | |||
def __init__(self, rule_defs, token_defs, ignore): | |||
self.token_defs = token_defs | |||
@@ -411,6 +409,7 @@ class Grammar: | |||
terms_to_ignore = {name:'__'+name for name in self.ignore} | |||
if terms_to_ignore: | |||
assert set(terms_to_ignore) <= {name for name, _t in term_defs} | |||
term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] | |||
expr = Token('RULE', '__ignore') | |||
for r, tree, _o in rule_defs: | |||
@@ -466,57 +465,41 @@ class Grammar: | |||
# ================= | |||
# Compile Rules | |||
# ================= | |||
ebnf_to_bnf = EBNF_to_BNF() | |||
simplify_rule = SimplifyRule_Visitor() | |||
# 1. Pre-process terminals | |||
transformer = PrepareLiterals() | |||
if not lexer: | |||
transformer *= SplitLiterals() | |||
transformer *= ExtractAnonTokens(tokens) # Adds to tokens | |||
rules = {} | |||
# 2. Convert EBNF to BNF (and apply step 1) | |||
ebnf_to_bnf = EBNF_to_BNF() | |||
rules = [] | |||
for name, rule_tree, options in rule_defs: | |||
assert name not in rules, name | |||
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None | |||
tree = transformer.transform(rule_tree) | |||
rules[name] = ebnf_to_bnf.transform(tree), options | |||
rules.append((name, ebnf_to_bnf.transform(tree), options)) | |||
rules += ebnf_to_bnf.new_rules | |||
dict_update_safe(rules, ebnf_to_bnf.new_rules) | |||
for tree, _o in rules.values(): | |||
simplify_rule.visit(tree) | |||
assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" | |||
# 3. Compile tree to Rule objects | |||
rule_tree_to_text = RuleTreeToText() | |||
rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()} | |||
return tokens, rules, self.ignore | |||
simplify_rule = SimplifyRule_Visitor() | |||
compiled_rules = [] | |||
for name, tree, options in rules: | |||
simplify_rule.visit(tree) | |||
expansions = rule_tree_to_text.transform(tree) | |||
for expansion, alias in expansions: | |||
if alias and name.startswith('_'): | |||
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) | |||
class RuleOptions: | |||
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||
self.keep_all_tokens = keep_all_tokens | |||
self.expand1 = expand1 | |||
self.create_token = create_token # used for scanless postprocessing | |||
self.priority = priority | |||
self.filter_out = filter_out # remove this rule from the tree | |||
# used for "token"-rules in scanless | |||
@classmethod | |||
def from_rule(cls, name, *x): | |||
if len(x) > 1: | |||
priority, expansions = x | |||
priority = int(priority) | |||
else: | |||
expansions ,= x | |||
priority = None | |||
keep_all_tokens = name.startswith('!') | |||
name = name.lstrip('!') | |||
expand1 = name.startswith('?') | |||
name = name.lstrip('?') | |||
rule = Rule(name, expansion, alias, options) | |||
compiled_rules.append(rule) | |||
return name, expansions, cls(keep_all_tokens, expand1, priority=priority) | |||
return tokens, compiled_rules, self.ignore | |||
@@ -553,15 +536,30 @@ def resolve_token_references(token_defs): | |||
if not changed: | |||
break | |||
def options_from_rule(name, *x): | |||
if len(x) > 1: | |||
priority, expansions = x | |||
priority = int(priority) | |||
else: | |||
expansions ,= x | |||
priority = None | |||
keep_all_tokens = name.startswith('!') | |||
name = name.lstrip('!') | |||
expand1 = name.startswith('?') | |||
name = name.lstrip('?') | |||
return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) | |||
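# Illustration (not part of the diff): the '!' and '?' rule-name prefixes map to
# RuleOptions roughly as follows (expansions shown as a placeholder; in the
# loader they are parse trees):
#
#     options_from_rule('?value', expansions)
#     # -> ('value', expansions, RuleOptions(keep_all_tokens=False, expand1=True, priority=None))
#     options_from_rule('!atom', '2', expansions)
#     # -> ('atom', expansions, RuleOptions(keep_all_tokens=True, expand1=False, priority=2))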
class GrammarLoader: | |||
def __init__(self): | |||
tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] | |||
rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()] | |||
d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules} | |||
rules, callback = ParseTreeBuilder(d, T).apply() | |||
rules = [options_from_rule(name, x) for name, x in RULES.items()] | |||
rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] | |||
callback = ParseTreeBuilder(rules, T).create_callback() | |||
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) | |||
parser_conf = ParserConf(rules, callback, 'start') | |||
self.parser = LALR(lexer_conf, parser_conf) | |||
@@ -636,7 +634,6 @@ class GrammarLoader: | |||
ignore_names.append(name) | |||
token_defs.append((name, (t, 0))) | |||
# Verify correctness 2 | |||
token_names = set() | |||
for name, _ in token_defs: | |||
@@ -644,10 +641,13 @@ class GrammarLoader: | |||
raise GrammarError("Token '%s' defined more than once" % name) | |||
token_names.add(name) | |||
if set(ignore_names) > token_names: | |||
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names)) | |||
# Resolve token references | |||
resolve_token_references(token_defs) | |||
rules = [RuleOptions.from_rule(*x) for x in rule_defs] | |||
rules = [options_from_rule(*x) for x in rule_defs] | |||
rule_names = set() | |||
for name, _x, _o in rules: | |||
@@ -1,6 +1,9 @@ | |||
from .common import is_terminal, GrammarError | |||
from .utils import suppress | |||
from .lexer import Token | |||
from .grammar import Rule | |||
###{standalone | |||
class NodeBuilder: | |||
def __init__(self, tree_class, name): | |||
@@ -27,7 +30,7 @@ class Factory: | |||
def __call__(self, node_builder): | |||
return self.cls(node_builder, *self.args) | |||
class TokenWrapper: | |||
"Used for fixing the results of scanless parsing" | |||
@@ -106,51 +109,53 @@ class ParseTreeBuilder: | |||
self.rule_builders = list(self._init_builders(rules)) | |||
self.user_aliases = {} | |||
def _init_builders(self, rules): | |||
filter_out = set() | |||
for origin, (expansions, options) in rules.items(): | |||
if options and options.filter_out: | |||
assert origin.startswith('_') # Just to make sure | |||
filter_out.add(origin) | |||
for rule in rules: | |||
if rule.options and rule.options.filter_out: | |||
assert rule.origin.startswith('_') # Just to make sure | |||
filter_out.add(rule.origin) | |||
for origin, (expansions, options) in rules.items(): | |||
for rule in rules: | |||
options = rule.options | |||
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) | |||
expand1 = options.expand1 if options else False | |||
create_token = options.create_token if options else False | |||
for expansion, alias in expansions: | |||
if alias and origin.startswith('_'): | |||
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) | |||
wrapper_chain = filter(None, [ | |||
(expand1 and not rule.alias) and Expand1, | |||
create_token and Factory(TokenWrapper, create_token), | |||
create_rule_handler(rule.expansion, keep_all_tokens, filter_out), | |||
self.propagate_positions and PropagatePositions, | |||
]) | |||
wrapper_chain = filter(None, [ | |||
(expand1 and not alias) and Expand1, | |||
create_token and Factory(TokenWrapper, create_token), | |||
create_rule_handler(expansion, keep_all_tokens, filter_out), | |||
self.propagate_positions and PropagatePositions, | |||
]) | |||
yield rule, wrapper_chain | |||
yield origin, expansion, options, alias or origin, wrapper_chain | |||
def apply(self, transformer=None): | |||
def create_callback(self, transformer=None): | |||
callback = Callback() | |||
new_rules = [] | |||
for origin, expansion, options, alias, wrapper_chain in self.rule_builders: | |||
callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion)) | |||
for rule, wrapper_chain in self.rule_builders: | |||
internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) | |||
user_callback_name = rule.alias or rule.origin | |||
try: | |||
f = transformer._get_func(alias) | |||
f = transformer._get_func(user_callback_name) | |||
except AttributeError: | |||
f = NodeBuilder(self.tree_class, alias) | |||
f = NodeBuilder(self.tree_class, user_callback_name) | |||
self.user_aliases[rule] = rule.alias | |||
rule.alias = internal_callback_name | |||
for w in wrapper_chain: | |||
f = w(f) | |||
if hasattr(callback, callback_name): | |||
raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) | |||
setattr(callback, callback_name, f) | |||
if hasattr(callback, internal_callback_name): | |||
raise GrammarError("Rule '%s' already exists" % (rule,)) | |||
setattr(callback, internal_callback_name, f) | |||
new_rules.append(( origin, expansion, callback_name, options )) | |||
return callback | |||
return new_rules, callback | |||
###} |
@@ -1,5 +1,5 @@ | |||
import re | |||
import sre_parse | |||
from .utils import get_regexp_width | |||
from .parsers.grammar_analysis import GrammarAnalyzer | |||
from .lexer import Lexer, ContextualLexer, Token | |||
@@ -9,10 +9,16 @@ from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk | |||
from .tree import Tree | |||
class WithLexer: | |||
def __init__(self, lexer_conf): | |||
def init_traditional_lexer(self, lexer_conf): | |||
self.lexer_conf = lexer_conf | |||
self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) | |||
def init_contextual_lexer(self, lexer_conf, parser_conf): | |||
self.lexer_conf = lexer_conf | |||
d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} | |||
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | |||
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) | |||
def lex(self, text): | |||
stream = self.lexer.lex(text) | |||
if self.lexer_conf.postlex: | |||
@@ -23,32 +29,22 @@ class WithLexer: | |||
class LALR(WithLexer): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
WithLexer.__init__(self, lexer_conf) | |||
self.parser_conf = parser_conf | |||
self.parser = lalr_parser.Parser(parser_conf) | |||
self.init_traditional_lexer(lexer_conf) | |||
def parse(self, text): | |||
tokens = self.lex(text) | |||
return self.parser.parse(tokens) | |||
token_stream = self.lex(text) | |||
return self.parser.parse(token_stream) | |||
class LALR_ContextualLexer: | |||
class LALR_ContextualLexer(WithLexer): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
self.lexer_conf = lexer_conf | |||
self.parser_conf = parser_conf | |||
self.parser = lalr_parser.Parser(parser_conf) | |||
d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()} | |||
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | |||
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) | |||
self.init_contextual_lexer(lexer_conf, parser_conf) | |||
def parse(self, text): | |||
tokens = self.lexer.lex(text) | |||
if self.lexer_conf.postlex: | |||
tokens = self.lexer_conf.postlex.process(tokens) | |||
return self.parser.parse(tokens, self.lexer.set_parser_state) | |||
token_stream = self.lex(text) | |||
return self.parser.parse(token_stream, self.lexer.set_parser_state) | |||
def get_ambiguity_resolver(options): | |||
if not options or options.ambiguity == 'resolve': | |||
@@ -60,55 +56,47 @@ def get_ambiguity_resolver(options): | |||
raise ValueError(options) | |||
def tokenize_text(text): | |||
new_text = [] | |||
line = 1 | |||
col_start_pos = 0 | |||
for i, ch in enumerate(text): | |||
if '\n' in ch: | |||
line += ch.count('\n') | |||
col_start_pos = i + ch.rindex('\n') | |||
new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos)) | |||
return new_text | |||
yield Token('CHAR', ch, line=line, column=i - col_start_pos) | |||
class Earley_NoLex: | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
self.token_by_name = {t.name:t for t in lexer_conf.tokens} | |||
self._prepare_match(lexer_conf) | |||
rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | |||
self.parser = earley.Parser(rules, | |||
parser_conf.start, | |||
parser_conf.callback, | |||
self.parser = earley.Parser(parser_conf, self.match, | |||
resolve_ambiguity=get_ambiguity_resolver(options)) | |||
def _prepare_expansion(self, expansion): | |||
for sym in expansion: | |||
if is_terminal(sym): | |||
regexp = self.token_by_name[sym].pattern.to_regexp() | |||
width = sre_parse.parse(regexp).getwidth() | |||
if width != (1,1): | |||
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) | |||
yield Terminal_Regexp(sym, regexp) | |||
else: | |||
yield sym | |||
def match(self, term, text, index=0): | |||
return self.regexps[term].match(text, index) | |||
def _prepare_match(self, lexer_conf): | |||
self.regexps = {} | |||
for t in lexer_conf.tokens: | |||
regexp = t.pattern.to_regexp() | |||
width = get_regexp_width(regexp) | |||
if width != (1,1): | |||
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width))
self.regexps[t.name] = re.compile(regexp) | |||
def parse(self, text): | |||
new_text = tokenize_text(text) | |||
return self.parser.parse(new_text) | |||
token_stream = tokenize_text(text) | |||
return self.parser.parse(token_stream) | |||
class Earley(WithLexer): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
WithLexer.__init__(self, lexer_conf) | |||
rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] | |||
self.init_traditional_lexer(lexer_conf) | |||
self.parser = earley.Parser(rules, | |||
parser_conf.start, | |||
parser_conf.callback, | |||
self.parser = earley.Parser(parser_conf, self.match, | |||
resolve_ambiguity=get_ambiguity_resolver(options)) | |||
def _prepare_expansion(self, expansion): | |||
return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] | |||
def match(self, term, token): | |||
return term == token.type | |||
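# Editorial note (not part of the diff): with the new term_matcher callback, the three
# Earley front-ends differ only in how a terminal is matched. Earley compares token types
# produced by a standard lexer, while Earley_NoLex and XEarley regex-match the raw text
# (see their match() methods), which is what lets earley.Parser and xearley.Parser stay
# lexer-agnostic.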
def parse(self, text): | |||
tokens = self.lex(text) | |||
@@ -119,27 +107,31 @@ class XEarley: | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
self.token_by_name = {t.name:t for t in lexer_conf.tokens} | |||
rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | |||
self._prepare_match(lexer_conf) | |||
ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] | |||
self.parser = xearley.Parser(rules, | |||
parser_conf.start, | |||
parser_conf.callback, | |||
self.parser = xearley.Parser(parser_conf, | |||
self.match, | |||
resolve_ambiguity=get_ambiguity_resolver(options), | |||
ignore=ignore, | |||
ignore=lexer_conf.ignore, | |||
predict_all=options.earley__predict_all | |||
) | |||
def _prepare_expansion(self, expansion): | |||
for sym in expansion: | |||
if is_terminal(sym): | |||
regexp = self.token_by_name[sym].pattern.to_regexp() | |||
width = sre_parse.parse(regexp).getwidth() | |||
assert width | |||
yield Terminal_Regexp(sym, regexp) | |||
def match(self, term, text, index=0): | |||
return self.regexps[term].match(text, index) | |||
def _prepare_match(self, lexer_conf): | |||
self.regexps = {} | |||
for t in lexer_conf.tokens: | |||
regexp = t.pattern.to_regexp() | |||
try: | |||
width = get_regexp_width(regexp)[0] | |||
except ValueError: | |||
raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) | |||
else: | |||
yield sym | |||
if width == 0: | |||
raise ValueError("Dynamic Earley doesn't allow zero-width regexps") | |||
self.regexps[t.name] = re.compile(regexp) | |||
def parse(self, text): | |||
return self.parser.parse(text) | |||
@@ -13,14 +13,11 @@ | |||
# Author: Erez Shinan (2017) | |||
# Email : erezshin@gmail.com | |||
from ..common import ParseError, UnexpectedToken, Terminal | |||
from ..common import ParseError, UnexpectedToken, is_terminal | |||
from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse | |||
from .grammar_analysis import GrammarAnalyzer | |||
class EndToken: | |||
type = '$end' | |||
class Derivation(Tree): | |||
_hash = None | |||
@@ -35,8 +32,6 @@ class Derivation(Tree): | |||
self._hash = Tree.__hash__(self) | |||
return self._hash | |||
END_TOKEN = EndToken() | |||
class Item(object): | |||
"An Earley Item, the atom of the algorithm." | |||
@@ -59,11 +54,8 @@ class Item(object): | |||
new_tree = Derivation(self.rule, self.tree.children + [tree]) | |||
return self.__class__(self.rule, self.ptr+1, self.start, new_tree) | |||
def similar(self, other): | |||
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||
def __eq__(self, other): | |||
return self.similar(other) #and (self.tree == other.tree) | |||
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||
def __hash__(self): | |||
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ | |||
@@ -134,7 +126,7 @@ class Column: | |||
self.completed[item_key] = item | |||
self.to_reduce.append(item) | |||
else: | |||
if isinstance(item.expect, Terminal): | |||
if is_terminal(item.expect): | |||
self.to_scan.append(item) | |||
else: | |||
k = item_key if self.predict_all else item | |||
@@ -151,31 +143,30 @@ class Column: | |||
__nonzero__ = __bool__ # Py2 backwards-compatibility | |||
class Parser: | |||
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): | |||
self.analysis = GrammarAnalyzer(rules, start_symbol) | |||
self.start_symbol = start_symbol | |||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): | |||
self.analysis = GrammarAnalyzer(parser_conf) | |||
self.parser_conf = parser_conf | |||
self.resolve_ambiguity = resolve_ambiguity | |||
self.FIRST = self.analysis.FIRST | |||
self.postprocess = {} | |||
self.predictions = {} | |||
self.FIRST = {} | |||
for rule in self.analysis.rules: | |||
if rule.origin != '$root': # XXX kinda ugly | |||
a = rule.alias | |||
self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) | |||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
for rule in parser_conf.rules: | |||
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] | |||
self.term_matcher = term_matcher | |||
def parse(self, stream, start_symbol=None): | |||
# Define parser functions | |||
start_symbol = start_symbol or self.start_symbol | |||
start_symbol = start_symbol or self.parser_conf.start | |||
_Item = Item | |||
match = self.term_matcher | |||
def predict(nonterm, column): | |||
assert not isinstance(nonterm, Terminal), nonterm | |||
assert not is_terminal(nonterm), nonterm | |||
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
def complete(item): | |||
@@ -195,14 +186,13 @@ class Parser: | |||
for item in to_reduce: | |||
new_items = list(complete(item)) | |||
for new_item in new_items: | |||
if new_item.similar(item): | |||
raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) | |||
if item in new_items: | |||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
column.add(new_items) | |||
def scan(i, token, column): | |||
next_set = Column(i, self.FIRST) | |||
next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token)) | |||
next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) | |||
if not next_set: | |||
expect = {i.expect for i in column.to_scan} | |||
@@ -249,24 +239,3 @@ class ApplyCallbacks(Transformer_NoRecurse): | |||
return callback(children) | |||
else: | |||
return Tree(rule.origin, children) | |||
# RULES = [ | |||
# ('a', ['d']), | |||
# ('d', ['b']), | |||
# ('b', ['C']), | |||
# ('b', ['b', 'C']), | |||
# ('b', ['C', 'b']), | |||
# ] | |||
# p = Parser(RULES, 'a') | |||
# for x in p.parse('CC'): | |||
# print x.pretty() | |||
#--------------- | |||
# RULES = [ | |||
# ('s', ['a', 'a']), | |||
# ('a', ['b', 'b']), | |||
# ('b', ['C'], lambda (x,): x), | |||
# ('b', ['b', 'C']), | |||
# ] | |||
# p = Parser(RULES, 's', {}) | |||
# print p.parse('CCCCC').pretty() |
@@ -1,20 +1,8 @@ | |||
from ..utils import bfs, fzset | |||
from ..common import GrammarError, is_terminal | |||
from ..grammar import Rule | |||
class Rule(object): | |||
""" | |||
origin : a symbol | |||
expansion : a list of symbols | |||
""" | |||
def __init__(self, origin, expansion, alias=None, options=None): | |||
self.origin = origin | |||
self.expansion = expansion | |||
self.alias = alias | |||
self.options = options | |||
def __repr__(self): | |||
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||
class RulePtr(object): | |||
def __init__(self, rule, index): | |||
@@ -106,28 +94,30 @@ def calculate_sets(rules): | |||
class GrammarAnalyzer(object): | |||
def __init__(self, rule_tuples, start_symbol, debug=False): | |||
self.start_symbol = start_symbol | |||
def __init__(self, parser_conf, debug=False): | |||
rules = parser_conf.rules | |||
assert len(rules) == len(set(rules)) | |||
self.start_symbol = parser_conf.start | |||
self.debug = debug | |||
rule_tuples = list(rule_tuples) | |||
rule_tuples.append(('$root', [start_symbol, '$end'])) | |||
rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples] | |||
self.rules = set() | |||
self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples} | |||
for origin, exp, alias, options in rule_tuples: | |||
r = Rule( origin, exp, alias, options ) | |||
self.rules.add(r) | |||
self.rules_by_origin[origin].append(r) | |||
for r in self.rules: | |||
root_rule = Rule('$root', [self.start_symbol, '$END']) | |||
self.rules_by_origin = {r.origin: [] for r in rules} | |||
for r in rules: | |||
self.rules_by_origin[r.origin].append(r) | |||
self.rules_by_origin[root_rule.origin] = [root_rule] | |||
for r in rules: | |||
for sym in r.expansion: | |||
if not (is_terminal(sym) or sym in self.rules_by_origin): | |||
raise GrammarError("Using an undefined rule: %s" % sym) | |||
self.init_state = self.expand_rule('$root') | |||
self.start_state = self.expand_rule('$root') | |||
self.rules = rules | |||
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) | |||
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule]) | |||
def expand_rule(self, rule): | |||
"Returns all init_ptrs accessible by rule (recursive)" | |||
@@ -14,7 +14,43 @@ from ..common import GrammarError, is_terminal | |||
from .grammar_analysis import GrammarAnalyzer | |||
ACTION_SHIFT = 0 | |||
class Action: | |||
def __init__(self, name): | |||
self.name = name | |||
def __str__(self): | |||
return self.name | |||
def __repr__(self): | |||
return str(self) | |||
Shift = Action('Shift') | |||
Reduce = Action('Reduce') | |||
class ParseTable: | |||
def __init__(self, states, start_state, end_state): | |||
self.states = states | |||
self.start_state = start_state | |||
self.end_state = end_state | |||
class IntParseTable(ParseTable): | |||
@classmethod | |||
def from_ParseTable(cls, parse_table): | |||
enum = list(parse_table.states) | |||
state_to_idx = {s:i for i,s in enumerate(enum)} | |||
int_states = {} | |||
for s, la in parse_table.states.items(): | |||
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v | |||
for k,v in la.items()} | |||
int_states[ state_to_idx[s] ] = la | |||
start_state = state_to_idx[parse_table.start_state] | |||
end_state = state_to_idx[parse_table.end_state] | |||
return cls(int_states, start_state, end_state) | |||
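# Editorial sketch (not part of the diff): the states mapping carried by ParseTable has
# the shape {state: {symbol: (Shift, next_state) | (Reduce, rule)}}. A hypothetical entry
# for a tiny grammar could look like:
#
#   parse_table.states[0] == {'NUMBER': (Shift, 3), '$END': (Reduce, <rule for expr>)}
#
# IntParseTable only swaps the frozenset-of-RulePtrs state keys for small integers, which
# is the form the LALR parser and the stand-alone generator consume.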
class LALR_Analyzer(GrammarAnalyzer): | |||
@@ -27,7 +63,7 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) | |||
for rp in sat: | |||
for term in self.FOLLOW.get(rp.rule.origin, ()): | |||
lookahead[term].append(('reduce', rp.rule)) | |||
lookahead[term].append((Reduce, rp.rule)) | |||
d = classify(unsat, lambda rp: rp.next) | |||
for sym, rps in d.items(): | |||
@@ -38,8 +74,8 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
rps |= self.expand_rule(rp.next) | |||
new_state = fzset(rps) | |||
lookahead[sym].append(('shift', new_state)) | |||
if sym == '$end': | |||
lookahead[sym].append((Shift, new_state)) | |||
if sym == '$END': | |||
self.end_states.append( new_state ) | |||
yield fzset(rps) | |||
@@ -50,7 +86,7 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
for x in v: | |||
# XXX resolving shift/reduce conflicts in favor of shift, like PLY does
# TODO: give a proper warning
if x[0] == 'shift': | |||
if x[0] is Shift: | |||
lookahead[k] = [x] | |||
for k, v in lookahead.items(): | |||
@@ -59,22 +95,15 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
self.states[state] = {k:v[0] for k, v in lookahead.items()} | |||
for _ in bfs([self.init_state], step): | |||
for _ in bfs([self.start_state], step): | |||
pass | |||
self.end_state ,= self.end_states | |||
# -- | |||
self.enum = list(self.states) | |||
self.enum_rev = {s:i for i,s in enumerate(self.enum)} | |||
self.states_idx = {} | |||
for s, la in self.states.items(): | |||
la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' | |||
else (v[0], (v[1], len(v[1].expansion))) # Reduce | |||
for k,v in la.items()} | |||
self.states_idx[ self.enum_rev[s] ] = la | |||
self._parse_table = ParseTable(self.states, self.start_state, self.end_state) | |||
if self.debug: | |||
self.parse_table = self._parse_table | |||
else: | |||
self.parse_table = IntParseTable.from_ParseTable(self._parse_table) | |||
self.init_state_idx = self.enum_rev[self.init_state] | |||
self.end_state_idx = self.enum_rev[self.end_state] |
@@ -3,30 +3,30 @@ | |||
# Author: Erez Shinan (2017) | |||
# Email : erezshin@gmail.com | |||
from ..common import ParseError, UnexpectedToken | |||
from ..common import UnexpectedToken | |||
from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT | |||
class FinalReduce: | |||
def __init__(self, value): | |||
self.value = value | |||
from .lalr_analysis import LALR_Analyzer, Shift | |||
class Parser: | |||
def __init__(self, parser_conf): | |||
assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" | |||
self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) | |||
assert all(r.options is None or r.options.priority is None | |||
for r in parser_conf.rules), "LALR doesn't yet support prioritization" | |||
self.analysis = analysis = LALR_Analyzer(parser_conf) | |||
analysis.compute_lookahead() | |||
callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) | |||
for rule in analysis.rules} | |||
self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks) | |||
self.parser_conf = parser_conf | |||
self.parser = _Parser(analysis.parse_table, callbacks) | |||
self.parse = self.parser.parse | |||
###{standalone | |||
class _Parser: | |||
def __init__(self, states, init_state, end_state, callbacks): | |||
self.states = states | |||
self.init_state = init_state | |||
self.end_state = end_state | |||
def __init__(self, parse_table, callbacks): | |||
self.states = parse_table.states | |||
self.start_state = parse_table.start_state | |||
self.end_state = parse_table.end_state | |||
self.callbacks = callbacks | |||
def parse(self, seq, set_state=None): | |||
@@ -35,10 +35,10 @@ class _Parser: | |||
stream = iter(seq) | |||
states = self.states | |||
state_stack = [self.init_state] | |||
state_stack = [self.start_state] | |||
value_stack = [] | |||
if set_state: set_state(self.init_state) | |||
if set_state: set_state(self.start_state) | |||
def get_action(key): | |||
state = state_stack[-1] | |||
@@ -49,7 +49,8 @@ class _Parser: | |||
raise UnexpectedToken(token, expected, seq, i) | |||
def reduce(rule, size): | |||
def reduce(rule): | |||
size = len(rule.expansion) | |||
if size: | |||
s = value_stack[-size:] | |||
del state_stack[-size:] | |||
@@ -60,7 +61,7 @@ class _Parser: | |||
value = self.callbacks[rule](s) | |||
_action, new_state = get_action(rule.origin) | |||
assert _action == ACTION_SHIFT | |||
assert _action is Shift | |||
state_stack.append(new_state) | |||
value_stack.append(value) | |||
@@ -72,22 +73,24 @@ class _Parser: | |||
action, arg = get_action(token.type) | |||
assert arg != self.end_state | |||
if action == ACTION_SHIFT: | |||
if action is Shift: | |||
state_stack.append(arg) | |||
value_stack.append(token) | |||
if set_state: set_state(arg) | |||
token = next(stream) | |||
i += 1 | |||
else: | |||
reduce(*arg) | |||
reduce(arg) | |||
except StopIteration: | |||
pass | |||
while True: | |||
_action, arg = get_action('$end') | |||
if _action == ACTION_SHIFT: | |||
_action, arg = get_action('$END') | |||
if _action is Shift: | |||
assert arg == self.end_state | |||
val ,= value_stack | |||
return val | |||
else: | |||
reduce(*arg) | |||
reduce(arg) | |||
###} |
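# Editorial sketch (not part of the diff): with the ParseTable plumbing above, a caller
# that already has a ParserConf (rules, callback, start symbol) drives LALR parsing
# roughly like this; parser_conf and token_iter are placeholders, and the real wiring is
# the LALR front-end shown earlier in this changeset:
#
#   parser = Parser(parser_conf)      # builds LALR_Analyzer + IntParseTable + callbacks
#   tree = parser.parse(token_iter)   # token_iter from any lexer / post-lexer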
@@ -20,7 +20,7 @@ | |||
from collections import defaultdict | |||
from ..common import ParseError, UnexpectedToken, Terminal | |||
from ..common import ParseError, UnexpectedToken, is_terminal | |||
from ..lexer import Token, UnexpectedInput | |||
from ..tree import Tree | |||
from .grammar_analysis import GrammarAnalyzer | |||
@@ -28,37 +28,34 @@ from .grammar_analysis import GrammarAnalyzer | |||
from .earley import ApplyCallbacks, Item, Column | |||
class Parser: | |||
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): | |||
self.analysis = GrammarAnalyzer(rules, start_symbol) | |||
self.start_symbol = start_symbol | |||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): | |||
self.analysis = GrammarAnalyzer(parser_conf) | |||
self.parser_conf = parser_conf | |||
self.resolve_ambiguity = resolve_ambiguity | |||
self.ignore = list(ignore) | |||
self.predict_all = predict_all | |||
self.FIRST = self.analysis.FIRST | |||
self.postprocess = {} | |||
self.predictions = {} | |||
self.FIRST = {} | |||
for rule in self.analysis.rules: | |||
if rule.origin != '$root': # XXX kinda ugly | |||
a = rule.alias | |||
self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) | |||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
for rule in parser_conf.rules: | |||
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] | |||
self.term_matcher = term_matcher | |||
def parse(self, stream, start_symbol=None): | |||
# Define parser functions | |||
start_symbol = start_symbol or self.start_symbol | |||
start_symbol = start_symbol or self.parser_conf.start | |||
delayed_matches = defaultdict(list) | |||
match = self.term_matcher | |||
text_line = 1 | |||
text_column = 0 | |||
def predict(nonterm, column): | |||
assert not isinstance(nonterm, Terminal), nonterm | |||
assert not is_terminal(nonterm), nonterm | |||
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
def complete(item): | |||
@@ -77,16 +74,15 @@ class Parser: | |||
column.add( predict(nonterm, column) ) | |||
for item in to_reduce: | |||
new_items = list(complete(item)) | |||
for new_item in new_items: | |||
if new_item.similar(item): | |||
raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) | |||
if item in new_items: | |||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
column.add(new_items) | |||
def scan(i, token, column): | |||
to_scan = column.to_scan | |||
for x in self.ignore: | |||
m = x.match(stream, i) | |||
m = match(x, stream, i) | |||
if m: | |||
delayed_matches[m.end()] += set(to_scan) | |||
delayed_matches[m.end()] += set(column.to_reduce) | |||
@@ -99,16 +95,16 @@ class Parser: | |||
# delayed_matches[m.end()] += to_scan | |||
for item in to_scan: | |||
m = item.expect.match(stream, i) | |||
m = match(item.expect, stream, i) | |||
if m: | |||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
t = Token(item.expect, m.group(0), i, text_line, text_column) | |||
delayed_matches[m.end()].append(item.advance(t)) | |||
s = m.group(0) | |||
for j in range(1, len(s)): | |||
m = item.expect.match(s[:-j]) | |||
m = match(item.expect, s[:-j]) | |||
if m: | |||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
t = Token(item.expect, m.group(0), i, text_line, text_column) | |||
delayed_matches[i+m.end()].append(item.advance(t)) | |||
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | |||
@@ -131,7 +127,7 @@ class Parser: | |||
if token == '\n': | |||
text_line += 1 | |||
text_column = 1 | |||
text_column = 0 | |||
else: | |||
text_column += 1 | |||
@@ -143,7 +139,7 @@ class Parser: | |||
if n.rule.origin==start_symbol and n.start is column0] | |||
if not solutions: | |||
expected_tokens = [t.expect.name for t in column.to_scan] | |||
expected_tokens = [t.expect for t in column.to_scan] | |||
raise ParseError('Unexpected end of input! Expecting one of: %s' % expected_tokens)
elif len(solutions) == 1: | |||
@@ -0,0 +1,203 @@ | |||
###{standalone | |||
# | |||
# | |||
# Lark Stand-alone Generator Tool | |||
# ---------------------------------- | |||
# Generates a stand-alone LALR(1) parser with a standard lexer | |||
# | |||
# Git: https://github.com/erezsh/lark | |||
# Author: Erez Shinan (erezshin@gmail.com) | |||
# | |||
# | |||
# >>> LICENSE | |||
# | |||
# This tool and its generated code use a separate license from Lark. | |||
# | |||
# It is licensed under GPLv2 or above. | |||
# | |||
# If you wish to purchase a commercial license for this tool and its | |||
# generated code, contact me via email. | |||
# | |||
# This program is free software: you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation, either version 2 of the License, or | |||
# (at your option) any later version. | |||
# | |||
# This program is distributed in the hope that it will be useful, | |||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
# GNU General Public License for more details. | |||
# | |||
# See <http://www.gnu.org/licenses/>. | |||
# | |||
# | |||
###} | |||
import codecs | |||
import sys | |||
import os | |||
from pprint import pprint | |||
from os import path | |||
from collections import defaultdict | |||
import lark | |||
from lark import Lark | |||
from lark.parsers.lalr_analysis import Shift, Reduce | |||
from ..grammar import Rule | |||
__dir__ = path.dirname(__file__) | |||
__larkdir__ = path.join(__dir__, path.pardir) | |||
EXTRACT_STANDALONE_FILES = [ | |||
'tools/standalone.py', | |||
'utils.py', | |||
'common.py', | |||
'tree.py', | |||
'indenter.py', | |||
'lexer.py', | |||
'parse_tree_builder.py', | |||
'parsers/lalr_parser.py', | |||
] | |||
def extract_sections(lines): | |||
section = None | |||
text = [] | |||
sections = defaultdict(list) | |||
for l in lines: | |||
if l.startswith('###'): | |||
if l[3] == '{': | |||
section = l[4:].strip() | |||
elif l[3] == '}': | |||
sections[section] += text | |||
section = None | |||
text = [] | |||
else: | |||
raise ValueError(l) | |||
elif section: | |||
text.append(l) | |||
return {name:''.join(text) for name, text in sections.items()} | |||
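# Editorial note (not part of the diff): extract_sections collects the text between the
# ###{name and ###} markers that this changeset adds to tree.py, utils.py and the LALR
# parser, returning {section_name: joined_text}; the generator then emits only the
# 'standalone' sections of the files listed in EXTRACT_STANDALONE_FILES.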
class LexerAtoms: | |||
def __init__(self, lexer): | |||
self.mres = [(p.pattern,d) for p,d in lexer.mres] | |||
self.newline_types = lexer.newline_types | |||
self.ignore_types = lexer.ignore_types | |||
self.callback = {name:[(p.pattern,d) for p,d in c.mres] | |||
for name, c in lexer.callback.items()} | |||
def print_python(self): | |||
print('import re') | |||
print('MRES = (') | |||
pprint(self.mres) | |||
print(')') | |||
print('LEXER_CALLBACK = (') | |||
pprint(self.callback) | |||
print(')') | |||
print('NEWLINE_TYPES = %s' % self.newline_types) | |||
print('IGNORE_TYPES = %s' % self.ignore_types) | |||
print('class LexerRegexps: pass') | |||
print('lexer_regexps = LexerRegexps()') | |||
print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') | |||
print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') | |||
print(' for n, mres in LEXER_CALLBACK.items()}') | |||
print('lexer = _Lex(lexer_regexps)') | |||
print('def lex(stream):') | |||
print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') | |||
class GetRule: | |||
def __init__(self, rule_id): | |||
self.rule_id = rule_id | |||
def __repr__(self): | |||
return 'RULES[%d]' % self.rule_id | |||
rule_ids = {} | |||
token_types = {} | |||
def _get_token_type(token_type): | |||
if token_type not in token_types: | |||
token_types[token_type] = len(token_types) | |||
return token_types[token_type] | |||
class ParserAtoms: | |||
def __init__(self, parser): | |||
self.parse_table = parser.analysis.parse_table | |||
def print_python(self): | |||
print('class ParseTable: pass') | |||
print('parse_table = ParseTable()') | |||
print('STATES = {') | |||
for state, actions in self.parse_table.states.items(): | |||
print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg)) | |||
for token, (action, arg) in actions.items()})) | |||
print('}') | |||
print('TOKEN_TYPES = (') | |||
pprint({v:k for k, v in token_types.items()}) | |||
print(')') | |||
print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}') | |||
print(' for s, acts in STATES.items()}') | |||
print('parse_table.start_state = %s' % self.parse_table.start_state) | |||
print('parse_table.end_state = %s' % self.parse_table.end_state) | |||
print('class Lark_StandAlone:') | |||
print(' def __init__(self, transformer=None, postlex=None):') | |||
print(' callback = parse_tree_builder.create_callback(transformer=transformer)') | |||
print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}') | |||
print(' self.parser = _Parser(parse_table, callbacks)') | |||
print(' self.postlex = postlex') | |||
print(' def parse(self, stream):') | |||
print(' tokens = lex(stream)') | |||
print(' if self.postlex: tokens = self.postlex.process(tokens)') | |||
print(' return self.parser.parse(tokens)') | |||
class TreeBuilderAtoms: | |||
def __init__(self, lark): | |||
self.rules = lark.rules | |||
self.ptb = lark._parse_tree_builder | |||
def print_python(self): | |||
print('RULES = {') | |||
for i, r in enumerate(self.rules): | |||
rule_ids[r] = i | |||
print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) | |||
print('}') | |||
print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') | |||
def main(fn, start): | |||
with codecs.open(fn, encoding='utf8') as f: | |||
lark_inst = Lark(f, parser="lalr", start=start) | |||
lexer_atoms = LexerAtoms(lark_inst.parser.lexer) | |||
parser_atoms = ParserAtoms(lark_inst.parser.parser) | |||
tree_builder_atoms = TreeBuilderAtoms(lark_inst) | |||
print('# The file was automatically generated by Lark v%s' % lark.__version__) | |||
for pyfile in EXTRACT_STANDALONE_FILES: | |||
print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) | |||
print(open(os.path.join(__larkdir__, 'grammar.py')).read()) | |||
print('Shift = 0') | |||
print('Reduce = 1') | |||
lexer_atoms.print_python() | |||
tree_builder_atoms.print_python() | |||
parser_atoms.print_python() | |||
if __name__ == '__main__': | |||
if len(sys.argv) < 2: | |||
print("Lark Stand-alone Generator Tool") | |||
print("Usage: python -m lark.tools.standalone <grammar-file> [<start>]") | |||
sys.exit(1) | |||
if len(sys.argv) == 3: | |||
fn, start = sys.argv[1:] | |||
elif len(sys.argv) == 2: | |||
fn, start = sys.argv[1], 'start' | |||
else: | |||
assert False, sys.argv | |||
main(fn, start) |
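# Editorial sketch (not part of the diff): once the emitted source is redirected into a
# module (the name my_parser below is just an example), the generated parser is meant to
# be used roughly like this:
#
#   import my_parser
#   tree = my_parser.Lark_StandAlone().parse(text)
#
# Lark_StandAlone also accepts transformer= and postlex= keyword arguments, mirroring the
# __init__ printed by ParserAtoms.print_python above.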
@@ -7,6 +7,7 @@ from copy import deepcopy | |||
from .utils import inline_args | |||
###{standalone | |||
class Tree(object): | |||
def __init__(self, data, children, rule=None): | |||
self.data = data | |||
@@ -34,6 +35,7 @@ class Tree(object): | |||
def pretty(self, indent_str=' '): | |||
return ''.join(self._pretty(0, indent_str)) | |||
###} | |||
def expand_kids_by_index(self, *indices): | |||
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices | |||
@@ -100,6 +102,7 @@ class Tree(object): | |||
###{standalone | |||
class Transformer(object): | |||
def _get_func(self, name): | |||
return getattr(self, name) | |||
@@ -139,7 +142,7 @@ class TransformerChain(object): | |||
def __mul__(self, other): | |||
return TransformerChain(*self.transformers + (other,)) | |||
class InlineTransformer(Transformer): | |||
@@ -196,6 +199,7 @@ class Transformer_NoRecurse(Transformer): | |||
def __default__(self, t): | |||
return t | |||
###} | |||
def pydot__tree_to_png(tree, filename): | |||
@@ -1,7 +1,4 @@ | |||
import functools | |||
import types | |||
from collections import deque | |||
from contextlib import contextmanager | |||
class fzset(frozenset): | |||
def __repr__(self): | |||
@@ -49,8 +46,13 @@ try: | |||
except NameError: # Python 3 | |||
STRING_TYPE = str | |||
Str = type(u'') | |||
###{standalone | |||
import types | |||
import functools | |||
from contextlib import contextmanager | |||
Str = type(u'') | |||
def inline_args(f): | |||
# print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) | |||
@@ -76,19 +78,6 @@ def inline_args(f): | |||
return _f | |||
try: | |||
compare = cmp | |||
except NameError: | |||
def compare(a, b): | |||
if a == b: | |||
return 0 | |||
elif a > b: | |||
return 1 | |||
else: | |||
return -1 | |||
try: | |||
from contextlib import suppress # Python 3 | |||
except ImportError: | |||
@@ -107,6 +96,26 @@ except ImportError: | |||
except excs: | |||
pass | |||
###} | |||
try: | |||
compare = cmp | |||
except NameError: | |||
def compare(a, b): | |||
if a == b: | |||
return 0 | |||
elif a > b: | |||
return 1 | |||
else: | |||
return -1 | |||
import sre_parse | |||
import sre_constants | |||
def get_regexp_width(regexp): | |||
try: | |||
return sre_parse.parse(regexp).getwidth() | |||
except sre_constants.error: | |||
raise ValueError(regexp) |
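# Editorial sketch (not part of the diff): get_regexp_width exposes sre's (min, max)
# width analysis and normalizes parse failures into ValueError, e.g.
#
#   >>> get_regexp_width('ab?')
#   (1, 2)
#   >>> get_regexp_width('(')    # malformed: the sre error is re-raised as ValueError('(')
#
# The Earley front-ends use the minimum width to reject zero-width terminals, and the
# scanless front-end requires exactly (1, 1).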
@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase): | |||
r = T().transform(g.parse("x")) | |||
self.assertEqual( r.children, ["<b>"] ) | |||
g = Lark("""start: a | |||
?a : b | |||
b : "x" | |||
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase): | |||
r = T().transform(g.parse("xx")) | |||
self.assertEqual( r.children, ["<c>"] ) | |||
g = Lark("""start: a | |||
?a : b b -> c | |||
b : "x" | |||
""", parser='lalr', transformer=T()) | |||
r = g.parse("xx") | |||
self.assertEqual( r.children, ["<c>"] ) | |||
@@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER): | |||
# Fails an Earley implementation without special handling for empty rules, | |||
# or re-processing of already completed rules. | |||
g = Lark(r"""start: B | |||
B: ("ab"|/[^b]/)* | |||
B: ("ab"|/[^b]/)+ | |||
""", lexer=LEXER) | |||
self.assertEqual( g.parse('abc').children[0], 'abc') | |||
@@ -796,6 +796,49 @@ def _make_parser_test(LEXER, PARSER): | |||
self.assertEqual(tree.children, ['a', 'A']) | |||
def test_twice_empty(self): | |||
g = """!start: [["A"]] | |||
""" | |||
l = _Lark(g) | |||
tree = l.parse('A') | |||
self.assertEqual(tree.children, ['A']) | |||
tree = l.parse('') | |||
self.assertEqual(tree.children, []) | |||
def test_undefined_ignore(self): | |||
g = """!start: "A" | |||
%ignore B | |||
""" | |||
self.assertRaises( GrammarError, _Lark, g) | |||
@unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO | |||
def test_line_and_column(self): | |||
g = r"""!start: "A" bc "D" | |||
!bc: "B\nC" | |||
""" | |||
l = _Lark(g) | |||
a, bc, d = l.parse("AB\nCD").children | |||
self.assertEqual(a.line, 1) | |||
self.assertEqual(a.column, 0) | |||
bc ,= bc.children | |||
self.assertEqual(bc.line, 1) | |||
self.assertEqual(bc.column, 1) | |||
self.assertEqual(d.line, 2) | |||
self.assertEqual(d.column, 1) | |||
# self.assertEqual(a.end_line, 1) | |||
# self.assertEqual(a.end_col, 1) | |||
# self.assertEqual(bc.end_line, 2) | |||
# self.assertEqual(bc.end_col, 1) | |||
# self.assertEqual(d.end_line, 2) | |||
# self.assertEqual(d.end_col, 2) | |||
def test_reduce_cycle(self): | |||
"""Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. | |||
It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. | |||
@@ -969,7 +1012,7 @@ def _make_parser_test(LEXER, PARSER): | |||
parser = _Lark(grammar) | |||
tree = parser.parse("int 1 ! This is a comment\n") | |||
tree = parser.parse("int 1 ! This is a comment\n") | |||
self.assertEqual(tree.children, ['1']) | |||
tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! | |||
@@ -983,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER): | |||
self.assertEqual(tree.children, []) | |||
@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") | |||
def test_regex_escaping(self): | |||
g = _Lark("start: /[ab]/") | |||