
Merge branch 'master' into master

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.3
ehudt 6 years ago
committed by GitHub
parent commit c1166695b7
24 changed files with 1538 additions and 445 deletions
  1. +15 -13 README.md
  2. +1 -0 examples/standalone/create_standalone.sh
  3. +21 -0 examples/standalone/json.g
  4. +794 -0 examples/standalone/json_parser.py
  5. +25 -0 examples/standalone/json_parser_main.py
  6. +1 -1 lark/__init__.py
  7. +10 -31 lark/common.py
  8. +37 -0 lark/grammar.py
  9. +1 -0 lark/grammars/common.g
  10. +3 -0 lark/indenter.py
  11. +5 -3 lark/lark.py
  12. +74 -90 lark/lexer.py
  13. +51 -51 lark/load_grammar.py
  14. +33 -28 lark/parse_tree_builder.py
  15. +55 -63 lark/parser_frontends.py
  16. +17 -48 lark/parsers/earley.py
  17. +19 -29 lark/parsers/grammar_analysis.py
  18. +47 -18 lark/parsers/lalr_analysis.py
  19. +25 -22 lark/parsers/lalr_parser.py
  20. +21 -25 lark/parsers/xearley.py
  21. +203 -0 lark/tools/standalone.py
  22. +5 -1 lark/tree.py
  23. +26 -17 lark/utils.py
  24. +49 -5 tests/test_parser.py

+15 -13 README.md

@@ -12,6 +12,7 @@ Lark can:
- Build a parse-tree automagically, no construction code required
- Outperform all other Python libraries when using LALR(1) (Yes, including PLY)
- Run on every Python interpreter (it's pure-python)
- Generate a stand-alone parser (for LALR(1) grammars)


And many more features. Read ahead and find out.


@@ -66,10 +67,11 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)


- Builds a parse-tree (AST) automagically, based on the structure of the grammar
- **Earley** parser
- Can parse *ALL* context-free grammars
- Full support for ambiguity in grammar
- Can parse all context-free grammars
- Full support for ambiguous grammars
- **LALR(1)** parser
- Competitive with PLY
- Fast and light, competitive with PLY
- Can generate a stand-alone parser
- **EBNF** grammar
- **Unicode** fully supported
- **Python 2 & 3** compatible
@@ -86,7 +88,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/


#### Performance comparison


Lower is better!
Lark is the fastest and lightest (lower is better)


![Run-time Comparison](docs/comparison_runtime.png)


@@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail


#### Feature comparison


| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG?
|:--------|:----------|:----|:--------|:------------|:------------
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! |
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No |
| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* |
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* |
| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No |
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* |
| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone
|:--------|:----------|:----|:--------|:------------|:------------|:----------|:----------
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) |
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No |
| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No |
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No |
| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No |
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No |




(\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*)
(\* *PEGs cannot handle non-deterministic grammars. Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*)




### Projects using Lark


+1 -0 examples/standalone/create_standalone.sh

@@ -0,0 +1 @@
python -m lark.tools.standalone json.g > json_parser.py

+21 -0 examples/standalone/json.g

@@ -0,0 +1,21 @@
?start: value

?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value

string : ESCAPED_STRING

%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
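For reference, the same grammar also works with the regular (non-standalone) Lark API; a minimal sketch, assuming lark is installed and json.g is in the working directory (not part of the commit):

from lark import Lark

json_parser = Lark(open('json.g').read(), parser='lalr')   # load the grammar at runtime
tree = json_parser.parse('{"key": ["item0", 3.14, true]}')
print(tree.pretty())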

+794 -0 examples/standalone/json_parser.py

@@ -0,0 +1,794 @@
# The file was automatically generated by Lark v0.5.2
#
#
# Lark Stand-alone Generator Tool
# ----------------------------------
# Generates a stand-alone LALR(1) parser with a standard lexer
#
# Git: https://github.com/erezsh/lark
# Author: Erez Shinan (erezshin@gmail.com)
#
#
# >>> LICENSE
#
# This tool and its generated code use a separate license from Lark.
#
# It is licensed under GPLv2 or above.
#
# If you wish to purchase a commercial license for this tool and its
# generated code, contact me via email.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# See <http://www.gnu.org/licenses/>.
#
#


import types
import functools
from contextlib import contextmanager

Str = type(u'')

def inline_args(f):
# print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType)
if isinstance(f, types.FunctionType):
@functools.wraps(f)
def _f_func(self, args):
return f(self, *args)
return _f_func
elif isinstance(f, (type, types.BuiltinFunctionType)):
@functools.wraps(f)
def _f_builtin(_self, args):
return f(*args)
return _f_builtin
elif isinstance(f, types.MethodType):
@functools.wraps(f.__func__)
def _f(self, args):
return f.__func__(self, *args)
return _f
else:
@functools.wraps(f.__call__.__func__)
def _f(self, args):
return f.__call__.__func__(self, *args)
return _f


try:
from contextlib import suppress # Python 3
except ImportError:
@contextmanager
def suppress(*excs):
'''Catch and dismiss the provided exception

>>> x = 'hello'
>>> with suppress(IndexError):
... x = x[10]
>>> x
'hello'
'''
try:
yield
except excs:
pass


def is_terminal(sym):
return sym.isupper()

class GrammarError(Exception):
pass

class ParseError(Exception):
pass

class UnexpectedToken(ParseError):
def __init__(self, token, expected, seq, index):
self.token = token
self.expected = expected
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')

try:
context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
except AttributeError:
context = seq[index:index+5]
except TypeError:
context = "<no context>"
message = ("Unexpected token %r at line %s, column %s.\n"
"Expected: %s\n"
"Context: %s" % (token, self.line, self.column, expected, context))

super(UnexpectedToken, self).__init__(message)



class Tree(object):
def __init__(self, data, children):
self.data = data
self.children = list(children)

def __repr__(self):
return 'Tree(%s, %s)' % (self.data, self.children)

def _pretty_label(self):
return self.data

def _pretty(self, level, indent_str):
if len(self.children) == 1 and not isinstance(self.children[0], Tree):
return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n']

l = [ indent_str*level, self._pretty_label(), '\n' ]
for n in self.children:
if isinstance(n, Tree):
l += n._pretty(level+1, indent_str)
else:
l += [ indent_str*(level+1), '%s' % n, '\n' ]

return l

def pretty(self, indent_str=' '):
return ''.join(self._pretty(0, indent_str))
class Transformer(object):
def _get_func(self, name):
return getattr(self, name)

def transform(self, tree):
items = []
for c in tree.children:
try:
items.append(self.transform(c) if isinstance(c, Tree) else c)
except Discard:
pass
try:
f = self._get_func(tree.data)
except AttributeError:
return self.__default__(tree.data, items)
else:
return f(items)

def __default__(self, data, children):
return Tree(data, children)

def __mul__(self, other):
return TransformerChain(self, other)


class Discard(Exception):
pass

class TransformerChain(object):
def __init__(self, *transformers):
self.transformers = transformers

def transform(self, tree):
for t in self.transformers:
tree = t.transform(tree)
return tree

def __mul__(self, other):
return TransformerChain(*self.transformers + (other,))



class InlineTransformer(Transformer):
def _get_func(self, name): # use super()._get_func
return inline_args(getattr(self, name)).__get__(self)


class Visitor(object):
def visit(self, tree):
for child in tree.children:
if isinstance(child, Tree):
self.visit(child)

f = getattr(self, tree.data, self.__default__)
f(tree)
return tree

def __default__(self, tree):
pass


class Visitor_NoRecurse(Visitor):
def visit(self, tree):
subtrees = list(tree.iter_subtrees())

for subtree in (subtrees):
getattr(self, subtree.data, self.__default__)(subtree)
return tree


class Transformer_NoRecurse(Transformer):
def transform(self, tree):
subtrees = list(tree.iter_subtrees())

def _t(t):
# Assumes t is already transformed
try:
f = self._get_func(t.data)
except AttributeError:
return self.__default__(t)
else:
return f(t)

for subtree in subtrees:
children = []
for c in subtree.children:
try:
children.append(_t(c) if isinstance(c, Tree) else c)
except Discard:
pass
subtree.children = children

return _t(tree)

def __default__(self, t):
return t

class Indenter:
def __init__(self):
self.paren_level = 0
self.indent_level = [0]

def handle_NL(self, token):
if self.paren_level > 0:
return

yield token

indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len

if indent > self.indent_level[-1]:
self.indent_level.append(indent)
yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
else:
while indent < self.indent_level[-1]:
self.indent_level.pop()
yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)

assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])

def process(self, stream):
for token in stream:
if token.type == self.NL_type:
for t in self.handle_NL(token):
yield t
else:
yield token

if token.type in self.OPEN_PAREN_types:
self.paren_level += 1
elif token.type in self.CLOSE_PAREN_types:
self.paren_level -= 1
assert self.paren_level >= 0

while len(self.indent_level) > 1:
self.indent_level.pop()
yield Token(self.DEDENT_type, '')

assert self.indent_level == [0], self.indent_level

# XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
@property
def always_accept(self):
return (self.NL_type,)


class LexError(Exception):
pass

class UnexpectedInput(LexError):
def __init__(self, seq, lex_pos, line, column, allowed=None):
context = seq[lex_pos:lex_pos+5]
message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)

super(UnexpectedInput, self).__init__(message)

self.line = line
self.column = column
self.context = context
self.allowed = allowed

class Token(Str):
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
inst = Str.__new__(cls, value)
inst.type = type_
inst.pos_in_stream = pos_in_stream
inst.value = value
inst.line = line
inst.column = column
return inst

@classmethod
def new_borrow_pos(cls, type_, value, borrow_t):
return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

def __repr__(self):
return 'Token(%s, %r)' % (self.type, self.value)

def __deepcopy__(self, memo):
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

def __eq__(self, other):
if isinstance(other, Token) and self.type != other.type:
return False

return Str.__eq__(self, other)

__hash__ = Str.__hash__


class LineCounter:
def __init__(self):
self.newline_char = '\n'
self.char_pos = 0
self.line = 1
self.column = 0
self.line_start_pos = 0

def feed(self, token, test_newline=True):
"""Consume a token and calculate the new line & column.

As an optional optimization, set test_newline=False if the token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
if newlines:
self.line += newlines
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos

class _Lex:
"Built to serve both Lexer and ContextualLexer"
def __init__(self, lexer):
self.lexer = lexer

def lex(self, stream, newline_types, ignore_types):
newline_types = list(newline_types)
ignore_types = list(ignore_types)
line_ctr = LineCounter()

while True:
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, line_ctr.char_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
lexer = yield t

line_ctr.feed(value, type_ in newline_types)
break
else:
if line_ctr.char_pos < len(stream):
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
break

class UnlessCallback:
def __init__(self, mres):
self.mres = mres

def __call__(self, t):
for mre, type_from_index in self.mres:
m = mre.match(t.value)
if m:
value = m.group(0)
t.type = type_from_index[m.lastindex]
break
return t



class NodeBuilder:
def __init__(self, tree_class, name):
self.tree_class = tree_class
self.name = name

def __call__(self, children):
return self.tree_class(self.name, children)

class Expand1:
def __init__(self, node_builder):
self.node_builder = node_builder

def __call__(self, children):
if len(children) == 1:
return children[0]
else:
return self.node_builder(children)

class Factory:
def __init__(self, cls, *args):
self.cls = cls
self.args = args

def __call__(self, node_builder):
return self.cls(node_builder, *self.args)


class TokenWrapper:
"Used for fixing the results of scanless parsing"

def __init__(self, node_builder, token_name):
self.node_builder = node_builder
self.token_name = token_name

def __call__(self, children):
return self.node_builder( [Token(self.token_name, ''.join(children))] )

def identity(node_builder):
return node_builder


class ChildFilter:
def __init__(self, node_builder, to_include):
self.node_builder = node_builder
self.to_include = to_include

def __call__(self, children):
filtered = []
for i, to_expand in self.to_include:
if to_expand:
filtered += children[i].children
else:
filtered.append(children[i])

return self.node_builder(filtered)

def create_rule_handler(expansion, keep_all_tokens, filter_out):
# if not keep_all_tokens:
to_include = [(i, not is_terminal(sym) and sym.startswith('_'))
for i, sym in enumerate(expansion)
if keep_all_tokens
or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out)
]

if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
return Factory(ChildFilter, to_include)

# else, if no filtering required..
return identity

class PropagatePositions:
def __init__(self, node_builder):
self.node_builder = node_builder

def __call__(self, children):
res = self.node_builder(children)

if children:
for a in children:
with suppress(AttributeError):
res.line = a.line
res.column = a.column
break

for a in reversed(children):
with suppress(AttributeError):
res.end_line = a.end_line
res.end_col = a.end_col
break

return res


class Callback(object):
pass

class ParseTreeBuilder:
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False):
self.tree_class = tree_class
self.propagate_positions = propagate_positions
self.always_keep_all_tokens = keep_all_tokens

self.rule_builders = list(self._init_builders(rules))

self.user_aliases = {}

def _init_builders(self, rules):
filter_out = set()
for rule in rules:
if rule.options and rule.options.filter_out:
assert rule.origin.startswith('_') # Just to make sure
filter_out.add(rule.origin)

for rule in rules:
options = rule.options
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
expand1 = options.expand1 if options else False
create_token = options.create_token if options else False

wrapper_chain = filter(None, [
(expand1 and not rule.alias) and Expand1,
create_token and Factory(TokenWrapper, create_token),
create_rule_handler(rule.expansion, keep_all_tokens, filter_out),
self.propagate_positions and PropagatePositions,
])

yield rule, wrapper_chain


def create_callback(self, transformer=None):
callback = Callback()

for rule, wrapper_chain in self.rule_builders:
internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion))

user_callback_name = rule.alias or rule.origin
try:
f = transformer._get_func(user_callback_name)
except AttributeError:
f = NodeBuilder(self.tree_class, user_callback_name)

self.user_aliases[rule] = rule.alias
rule.alias = internal_callback_name

for w in wrapper_chain:
f = w(f)

if hasattr(callback, internal_callback_name):
raise GrammarError("Rule '%s' already exists" % (rule,))
setattr(callback, internal_callback_name, f)

return callback



class _Parser:
def __init__(self, parse_table, callbacks):
self.states = parse_table.states
self.start_state = parse_table.start_state
self.end_state = parse_table.end_state
self.callbacks = callbacks

def parse(self, seq, set_state=None):
i = 0
token = None
stream = iter(seq)
states = self.states

state_stack = [self.start_state]
value_stack = []

if set_state: set_state(self.start_state)

def get_action(key):
state = state_stack[-1]
try:
return states[state][key]
except KeyError:
expected = states[state].keys()

raise UnexpectedToken(token, expected, seq, i)

def reduce(rule):
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []

value = self.callbacks[rule](s)

_action, new_state = get_action(rule.origin)
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)

# Main LALR-parser loop
try:
token = next(stream)
i += 1
while True:
action, arg = get_action(token.type)
assert arg != self.end_state

if action is Shift:
state_stack.append(arg)
value_stack.append(token)
if set_state: set_state(arg)
token = next(stream)
i += 1
else:
reduce(arg)
except StopIteration:
pass

while True:
_action, arg = get_action('$END')
if _action is Shift:
assert arg == self.end_state
val ,= value_stack
return val
else:
reduce(arg)



class Rule(object):
"""
origin : a symbol
expansion : a list of symbols
"""
def __init__(self, origin, expansion, alias=None, options=None):
self.origin = origin
self.expansion = expansion
self.alias = alias
self.options = options

def __str__(self):
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))

def __repr__(self):
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)


class RuleOptions:
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.create_token = create_token # used for scanless postprocessing
self.priority = priority

self.filter_out = filter_out # remove this rule from the tree
# used for "token"-rules in scanless

def __repr__(self):
return 'RuleOptions(%r, %r, %r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.create_token,
self.priority,
self.filter_out
)

Shift = 0
Reduce = 1
import re
MRES = (
[('(?P<SIGNED_NUMBER>(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P<ESCAPED_STRING>\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P<WS>(?:[ \t\x0c'
'\r\n'
'])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])',
{1: 'SIGNED_NUMBER',
2: 'ESCAPED_STRING',
3: 'WS',
4: '__FALSE1',
5: '__NULL2',
6: '__TRUE0',
7: '__COLON',
8: '__COMMA',
9: '__LBRACE',
10: '__LSQB',
11: '__RBRACE',
12: '__RSQB'})]
)
LEXER_CALLBACK = (
{}
)
NEWLINE_TYPES = ['WS']
IGNORE_TYPES = ['WS']
class LexerRegexps: pass
lexer_regexps = LexerRegexps()
lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]
lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])
for n, mres in LEXER_CALLBACK.items()}
lexer = _Lex(lexer_regexps)
def lex(stream):
return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)
RULES = {
0: Rule('start', ['value'], None, RuleOptions(False, True, None, None, False)),
1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)),
2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)),
3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)),
4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)),
5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)),
6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)),
7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)),
8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
13: Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)),
15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)),
16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None),
17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None),
18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None),
19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None),
}
parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)
class ParseTable: pass
parse_table = ParseTable()
STATES = {
0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)},
1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)},
2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)},
3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)},
4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)},
5: {12: (0, 16)},
6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)},
7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)},
8: {12: (1, 0)},
9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)},
10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)},
11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)},
12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)},
13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)},
14: {14: (0, 19), 13: (0, 20), 18: (0, 21)},
15: {17: (0, 22)},
16: {},
17: {19: (0, 23), 15: (0, 24), 13: (0, 25)},
18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)},
19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)},
20: {9: (0, 10), 11: (0, 15), 16: (0, 26)},
21: {14: (0, 27), 13: (0, 28)},
22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)},
23: {15: (0, 30), 13: (0, 31)},
24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)},
25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)},
26: {13: (1, 18), 14: (1, 18)},
27: {13: (1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)},
28: {16: (0, 33), 9: (0, 10), 11: (0, 15)},
29: {13: (1, 14), 14: (1, 14)},
30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)},
31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)},
32: {15: (1, 16), 13: (1, 16)},
33: {13: (1, 19), 14: (1, 19)},
34: {15: (1, 17), 13: (1, 17)},
}
TOKEN_TYPES = (
{0: '__TRUE0',
1: '__LBRACE',
2: 'array',
3: 'object',
4: 'start',
5: '__LSQB',
6: 'SIGNED_NUMBER',
7: 'value',
8: '__NULL2',
9: 'ESCAPED_STRING',
10: '__FALSE1',
11: 'string',
12: '$END',
13: '__COMMA',
14: '__RBRACE',
15: '__RSQB',
16: 'pair',
17: '__COLON',
18: '__anon_star_1',
19: '__anon_star_0'}
)
parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}
for s, acts in STATES.items()}
parse_table.start_state = 0
parse_table.end_state = 16
class Lark_StandAlone:
def __init__(self, transformer=None, postlex=None):
callback = parse_tree_builder.create_callback(transformer=transformer)
callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}
self.parser = _Parser(parse_table, callbacks)
self.postlex = postlex
def parse(self, stream):
tokens = lex(stream)
if self.postlex: tokens = self.postlex.process(tokens)
return self.parser.parse(tokens)

+25 -0 examples/standalone/json_parser_main.py

@@ -0,0 +1,25 @@
import sys

from json_parser import Lark_StandAlone, Transformer, inline_args

class TreeToJson(Transformer):
@inline_args
def string(self, s):
return s[1:-1].replace('\\"', '"')

array = list
pair = tuple
object = dict
number = inline_args(float)

null = lambda self, _: None
true = lambda self, _: True
false = lambda self, _: False


parser = Lark_StandAlone(transformer=TreeToJson())

if __name__ == '__main__':
with open(sys.argv[1]) as f:
print(parser.parse(f.read()))
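A quick check of the generated module without a file argument; illustrative only, assuming json_parser.py from above is importable:

from json_parser import Lark_StandAlone

parser = Lark_StandAlone()               # no transformer: parse() returns a Tree
tree = parser.parse('{"answer": 42}')
print(tree.pretty())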


+1 -1 lark/__init__.py

@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError
from .lark import Lark
from .utils import inline_args


__version__ = "0.5.1"
__version__ = "0.5.2"

+10 -31 lark/common.py

@@ -1,16 +1,21 @@
import re
import sre_parse
import sys


from .utils import get_regexp_width

Py36 = (sys.version_info[:2] >= (3, 6))



###{standalone
def is_terminal(sym):
return sym.isupper()

class GrammarError(Exception):
pass


class ParseError(Exception):
pass



class UnexpectedToken(ParseError):
def __init__(self, token, expected, seq, index):
self.token = token
@@ -31,9 +36,8 @@ class UnexpectedToken(ParseError):
super(UnexpectedToken, self).__init__(message)




###}


def is_terminal(sym):
return isinstance(sym, Terminal) or sym.isupper() or sym == '$end'




class LexerConf:
@@ -44,7 +48,6 @@ class LexerConf:


class ParserConf:
def __init__(self, rules, callback, start):
assert all(len(r) == 4 for r in rules)
self.rules = rules
self.callback = callback
self.start = start
@@ -93,10 +96,10 @@ class PatternRE(Pattern):


@property
def min_width(self):
return sre_parse.parse(self.to_regexp()).getwidth()[0]
return get_regexp_width(self.to_regexp())[0]
@property
def max_width(self):
return sre_parse.parse(self.to_regexp()).getwidth()[1]
return get_regexp_width(self.to_regexp())[1]


class TokenDef(object):
def __init__(self, name, pattern, priority=1):
@@ -108,27 +111,3 @@ class TokenDef(object):
def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)



class Terminal:
def __init__(self, data):
self.data = data

def __repr__(self):
return '%r' % self.data

def __eq__(self, other):
return isinstance(other, type(self)) and self.data == other.data
def __hash__(self):
return hash(self.data)


class Terminal_Regexp(Terminal):
def __init__(self, name, regexp):
Terminal.__init__(self, regexp)
self.name = name
self.match = re.compile(regexp).match

class Terminal_Token(Terminal):
def match(self, other):
return self.data == other.type
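get_regexp_width comes from lark/utils.py (also changed in this commit, but not shown in this section); presumably it is a thin wrapper around the sre_parse call that min_width/max_width used to make directly, roughly:

import sre_parse

def get_regexp_width(regexp):
    # returns (min_width, max_width) as reported by sre_parse
    return sre_parse.parse(regexp).getwidth()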


+37 -0 lark/grammar.py

@@ -0,0 +1,37 @@

class Rule(object):
"""
origin : a symbol
expansion : a list of symbols
"""
def __init__(self, origin, expansion, alias=None, options=None):
self.origin = origin
self.expansion = expansion
self.alias = alias
self.options = options

def __str__(self):
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))

def __repr__(self):
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)


class RuleOptions:
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.create_token = create_token # used for scanless postprocessing
self.priority = priority

self.filter_out = filter_out # remove this rule from the tree
# used for "token"-rules in scanless

def __repr__(self):
return 'RuleOptions(%r, %r, %r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.create_token,
self.priority,
self.filter_out
)

+1 -0 lark/grammars/common.g

@@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] INT


NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER


+3 -0 lark/indenter.py

@@ -2,6 +2,7 @@


from .lexer import Token


###{standalone
class Indenter:
def __init__(self):
self.paren_level = 0
@@ -50,3 +51,5 @@ class Indenter:
@property
def always_accept(self):
return (self.NL_type,)

###}
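The Indenter above (now wrapped in ###{standalone markers) expects a handful of attributes on a subclass: NL_type, OPEN_PAREN_types, CLOSE_PAREN_types, INDENT_type, DEDENT_type, tab_len. A hypothetical subclass for a Python-like grammar, wired in as a postlex processor, would look roughly like this (token names are illustrative):

class PythonIndenter(Indenter):
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8

# then e.g.: Lark(grammar, parser='lalr', postlex=PythonIndenter())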

+5 -3 lark/lark.py

@@ -169,13 +169,15 @@ class Lark:


def _build_parser(self):
self.parser_class = get_frontend(self.options.parser, self.options.lexer)
self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
rules, callback = self.parse_tree_builder.apply(self.options.transformer)

self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
callback = self._parse_tree_builder.create_callback(self.options.transformer)
if self.profiler:
for f in dir(callback):
if not (f.startswith('__') and f.endswith('__')):
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
parser_conf = ParserConf(rules, callback, self.options.start)

parser_conf = ParserConf(self.rules, callback, self.options.start)


return self.parser_class(self.lexer_conf, parser_conf, options=self.options)




+74 -90 lark/lexer.py

@@ -5,6 +5,7 @@ import re
from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef


###{standalone
class LexError(Exception):
pass


@@ -48,27 +49,75 @@ class Token(Str):


__hash__ = Str.__hash__


class Regex:
def __init__(self, pattern, flags=()):
self.pattern = pattern
self.flags = flags


def _regexp_has_newline(r):
return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
class LineCounter:
def __init__(self):
self.newline_char = '\n'
self.char_pos = 0
self.line = 1
self.column = 0
self.line_start_pos = 0

def feed(self, token, test_newline=True):
"""Consume a token and calculate the new line & column.

As an optional optimization, set test_newline=False if the token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
if newlines:
self.line += newlines
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos

class _Lex:
"Built to serve both Lexer and ContextualLexer"
def __init__(self, lexer):
self.lexer = lexer

def lex(self, stream, newline_types, ignore_types):
newline_types = list(newline_types)
ignore_types = list(ignore_types)
line_ctr = LineCounter()


def _create_unless_callback(strs):
mres = build_mres(strs, match_whole=True)
def unless_callback(t):
# if t in strs:
# t.type = strs[t]
for mre, type_from_index in mres:
while True:
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, line_ctr.char_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
yield t

line_ctr.feed(value, type_ in newline_types)
break
else:
if line_ctr.char_pos < len(stream):
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
break

class UnlessCallback:
def __init__(self, mres):
self.mres = mres

def __call__(self, t):
for mre, type_from_index in self.mres:
m = mre.match(t.value)
if m:
value = m.group(0)
t.type = type_from_index[m.lastindex]
break
return t
return unless_callback

###}




def _create_unless(tokens):
tokens_by_type = classify(tokens, lambda t: type(t.pattern))
@@ -85,7 +134,7 @@ def _create_unless(tokens):
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = _create_unless_callback(unless)
callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))


tokens = [t for t in tokens if t not in embedded_strs]
return tokens, callback
@@ -110,13 +159,13 @@ def _build_mres(tokens, max_size, match_whole):
def build_mres(tokens, match_whole=False):
return _build_mres(tokens, len(tokens), match_whole)


def _regexp_has_newline(r):
return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)


class Lexer(object):
class Lexer:
def __init__(self, tokens, ignore=()):
assert all(isinstance(t, TokenDef) for t in tokens), tokens


self.ignore = ignore
self.newline_char = '\n'
tokens = list(tokens)


# Sanitization
@@ -129,14 +178,11 @@ class Lexer(object):
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))


token_names = {t.name for t in tokens}
for t in ignore:
if t not in token_names:
raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
assert set(ignore) <= {t.name for t in tokens}


# Init
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
self.ignore_types = [t for t in ignore]
self.ignore_types = list(ignore)


tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))


@@ -147,46 +193,8 @@ class Lexer(object):


self.mres = build_mres(tokens)



def lex(self, stream):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.newline_types)
ignore_types = list(self.ignore_types)
while True:
for mre, type_from_index in self.mres:
m = mre.match(stream, lex_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
to_yield = type_ not in ignore_types

if to_yield:
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
end_col = t.column + len(value)
if t.type in self.callback:
t = self.callback[t.type](t)

if type_ in newline_types:
newlines = value.count(self.newline_char)
if newlines:
line += newlines
last_newline_index = value.rindex(self.newline_char) + 1
col_start_pos = lex_pos + last_newline_index
end_col = len(value) - last_newline_index

if to_yield:
t.end_line = line
t.end_col = end_col
yield t

lex_pos += len(value)
break
else:
if lex_pos < len(stream):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
break
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)




class ContextualLexer:
@@ -204,7 +212,7 @@ class ContextualLexer:
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) | set(ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END']
lexer = Lexer(state_tokens, ignore=ignore)
lexer_by_tokens[key] = lexer


@@ -218,33 +226,9 @@ class ContextualLexer:
self.parser_state = state


def lex(self, stream):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.root_lexer.newline_types)
ignore_types = list(self.root_lexer.ignore_types)
while True:
lexer = self.lexers[self.parser_state]
for mre, type_from_index in lexer.mres:
m = mre.match(stream, lex_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
yield t
l = _Lex(self.lexers[self.parser_state])
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
l.lexer = self.lexers[self.parser_state]


if type_ in newline_types:
newlines = value.count(lexer.newline_char)
if newlines:
line += newlines
col_start_pos = lex_pos + value.rindex(lexer.newline_char)
lex_pos += len(value)
break
else:
if lex_pos < len(stream):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
break
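The line/column bookkeeping that was previously duplicated in Lexer.lex and ContextualLexer.lex now lives in LineCounter; a small illustrative check of its expected behaviour (not part of the commit):

lc = LineCounter()
lc.feed("foo\nbar")                   # token containing a newline
assert (lc.line, lc.column) == (2, 3)
lc.feed("baz", test_newline=False)    # optimization: skip the newline scan
assert (lc.line, lc.column) == (2, 6)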



+51 -51 lark/load_grammar.py

@@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .grammar import RuleOptions, Rule


from .tree import Tree as T, Transformer, InlineTransformer, Visitor


@@ -127,7 +128,7 @@ RULES = {


class EBNF_to_BNF(InlineTransformer):
def __init__(self):
self.new_rules = {}
self.new_rules = []
self.rules_by_expr = {}
self.prefix = 'anon'
self.i = 0
@@ -140,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer):
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
self.i += 1
t = Token('RULE', new_name, -1)
self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
self.new_rules.append((new_name, tree, self.rule_options))
self.rules_by_expr[expr] = t
return t


@@ -174,7 +176,6 @@ class SimplifyRule_Visitor(Visitor):
break
tree.expand_kids_by_index(*to_expand)



def expansion(self, tree):
# rules_list unpacking
# a : b (c|d) e
@@ -194,7 +195,7 @@ class SimplifyRule_Visitor(Visitor):
tree.data = 'expansions'
tree.children = [self.visit(T('expansion', [option if i==j else other
for j, other in enumerate(tree.children)]))
for option in child.children]
for option in set(child.children)]
break
else:
break
@@ -208,7 +209,10 @@ class SimplifyRule_Visitor(Visitor):
tree.data = 'expansions'
tree.children = aliases


expansions = _flatten
def expansions(self, tree):
self._flatten(tree)
tree.children = list(set(tree.children))



class RuleTreeToText(Transformer):
def expansions(self, x):
@@ -389,12 +393,6 @@ def _interleave(l, item):
def _choice_of_rules(rules):
return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])


def dict_update_safe(d1, d2):
for k, v in d2.items():
assert k not in d1
d1[k] = v


class Grammar:
def __init__(self, rule_defs, token_defs, ignore):
self.token_defs = token_defs
@@ -411,6 +409,7 @@ class Grammar:
terms_to_ignore = {name:'__'+name for name in self.ignore}
if terms_to_ignore:
assert set(terms_to_ignore) <= {name for name, _t in term_defs}

term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
expr = Token('RULE', '__ignore')
for r, tree, _o in rule_defs:
@@ -466,57 +465,41 @@ class Grammar:
# =================
# Compile Rules
# =================
ebnf_to_bnf = EBNF_to_BNF()
simplify_rule = SimplifyRule_Visitor()


# 1. Pre-process terminals
transformer = PrepareLiterals()
if not lexer:
transformer *= SplitLiterals()
transformer *= ExtractAnonTokens(tokens) # Adds to tokens


rules = {}
# 2. Convert EBNF to BNF (and apply step 1)
ebnf_to_bnf = EBNF_to_BNF()
rules = []
for name, rule_tree, options in rule_defs:
assert name not in rules, name
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
tree = transformer.transform(rule_tree)
rules[name] = ebnf_to_bnf.transform(tree), options
rules.append((name, ebnf_to_bnf.transform(tree), options))
rules += ebnf_to_bnf.new_rules


dict_update_safe(rules, ebnf_to_bnf.new_rules)

for tree, _o in rules.values():
simplify_rule.visit(tree)
assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"


# 3. Compile tree to Rule objects
rule_tree_to_text = RuleTreeToText()
rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()}

return tokens, rules, self.ignore


simplify_rule = SimplifyRule_Visitor()
compiled_rules = []
for name, tree, options in rules:
simplify_rule.visit(tree)
expansions = rule_tree_to_text.transform(tree)


for expansion, alias in expansions:
if alias and name.startswith('_'):
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))


class RuleOptions:
def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.create_token = create_token # used for scanless postprocessing
self.priority = priority

self.filter_out = filter_out # remove this rule from the tree
# used for "token"-rules in scanless
@classmethod
def from_rule(cls, name, *x):
if len(x) > 1:
priority, expansions = x
priority = int(priority)
else:
expansions ,= x
priority = None

keep_all_tokens = name.startswith('!')
name = name.lstrip('!')
expand1 = name.startswith('?')
name = name.lstrip('?')
rule = Rule(name, expansion, alias, options)
compiled_rules.append(rule)


return name, expansions, cls(keep_all_tokens, expand1, priority=priority)
return tokens, compiled_rules, self.ignore






@@ -553,15 +536,30 @@ def resolve_token_references(token_defs):
if not changed:
break


def options_from_rule(name, *x):
if len(x) > 1:
priority, expansions = x
priority = int(priority)
else:
expansions ,= x
priority = None

keep_all_tokens = name.startswith('!')
name = name.lstrip('!')
expand1 = name.startswith('?')
name = name.lstrip('?')

return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)


class GrammarLoader:
def __init__(self):
tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]


rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()]
d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
rules, callback = ParseTreeBuilder(d, T).apply()
rules = [options_from_rule(name, x) for name, x in RULES.items()]
rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
callback = ParseTreeBuilder(rules, T).create_callback()
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, 'start')
self.parser = LALR(lexer_conf, parser_conf)


@@ -636,7 +634,6 @@ class GrammarLoader:
ignore_names.append(name)
token_defs.append((name, (t, 0)))



# Verify correctness 2
token_names = set()
for name, _ in token_defs:
@@ -644,10 +641,13 @@ class GrammarLoader:
raise GrammarError("Token '%s' defined more than once" % name) raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name) token_names.add(name)


if set(ignore_names) > token_names:
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))

# Resolve token references
resolve_token_references(token_defs)


rules = [RuleOptions.from_rule(*x) for x in rule_defs]
rules = [options_from_rule(*x) for x in rule_defs]


rule_names = set()
for name, _x, _o in rules:
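options_from_rule (extracted above from the old RuleOptions.from_rule classmethod) still interprets the '!' and '?' rule-name prefixes and an optional priority; roughly, with a placeholder string standing in for the parsed expansions tree:

name, expansions, opts = options_from_rule('?value', 'EXPANSIONS_TREE')
assert name == 'value' and opts.expand1 and not opts.keep_all_tokens and opts.priority is None

name, expansions, opts = options_from_rule('!stmt', '2', 'EXPANSIONS_TREE')
assert name == 'stmt' and opts.keep_all_tokens and opts.priority == 2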


+33 -28 lark/parse_tree_builder.py

@@ -1,6 +1,9 @@
from .common import is_terminal, GrammarError
from .utils import suppress
from .lexer import Token
from .grammar import Rule

###{standalone


class NodeBuilder:
def __init__(self, tree_class, name):
@@ -27,7 +30,7 @@ class Factory:


def __call__(self, node_builder):
return self.cls(node_builder, *self.args)


class TokenWrapper:
"Used for fixing the results of scanless parsing"
@@ -106,51 +109,53 @@ class ParseTreeBuilder:


self.rule_builders = list(self._init_builders(rules))


self.user_aliases = {}

def _init_builders(self, rules):
filter_out = set()
for origin, (expansions, options) in rules.items():
if options and options.filter_out:
assert origin.startswith('_') # Just to make sure
filter_out.add(origin)
for rule in rules:
if rule.options and rule.options.filter_out:
assert rule.origin.startswith('_') # Just to make sure
filter_out.add(rule.origin)


for origin, (expansions, options) in rules.items():
for rule in rules:
options = rule.options
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
expand1 = options.expand1 if options else False
create_token = options.create_token if options else False


for expansion, alias in expansions:
if alias and origin.startswith('_'):
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
wrapper_chain = filter(None, [
(expand1 and not rule.alias) and Expand1,
create_token and Factory(TokenWrapper, create_token),
create_rule_handler(rule.expansion, keep_all_tokens, filter_out),
self.propagate_positions and PropagatePositions,
])


wrapper_chain = filter(None, [
(expand1 and not alias) and Expand1,
create_token and Factory(TokenWrapper, create_token),
create_rule_handler(expansion, keep_all_tokens, filter_out),
self.propagate_positions and PropagatePositions,
])
yield rule, wrapper_chain


yield origin, expansion, options, alias or origin, wrapper_chain



def apply(self, transformer=None):
def create_callback(self, transformer=None):
callback = Callback()


new_rules = []
for origin, expansion, options, alias, wrapper_chain in self.rule_builders:
callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion))
for rule, wrapper_chain in self.rule_builders:
internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion))


user_callback_name = rule.alias or rule.origin
try:
f = transformer._get_func(alias)
f = transformer._get_func(user_callback_name)
except AttributeError:
f = NodeBuilder(self.tree_class, alias)
f = NodeBuilder(self.tree_class, user_callback_name)

self.user_aliases[rule] = rule.alias
rule.alias = internal_callback_name


for w in wrapper_chain:
f = w(f)


if hasattr(callback, callback_name):
raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
setattr(callback, callback_name, f)
if hasattr(callback, internal_callback_name):
raise GrammarError("Rule '%s' already exists" % (rule,))
setattr(callback, internal_callback_name, f)


new_rules.append(( origin, expansion, callback_name, options ))
return callback


return new_rules, callback
###}
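With apply() replaced by create_callback(), callers now build the rule-to-callable mapping themselves; the Lark_StandAlone class generated earlier in this commit shows the intended pattern, roughly:

parse_tree_builder = ParseTreeBuilder(rules, Tree)
callback = parse_tree_builder.create_callback(transformer=None)
# after create_callback(), each rule.alias holds the internal callback name
callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in rules}
parser = _Parser(parse_table, callbacks)   # as done in Lark_StandAlone above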

+55 -63 lark/parser_frontends.py

@@ -1,5 +1,5 @@
import re
import sre_parse
from .utils import get_regexp_width


from parsers.grammar_analysis import GrammarAnalyzer
from .lexer import Lexer, ContextualLexer, Token
@@ -9,10 +9,16 @@ from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .tree import Tree


class WithLexer:
def __init__(self, lexer_conf):
def init_traditional_lexer(self, lexer_conf):
self.lexer_conf = lexer_conf
self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)


def init_contextual_lexer(self, lexer_conf, parser_conf):
self.lexer_conf = lexer_conf
d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)

def lex(self, text):
stream = self.lexer.lex(text)
if self.lexer_conf.postlex:
@@ -23,32 +29,22 @@ class WithLexer:


class LALR(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf)

self.parser_conf = parser_conf
self.parser = lalr_parser.Parser(parser_conf)
self.init_traditional_lexer(lexer_conf)


def parse(self, text):
tokens = self.lex(text)
return self.parser.parse(tokens)
token_stream = self.lex(text)
return self.parser.parse(token_stream)




class LALR_ContextualLexer:
class LALR_ContextualLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf
self.parser_conf = parser_conf

self.parser = lalr_parser.Parser(parser_conf)

d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()}
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)
self.init_contextual_lexer(lexer_conf, parser_conf)


def parse(self, text):
tokens = self.lexer.lex(text)
if self.lexer_conf.postlex:
tokens = self.lexer_conf.postlex.process(tokens)
return self.parser.parse(tokens, self.lexer.set_parser_state)
token_stream = self.lex(text)
return self.parser.parse(token_stream, self.lexer.set_parser_state)


def get_ambiguity_resolver(options):
if not options or options.ambiguity == 'resolve':
@@ -60,55 +56,47 @@ def get_ambiguity_resolver(options):
raise ValueError(options) raise ValueError(options)


def tokenize_text(text): def tokenize_text(text):
new_text = []
line = 1 line = 1
col_start_pos = 0 col_start_pos = 0
for i, ch in enumerate(text): for i, ch in enumerate(text):
if '\n' in ch: if '\n' in ch:
line += ch.count('\n') line += ch.count('\n')
col_start_pos = i + ch.rindex('\n') col_start_pos = i + ch.rindex('\n')
new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
return new_text
yield Token('CHAR', ch, line=line, column=i - col_start_pos)


class Earley_NoLex: class Earley_NoLex:
def __init__(self, lexer_conf, parser_conf, options=None): def __init__(self, lexer_conf, parser_conf, options=None):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self._prepare_match(lexer_conf)


rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]

self.parser = earley.Parser(rules,
parser_conf.start,
parser_conf.callback,
self.parser = earley.Parser(parser_conf, self.match,
resolve_ambiguity=get_ambiguity_resolver(options)) resolve_ambiguity=get_ambiguity_resolver(options))


def _prepare_expansion(self, expansion):
for sym in expansion:
if is_terminal(sym):
regexp = self.token_by_name[sym].pattern.to_regexp()
width = sre_parse.parse(regexp).getwidth()
if width != (1,1):
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
yield Terminal_Regexp(sym, regexp)
else:
yield sym

def match(self, term, text, index=0):
return self.regexps[term].match(text, index)

def _prepare_match(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
regexp = t.pattern.to_regexp()
width = get_regexp_width(regexp)
if width != (1,1):
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width))
self.regexps[t.name] = re.compile(regexp)


def parse(self, text): def parse(self, text):
new_text = tokenize_text(text)
return self.parser.parse(new_text)
token_stream = tokenize_text(text)
return self.parser.parse(token_stream)


class Earley(WithLexer): class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None): def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf)

rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]
self.init_traditional_lexer(lexer_conf)


self.parser = earley.Parser(rules,
parser_conf.start,
parser_conf.callback,
self.parser = earley.Parser(parser_conf, self.match,
resolve_ambiguity=get_ambiguity_resolver(options)) resolve_ambiguity=get_ambiguity_resolver(options))


def _prepare_expansion(self, expansion):
return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
def match(self, term, token):
return term == token.type


def parse(self, text): def parse(self, text):
tokens = self.lex(text) tokens = self.lex(text)
@@ -119,27 +107,31 @@ class XEarley:
def __init__(self, lexer_conf, parser_conf, options=None): def __init__(self, lexer_conf, parser_conf, options=None):
self.token_by_name = {t.name:t for t in lexer_conf.tokens} self.token_by_name = {t.name:t for t in lexer_conf.tokens}


rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
self._prepare_match(lexer_conf)


ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]

self.parser = xearley.Parser(rules,
parser_conf.start,
parser_conf.callback,
self.parser = xearley.Parser(parser_conf,
self.match,
resolve_ambiguity=get_ambiguity_resolver(options), resolve_ambiguity=get_ambiguity_resolver(options),
ignore=ignore,
ignore=lexer_conf.ignore,
predict_all=options.earley__predict_all predict_all=options.earley__predict_all
) )


def _prepare_expansion(self, expansion):
for sym in expansion:
if is_terminal(sym):
regexp = self.token_by_name[sym].pattern.to_regexp()
width = sre_parse.parse(regexp).getwidth()
assert width
yield Terminal_Regexp(sym, regexp)
def match(self, term, text, index=0):
return self.regexps[term].match(text, index)

def _prepare_match(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
regexp = t.pattern.to_regexp()
try:
width = get_regexp_width(regexp)[0]
except ValueError:
raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
else: else:
yield sym
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps")

self.regexps[t.name] = re.compile(regexp)


def parse(self, text): def parse(self, text):
return self.parser.parse(text) return self.parser.parse(text)
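The refactor above removes the per-frontend Terminal_Token/Terminal_Regexp wrappers and instead hands the Earley parsers a single term_matcher callable: lexer-based frontends match a terminal name against a token's type, while the scanless and dynamic-lexer frontends match compiled regexps against the raw text at a position. A rough sketch of the two matcher shapes (the helper names here are illustrative):

import re

def make_token_matcher():
    # Frontends with a standard lexer: a terminal matches when the token type agrees.
    def match(term, token):
        return term == token.type
    return match

def make_text_matcher(token_defs):
    # Scanless / dynamic-lexer frontends: match compiled regexps against raw text.
    regexps = {name: re.compile(pattern) for name, pattern in token_defs}
    def match(term, text, index=0):
        return regexps[term].match(text, index)
    return match

match = make_text_matcher([('NAME', r'[a-z]+'), ('NUMBER', r'[0-9]+')])
m = match('NUMBER', 'abc123', 3)
print(m.group(0))   # -> '123'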


+ 17 - 48  lark/parsers/earley.py

@@ -13,14 +13,11 @@
# Author: Erez Shinan (2017) # Author: Erez Shinan (2017)
# Email : erezshin@gmail.com # Email : erezshin@gmail.com


from ..common import ParseError, UnexpectedToken, Terminal
from ..common import ParseError, UnexpectedToken, is_terminal
from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
from .grammar_analysis import GrammarAnalyzer from .grammar_analysis import GrammarAnalyzer




class EndToken:
type = '$end'

class Derivation(Tree): class Derivation(Tree):
_hash = None _hash = None


@@ -35,8 +32,6 @@ class Derivation(Tree):
self._hash = Tree.__hash__(self) self._hash = Tree.__hash__(self)
return self._hash return self._hash


END_TOKEN = EndToken()

class Item(object): class Item(object):
"An Earley Item, the atom of the algorithm." "An Earley Item, the atom of the algorithm."


@@ -59,11 +54,8 @@ class Item(object):
new_tree = Derivation(self.rule, self.tree.children + [tree]) new_tree = Derivation(self.rule, self.tree.children + [tree])
return self.__class__(self.rule, self.ptr+1, self.start, new_tree) return self.__class__(self.rule, self.ptr+1, self.start, new_tree)


def similar(self, other):
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule

def __eq__(self, other): def __eq__(self, other):
return self.similar(other) #and (self.tree == other.tree)
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule


def __hash__(self): def __hash__(self):
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__
@@ -134,7 +126,7 @@ class Column:
self.completed[item_key] = item self.completed[item_key] = item
self.to_reduce.append(item) self.to_reduce.append(item)
else: else:
if isinstance(item.expect, Terminal):
if is_terminal(item.expect):
self.to_scan.append(item) self.to_scan.append(item)
else: else:
k = item_key if self.predict_all else item k = item_key if self.predict_all else item
@@ -151,31 +143,30 @@ class Column:
__nonzero__ = __bool__ # Py2 backwards-compatibility __nonzero__ = __bool__ # Py2 backwards-compatibility


class Parser: class Parser:
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None):
self.analysis = GrammarAnalyzer(rules, start_symbol)
self.start_symbol = start_symbol
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
self.analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity self.resolve_ambiguity = resolve_ambiguity


self.FIRST = self.analysis.FIRST
self.postprocess = {} self.postprocess = {}
self.predictions = {} self.predictions = {}
self.FIRST = {}
for rule in self.analysis.rules:
if rule.origin != '$root': # XXX kinda ugly
a = rule.alias
self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
for rule in parser_conf.rules:
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]


self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
self.term_matcher = term_matcher




def parse(self, stream, start_symbol=None): def parse(self, stream, start_symbol=None):
# Define parser functions # Define parser functions
start_symbol = start_symbol or self.start_symbol
start_symbol = start_symbol or self.parser_conf.start


_Item = Item _Item = Item
match = self.term_matcher


def predict(nonterm, column): def predict(nonterm, column):
assert not isinstance(nonterm, Terminal), nonterm
assert not is_terminal(nonterm), nonterm
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]


def complete(item): def complete(item):
@@ -195,14 +186,13 @@ class Parser:


for item in to_reduce: for item in to_reduce:
new_items = list(complete(item)) new_items = list(complete(item))
for new_item in new_items:
if new_item.similar(item):
raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule)
if item in new_items:
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items) column.add(new_items)


def scan(i, token, column): def scan(i, token, column):
next_set = Column(i, self.FIRST) next_set = Column(i, self.FIRST)
next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token))
next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))


if not next_set: if not next_set:
expect = {i.expect for i in column.to_scan} expect = {i.expect for i in column.to_scan}
@@ -249,24 +239,3 @@ class ApplyCallbacks(Transformer_NoRecurse):
return callback(children) return callback(children)
else: else:
return Tree(rule.origin, children) return Tree(rule.origin, children)

# RULES = [
# ('a', ['d']),
# ('d', ['b']),
# ('b', ['C']),
# ('b', ['b', 'C']),
# ('b', ['C', 'b']),
# ]
# p = Parser(RULES, 'a')
# for x in p.parse('CC'):
# print x.pretty()

#---------------
# RULES = [
# ('s', ['a', 'a']),
# ('a', ['b', 'b']),
# ('b', ['C'], lambda (x,): x),
# ('b', ['b', 'C']),
# ]
# p = Parser(RULES, 's', {})
# print p.parse('CCCCC').pretty()

+ 19 - 29  lark/parsers/grammar_analysis.py

@@ -1,20 +1,8 @@


from ..utils import bfs, fzset from ..utils import bfs, fzset
from ..common import GrammarError, is_terminal from ..common import GrammarError, is_terminal
from ..grammar import Rule


class Rule(object):
"""
origin : a symbol
expansion : a list of symbols
"""
def __init__(self, origin, expansion, alias=None, options=None):
self.origin = origin
self.expansion = expansion
self.alias = alias
self.options = options

def __repr__(self):
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))


class RulePtr(object): class RulePtr(object):
def __init__(self, rule, index): def __init__(self, rule, index):
@@ -106,28 +94,30 @@ def calculate_sets(rules):




class GrammarAnalyzer(object): class GrammarAnalyzer(object):
def __init__(self, rule_tuples, start_symbol, debug=False):
self.start_symbol = start_symbol
def __init__(self, parser_conf, debug=False):
rules = parser_conf.rules
assert len(rules) == len(set(rules))

self.start_symbol = parser_conf.start
self.debug = debug self.debug = debug
rule_tuples = list(rule_tuples)
rule_tuples.append(('$root', [start_symbol, '$end']))
rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples]

self.rules = set()
self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples}
for origin, exp, alias, options in rule_tuples:
r = Rule( origin, exp, alias, options )
self.rules.add(r)
self.rules_by_origin[origin].append(r)

for r in self.rules:

root_rule = Rule('$root', [self.start_symbol, '$END'])

self.rules_by_origin = {r.origin: [] for r in rules}
for r in rules:
self.rules_by_origin[r.origin].append(r)

self.rules_by_origin[root_rule.origin] = [root_rule]

for r in rules:
for sym in r.expansion: for sym in r.expansion:
if not (is_terminal(sym) or sym in self.rules_by_origin): if not (is_terminal(sym) or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym) raise GrammarError("Using an undefined rule: %s" % sym)


self.init_state = self.expand_rule('$root')
self.start_state = self.expand_rule('$root')
self.rules = rules


self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule])


def expand_rule(self, rule): def expand_rule(self, rule):
"Returns all init_ptrs accessible by rule (recursive)" "Returns all init_ptrs accessible by rule (recursive)"


+ 47 - 18  lark/parsers/lalr_analysis.py

@@ -14,7 +14,43 @@ from ..common import GrammarError, is_terminal


from .grammar_analysis import GrammarAnalyzer from .grammar_analysis import GrammarAnalyzer


ACTION_SHIFT = 0
class Action:
def __init__(self, name):
self.name = name
def __str__(self):
return self.name
def __repr__(self):
return str(self)

Shift = Action('Shift')
Reduce = Action('Reduce')

class ParseTable:
def __init__(self, states, start_state, end_state):
self.states = states
self.start_state = start_state
self.end_state = end_state

class IntParseTable(ParseTable):

@classmethod
def from_ParseTable(cls, parse_table):
enum = list(parse_table.states)
state_to_idx = {s:i for i,s in enumerate(enum)}
int_states = {}

for s, la in parse_table.states.items():
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
for k,v in la.items()}
int_states[ state_to_idx[s] ] = la


start_state = state_to_idx[parse_table.start_state]
end_state = state_to_idx[parse_table.end_state]
return cls(int_states, start_state, end_state)





class LALR_Analyzer(GrammarAnalyzer): class LALR_Analyzer(GrammarAnalyzer):


@@ -27,7 +63,7 @@ class LALR_Analyzer(GrammarAnalyzer):
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
for rp in sat: for rp in sat:
for term in self.FOLLOW.get(rp.rule.origin, ()): for term in self.FOLLOW.get(rp.rule.origin, ()):
lookahead[term].append(('reduce', rp.rule))
lookahead[term].append((Reduce, rp.rule))


d = classify(unsat, lambda rp: rp.next) d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items(): for sym, rps in d.items():
@@ -38,8 +74,8 @@ class LALR_Analyzer(GrammarAnalyzer):
rps |= self.expand_rule(rp.next) rps |= self.expand_rule(rp.next)


new_state = fzset(rps) new_state = fzset(rps)
lookahead[sym].append(('shift', new_state))
if sym == '$end':
lookahead[sym].append((Shift, new_state))
if sym == '$END':
self.end_states.append( new_state ) self.end_states.append( new_state )
yield fzset(rps) yield fzset(rps)


@@ -50,7 +86,7 @@ class LALR_Analyzer(GrammarAnalyzer):
for x in v: for x in v:
# XXX resolving shift/reduce into shift, like PLY # XXX resolving shift/reduce into shift, like PLY
# Give a proper warning # Give a proper warning
if x[0] == 'shift':
if x[0] is Shift:
lookahead[k] = [x] lookahead[k] = [x]


for k, v in lookahead.items(): for k, v in lookahead.items():
@@ -59,22 +95,15 @@ class LALR_Analyzer(GrammarAnalyzer):


self.states[state] = {k:v[0] for k, v in lookahead.items()} self.states[state] = {k:v[0] for k, v in lookahead.items()}


for _ in bfs([self.init_state], step):
for _ in bfs([self.start_state], step):
pass pass


self.end_state ,= self.end_states self.end_state ,= self.end_states


# --
self.enum = list(self.states)
self.enum_rev = {s:i for i,s in enumerate(self.enum)}
self.states_idx = {}

for s, la in self.states.items():
la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift'
else (v[0], (v[1], len(v[1].expansion))) # Reduce
for k,v in la.items()}
self.states_idx[ self.enum_rev[s] ] = la
self._parse_table = ParseTable(self.states, self.start_state, self.end_state)


if self.debug:
self.parse_table = self._parse_table
else:
self.parse_table = IntParseTable.from_ParseTable(self._parse_table)


self.init_state_idx = self.enum_rev[self.init_state]
self.end_state_idx = self.enum_rev[self.end_state]
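For orientation, here is a hand-built example of the data shape the new ParseTable holds and the renumbering that IntParseTable.from_ParseTable performs. It is only a sketch: the state keys are strings and the reduce argument is a placeholder, where the real analyzer uses frozensets of RulePtrs and Rule objects.

Shift, Reduce = 'Shift', 'Reduce'    # stand-ins for the Action singletons above

states = {
    'state0': {'NUMBER': (Shift, 'state1')},
    'state1': {'$END': (Reduce, 'start -> NUMBER')},
}
start_state, end_state = 'state0', 'state1'

# IntParseTable.from_ParseTable does essentially this renumbering:
enum = list(states)
state_to_idx = {s: i for i, s in enumerate(enum)}
int_states = {
    state_to_idx[s]: {k: ((v[0], state_to_idx[v[1]]) if v[0] is Shift else v)
                      for k, v in la.items()}
    for s, la in states.items()
}
print(int_states[state_to_idx[start_state]])   # -> {'NUMBER': ('Shift', 1)}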

+ 25 - 22  lark/parsers/lalr_parser.py

@@ -3,30 +3,30 @@
# Author: Erez Shinan (2017) # Author: Erez Shinan (2017)
# Email : erezshin@gmail.com # Email : erezshin@gmail.com


from ..common import ParseError, UnexpectedToken
from ..common import UnexpectedToken


from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT

class FinalReduce:
def __init__(self, value):
self.value = value
from .lalr_analysis import LALR_Analyzer, Shift


class Parser: class Parser:
def __init__(self, parser_conf): def __init__(self, parser_conf):
assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
assert all(r.options is None or r.options.priority is None
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
self.analysis = analysis = LALR_Analyzer(parser_conf)
analysis.compute_lookahead() analysis.compute_lookahead()
callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
for rule in analysis.rules} for rule in analysis.rules}


self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks)
self.parser_conf = parser_conf
self.parser = _Parser(analysis.parse_table, callbacks)
self.parse = self.parser.parse self.parse = self.parser.parse


###{standalone

class _Parser: class _Parser:
def __init__(self, states, init_state, end_state, callbacks):
self.states = states
self.init_state = init_state
self.end_state = end_state
def __init__(self, parse_table, callbacks):
self.states = parse_table.states
self.start_state = parse_table.start_state
self.end_state = parse_table.end_state
self.callbacks = callbacks self.callbacks = callbacks


def parse(self, seq, set_state=None): def parse(self, seq, set_state=None):
@@ -35,10 +35,10 @@ class _Parser:
stream = iter(seq) stream = iter(seq)
states = self.states states = self.states


state_stack = [self.init_state]
state_stack = [self.start_state]
value_stack = [] value_stack = []


if set_state: set_state(self.init_state)
if set_state: set_state(self.start_state)


def get_action(key): def get_action(key):
state = state_stack[-1] state = state_stack[-1]
@@ -49,7 +49,8 @@ class _Parser:


raise UnexpectedToken(token, expected, seq, i) raise UnexpectedToken(token, expected, seq, i)


def reduce(rule, size):
def reduce(rule):
size = len(rule.expansion)
if size: if size:
s = value_stack[-size:] s = value_stack[-size:]
del state_stack[-size:] del state_stack[-size:]
@@ -60,7 +61,7 @@ class _Parser:
value = self.callbacks[rule](s) value = self.callbacks[rule](s)


_action, new_state = get_action(rule.origin) _action, new_state = get_action(rule.origin)
assert _action == ACTION_SHIFT
assert _action is Shift
state_stack.append(new_state) state_stack.append(new_state)
value_stack.append(value) value_stack.append(value)


@@ -72,22 +73,24 @@ class _Parser:
action, arg = get_action(token.type) action, arg = get_action(token.type)
assert arg != self.end_state assert arg != self.end_state


if action == ACTION_SHIFT:
if action is Shift:
state_stack.append(arg) state_stack.append(arg)
value_stack.append(token) value_stack.append(token)
if set_state: set_state(arg) if set_state: set_state(arg)
token = next(stream) token = next(stream)
i += 1 i += 1
else: else:
reduce(*arg)
reduce(arg)
except StopIteration: except StopIteration:
pass pass


while True: while True:
_action, arg = get_action('$end')
if _action == ACTION_SHIFT:
_action, arg = get_action('$END')
if _action is Shift:
assert arg == self.end_state assert arg == self.end_state
val ,= value_stack val ,= value_stack
return val return val
else: else:
reduce(*arg)
reduce(arg)

###}

+ 21 - 25  lark/parsers/xearley.py

@@ -20,7 +20,7 @@


from collections import defaultdict from collections import defaultdict


from ..common import ParseError, UnexpectedToken, Terminal
from ..common import ParseError, UnexpectedToken, is_terminal
from ..lexer import Token, UnexpectedInput from ..lexer import Token, UnexpectedInput
from ..tree import Tree from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer from .grammar_analysis import GrammarAnalyzer
@@ -28,37 +28,34 @@ from .grammar_analysis import GrammarAnalyzer
from .earley import ApplyCallbacks, Item, Column from .earley import ApplyCallbacks, Item, Column


class Parser: class Parser:
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
self.analysis = GrammarAnalyzer(rules, start_symbol)
self.start_symbol = start_symbol
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False):
self.analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity self.resolve_ambiguity = resolve_ambiguity
self.ignore = list(ignore) self.ignore = list(ignore)
self.predict_all = predict_all self.predict_all = predict_all


self.FIRST = self.analysis.FIRST
self.postprocess = {} self.postprocess = {}
self.predictions = {} self.predictions = {}
self.FIRST = {}

for rule in self.analysis.rules:
if rule.origin != '$root': # XXX kinda ugly
a = rule.alias
self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
for rule in parser_conf.rules:
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]


self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
self.term_matcher = term_matcher




def parse(self, stream, start_symbol=None): def parse(self, stream, start_symbol=None):
# Define parser functions # Define parser functions
start_symbol = start_symbol or self.start_symbol
start_symbol = start_symbol or self.parser_conf.start
delayed_matches = defaultdict(list) delayed_matches = defaultdict(list)
match = self.term_matcher


text_line = 1 text_line = 1
text_column = 0 text_column = 0


def predict(nonterm, column): def predict(nonterm, column):
assert not isinstance(nonterm, Terminal), nonterm
assert not is_terminal(nonterm), nonterm
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]


def complete(item): def complete(item):
@@ -77,16 +74,15 @@ class Parser:
column.add( predict(nonterm, column) ) column.add( predict(nonterm, column) )
for item in to_reduce: for item in to_reduce:
new_items = list(complete(item)) new_items = list(complete(item))
for new_item in new_items:
if new_item.similar(item):
raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule)
if item in new_items:
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items) column.add(new_items)


def scan(i, token, column): def scan(i, token, column):
to_scan = column.to_scan to_scan = column.to_scan


for x in self.ignore: for x in self.ignore:
m = x.match(stream, i)
m = match(x, stream, i)
if m: if m:
delayed_matches[m.end()] += set(to_scan) delayed_matches[m.end()] += set(to_scan)
delayed_matches[m.end()] += set(column.to_reduce) delayed_matches[m.end()] += set(column.to_reduce)
@@ -99,16 +95,16 @@ class Parser:
# delayed_matches[m.end()] += to_scan # delayed_matches[m.end()] += to_scan


for item in to_scan: for item in to_scan:
m = item.expect.match(stream, i)
m = match(item.expect, stream, i)
if m: if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
t = Token(item.expect, m.group(0), i, text_line, text_column)
delayed_matches[m.end()].append(item.advance(t)) delayed_matches[m.end()].append(item.advance(t))


s = m.group(0) s = m.group(0)
for j in range(1, len(s)): for j in range(1, len(s)):
m = item.expect.match(s[:-j])
m = match(item.expect, s[:-j])
if m: if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
t = Token(item.expect, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append(item.advance(t)) delayed_matches[i+m.end()].append(item.advance(t))


next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
@@ -131,7 +127,7 @@ class Parser:


if token == '\n': if token == '\n':
text_line += 1 text_line += 1
text_column = 1
text_column = 0
else: else:
text_column += 1 text_column += 1


@@ -143,7 +139,7 @@ class Parser:
if n.rule.origin==start_symbol and n.start is column0] if n.rule.origin==start_symbol and n.start is column0]


if not solutions: if not solutions:
expected_tokens = [t.expect.name for t in column.to_scan]
expected_tokens = [t.expect for t in column.to_scan]
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)


elif len(solutions) == 1: elif len(solutions) == 1:
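The dynamic lexer above works by trying each expected terminal's regexp directly against the input text at the current position and deferring every successful match to the column where it ends (delayed_matches). A toy sketch of that scanning idea, not library code:

import re
from collections import defaultdict

regexps = {'WORD': re.compile(r'[a-z]+'), 'NUMBER': re.compile(r'[0-9]+')}

def scan_position(text, i, expected_terms):
    # Collect matches keyed by their end position, like xearley's delayed_matches.
    delayed = defaultdict(list)
    for term in expected_terms:
        m = regexps[term].match(text, i)
        if m:
            delayed[m.end()].append((term, m.group(0)))
    return delayed

print(dict(scan_position('abc123', 0, ['WORD', 'NUMBER'])))   # -> {3: [('WORD', 'abc')]}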


+ 203 - 0  lark/tools/standalone.py

@@ -0,0 +1,203 @@
###{standalone
#
#
# Lark Stand-alone Generator Tool
# ----------------------------------
# Generates a stand-alone LALR(1) parser with a standard lexer
#
# Git: https://github.com/erezsh/lark
# Author: Erez Shinan (erezshin@gmail.com)
#
#
# >>> LICENSE
#
# This tool and its generated code use a separate license from Lark.
#
# It is licensed under GPLv2 or above.
#
# If you wish to purchase a commercial license for this tool and its
# generated code, contact me via email.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# See <http://www.gnu.org/licenses/>.
#
#
###}

import codecs
import sys
import os
from pprint import pprint
from os import path
from collections import defaultdict

import lark
from lark import Lark
from lark.parsers.lalr_analysis import Shift, Reduce

from ..grammar import Rule

__dir__ = path.dirname(__file__)
__larkdir__ = path.join(__dir__, path.pardir)


EXTRACT_STANDALONE_FILES = [
'tools/standalone.py',
'utils.py',
'common.py',
'tree.py',
'indenter.py',
'lexer.py',
'parse_tree_builder.py',
'parsers/lalr_parser.py',
]


def extract_sections(lines):
section = None
text = []
sections = defaultdict(list)
for l in lines:
if l.startswith('###'):
if l[3] == '{':
section = l[4:].strip()
elif l[3] == '}':
sections[section] += text
section = None
text = []
else:
raise ValueError(l)
elif section:
text.append(l)

return {name:''.join(text) for name, text in sections.items()}

class LexerAtoms:
def __init__(self, lexer):
self.mres = [(p.pattern,d) for p,d in lexer.mres]
self.newline_types = lexer.newline_types
self.ignore_types = lexer.ignore_types
self.callback = {name:[(p.pattern,d) for p,d in c.mres]
for name, c in lexer.callback.items()}

def print_python(self):
print('import re')
print('MRES = (')
pprint(self.mres)
print(')')
print('LEXER_CALLBACK = (')
pprint(self.callback)
print(')')
print('NEWLINE_TYPES = %s' % self.newline_types)
print('IGNORE_TYPES = %s' % self.ignore_types)
print('class LexerRegexps: pass')
print('lexer_regexps = LexerRegexps()')
print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]')
print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])')
print(' for n, mres in LEXER_CALLBACK.items()}')
print('lexer = _Lex(lexer_regexps)')
print('def lex(stream):')
print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)')


class GetRule:
def __init__(self, rule_id):
self.rule_id = rule_id

def __repr__(self):
return 'RULES[%d]' % self.rule_id

rule_ids = {}
token_types = {}

def _get_token_type(token_type):
if token_type not in token_types:
token_types[token_type] = len(token_types)
return token_types[token_type]

class ParserAtoms:
def __init__(self, parser):
self.parse_table = parser.analysis.parse_table

def print_python(self):
print('class ParseTable: pass')
print('parse_table = ParseTable()')
print('STATES = {')
for state, actions in self.parse_table.states.items():
print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg))
for token, (action, arg) in actions.items()}))
print('}')
print('TOKEN_TYPES = (')
pprint({v:k for k, v in token_types.items()})
print(')')
print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}')
print(' for s, acts in STATES.items()}')
print('parse_table.start_state = %s' % self.parse_table.start_state)
print('parse_table.end_state = %s' % self.parse_table.end_state)
print('class Lark_StandAlone:')
print(' def __init__(self, transformer=None, postlex=None):')
print(' callback = parse_tree_builder.create_callback(transformer=transformer)')
print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}')
print(' self.parser = _Parser(parse_table, callbacks)')
print(' self.postlex = postlex')
print(' def parse(self, stream):')
print(' tokens = lex(stream)')
print(' if self.postlex: tokens = self.postlex.process(tokens)')
print(' return self.parser.parse(tokens)')

class TreeBuilderAtoms:
def __init__(self, lark):
self.rules = lark.rules
self.ptb = lark._parse_tree_builder

def print_python(self):
print('RULES = {')
for i, r in enumerate(self.rules):
rule_ids[r] = i
print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options ))
print('}')
print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)')

def main(fn, start):
with codecs.open(fn, encoding='utf8') as f:
lark_inst = Lark(f, parser="lalr", start=start)

lexer_atoms = LexerAtoms(lark_inst.parser.lexer)
parser_atoms = ParserAtoms(lark_inst.parser.parser)
tree_builder_atoms = TreeBuilderAtoms(lark_inst)

print('# The file was automatically generated by Lark v%s' % lark.__version__)

for pyfile in EXTRACT_STANDALONE_FILES:
print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone'])

print(open(os.path.join(__larkdir__, 'grammar.py')).read())
print('Shift = 0')
print('Reduce = 1')
lexer_atoms.print_python()
tree_builder_atoms.print_python()
parser_atoms.print_python()

if __name__ == '__main__':
if len(sys.argv) < 2:
print("Lark Stand-alone Generator Tool")
print("Usage: python -m lark.tools.standalone <grammar-file> [<start>]")
sys.exit(1)

if len(sys.argv) == 3:
fn, start = sys.argv[1:]
elif len(sys.argv) == 2:
fn, start = sys.argv[1], 'start'
else:
assert False, sys.argv

main(fn, start)
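As a usage sketch: the tool prints the generated parser to stdout, so per its own usage message it would typically be run as python -m lark.tools.standalone json.g > json_parser.py (the grammar and module names follow the examples/standalone files added in this commit). The generated module then exposes the Lark_StandAlone class printed above:

# Assuming the generator's output was saved as json_parser.py (see above):
from json_parser import Lark_StandAlone

parser = Lark_StandAlone()
tree = parser.parse('{"answer": 42}')
print(tree.pretty())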

+ 5 - 1  lark/tree.py

@@ -7,6 +7,7 @@ from copy import deepcopy


from .utils import inline_args from .utils import inline_args


###{standalone
class Tree(object): class Tree(object):
def __init__(self, data, children, rule=None): def __init__(self, data, children, rule=None):
self.data = data self.data = data
@@ -34,6 +35,7 @@ class Tree(object):


def pretty(self, indent_str=' '): def pretty(self, indent_str=' '):
return ''.join(self._pretty(0, indent_str)) return ''.join(self._pretty(0, indent_str))
###}


def expand_kids_by_index(self, *indices): def expand_kids_by_index(self, *indices):
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
@@ -100,6 +102,7 @@ class Tree(object):






###{standalone
class Transformer(object): class Transformer(object):
def _get_func(self, name): def _get_func(self, name):
return getattr(self, name) return getattr(self, name)
@@ -139,7 +142,7 @@ class TransformerChain(object):


def __mul__(self, other): def __mul__(self, other):
return TransformerChain(*self.transformers + (other,)) return TransformerChain(*self.transformers + (other,))




class InlineTransformer(Transformer): class InlineTransformer(Transformer):
@@ -196,6 +199,7 @@ class Transformer_NoRecurse(Transformer):


def __default__(self, t): def __default__(self, t):
return t return t
###}




def pydot__tree_to_png(tree, filename): def pydot__tree_to_png(tree, filename):


+ 26 - 17  lark/utils.py

@@ -1,7 +1,4 @@
import functools
import types
from collections import deque from collections import deque
from contextlib import contextmanager


class fzset(frozenset): class fzset(frozenset):
def __repr__(self): def __repr__(self):
@@ -49,8 +46,13 @@ try:
except NameError: # Python 3 except NameError: # Python 3
STRING_TYPE = str STRING_TYPE = str


Str = type(u'')
###{standalone


import types
import functools
from contextlib import contextmanager

Str = type(u'')


def inline_args(f): def inline_args(f):
# print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType)
@@ -76,19 +78,6 @@ def inline_args(f):
return _f return _f





try:
compare = cmp
except NameError:
def compare(a, b):
if a == b:
return 0
elif a > b:
return 1
else:
return -1


try: try:
from contextlib import suppress # Python 3 from contextlib import suppress # Python 3
except ImportError: except ImportError:
@@ -107,6 +96,26 @@ except ImportError:
except excs: except excs:
pass pass


###}






try:
compare = cmp
except NameError:
def compare(a, b):
if a == b:
return 0
elif a > b:
return 1
else:
return -1


import sre_parse
import sre_constants
def get_regexp_width(regexp):
try:
return sre_parse.parse(regexp).getwidth()
except sre_constants.error:
raise ValueError(regexp)
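get_regexp_width, added above, simply exposes sre_parse's (min, max) width calculation and turns pattern errors into ValueError; the frontends use it to reject zero-width or multi-character terminals where required. A quick illustration (assumes this version of lark is importable):

from lark.utils import get_regexp_width

print(get_regexp_width(r'[a-z]'))   # -> (1, 1)
print(get_regexp_width(r'a|bc'))    # -> (1, 2)
print(get_regexp_width(r'x*')[0])   # -> 0, i.e. the pattern can match the empty string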

+ 49 - 5  tests/test_parser.py

@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("x")) r = T().transform(g.parse("x"))
self.assertEqual( r.children, ["<b>"] ) self.assertEqual( r.children, ["<b>"] )


g = Lark("""start: a g = Lark("""start: a
?a : b ?a : b
b : "x" b : "x"
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("xx")) r = T().transform(g.parse("xx"))
self.assertEqual( r.children, ["<c>"] ) self.assertEqual( r.children, ["<c>"] )


g = Lark("""start: a g = Lark("""start: a
?a : b b -> c ?a : b b -> c
b : "x" b : "x"
""", parser='lalr', transformer=T()) """, parser='lalr', transformer=T())
r = g.parse("xx") r = g.parse("xx")
self.assertEqual( r.children, ["<c>"] ) self.assertEqual( r.children, ["<c>"] )






@@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER):
# Fails an Earley implementation without special handling for empty rules, # Fails an Earley implementation without special handling for empty rules,
# or re-processing of already completed rules. # or re-processing of already completed rules.
g = Lark(r"""start: B g = Lark(r"""start: B
B: ("ab"|/[^b]/)*
B: ("ab"|/[^b]/)+
""", lexer=LEXER) """, lexer=LEXER)


self.assertEqual( g.parse('abc').children[0], 'abc') self.assertEqual( g.parse('abc').children[0], 'abc')
@@ -796,6 +796,49 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, ['a', 'A']) self.assertEqual(tree.children, ['a', 'A'])




def test_twice_empty(self):
g = """!start: [["A"]]
"""
l = _Lark(g)
tree = l.parse('A')
self.assertEqual(tree.children, ['A'])

tree = l.parse('')
self.assertEqual(tree.children, [])

def test_undefined_ignore(self):
g = """!start: "A"

%ignore B
"""
self.assertRaises( GrammarError, _Lark, g)

@unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
def test_line_and_column(self):
g = r"""!start: "A" bc "D"
!bc: "B\nC"
"""
l = _Lark(g)
a, bc, d = l.parse("AB\nCD").children
self.assertEqual(a.line, 1)
self.assertEqual(a.column, 0)

bc ,= bc.children
self.assertEqual(bc.line, 1)
self.assertEqual(bc.column, 1)

self.assertEqual(d.line, 2)
self.assertEqual(d.column, 1)

# self.assertEqual(a.end_line, 1)
# self.assertEqual(a.end_col, 1)
# self.assertEqual(bc.end_line, 2)
# self.assertEqual(bc.end_col, 1)
# self.assertEqual(d.end_line, 2)
# self.assertEqual(d.end_col, 2)



def test_reduce_cycle(self): def test_reduce_cycle(self):
"""Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
@@ -969,7 +1012,7 @@ def _make_parser_test(LEXER, PARSER):


parser = _Lark(grammar) parser = _Lark(grammar)


tree = parser.parse("int 1 ! This is a comment\n")
tree = parser.parse("int 1 ! This is a comment\n")
self.assertEqual(tree.children, ['1']) self.assertEqual(tree.children, ['1'])


tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
@@ -983,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, []) self.assertEqual(tree.children, [])





@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions")
def test_regex_escaping(self): def test_regex_escaping(self):
g = _Lark("start: /[ab]/") g = _Lark("start: /[ab]/")

