
Merge branch 'master' of https://github.com/lark-parser/lark into file-ext-change

- Merging updated upstream into the branch for the file extension changes.
- Will push so the pull request has no remaining conflicts.
- Will also change the file type of the lark example grammar.
Rob Rose, 7 years ago
parent commit f92ed2607e

21 changed files with 403 additions and 64 deletions
  1. README.md (+2 -0)
  2. examples/README.md (+2 -0)
  3. examples/error_reporting_lalr.py (+81 -0)
  4. examples/lark.g (+49 -0)
  5. examples/lark_grammar.py (+18 -0)
  6. lark/__init__.py (+1 -1)
  7. lark/common.py (+33 -3)
  8. lark/grammars/common.lark (+1 -0)
  9. lark/lark.py (+1 -1)
  10. lark/lexer.py (+5 -1)
  11. lark/load_grammar.py (+47 -28)
  12. lark/parse_tree_builder.py (+18 -4)
  13. lark/parser_frontends.py (+2 -2)
  14. lark/parsers/earley.py (+3 -3)
  15. lark/parsers/grammar_analysis.py (+4 -1)
  16. lark/parsers/lalr_parser.py (+3 -4)
  17. lark/parsers/resolve_ambig.py (+1 -5)
  18. lark/tools/standalone.py (+1 -1)
  19. lark/tree.py (+26 -0)
  20. tests/test_parser.py (+65 -9)
  21. tests/test_trees.py (+40 -1)

README.md (+2 -0)

@@ -165,3 +165,5 @@ If you're interested in taking one of these on, let me know and I will provide m
 If you have any questions or want my assistance, you can email me at erezshin at gmail com.
 
 I'm also available for contract work.
+
+-- [Erez](https://github.com/erezsh)

examples/README.md (+2 -0)

@@ -7,9 +7,11 @@
 - [indented\_tree.py](indented_tree.py) - A demonstration of parsing indentation ("whitespace significant" language)
 - [fruitflies.py](fruitflies.py) - A demonstration of ambiguity
 - [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
+- [lark\_grammar.py](lark_grammar.py) + [lark.g](lark.g) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer)
 
 ### Advanced
 
+- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser
 - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!)
 - [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language
 - [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature

examples/error_reporting_lalr.py (+81 -0, new file)

@@ -0,0 +1,81 @@
#
# This demonstrates example-driven error reporting with the LALR parser
#

from lark import Lark, UnexpectedToken

from .json_parser import json_grammar   # Using the grammar from the json_parser example

json_parser = Lark(json_grammar, parser='lalr')

class JsonSyntaxError(SyntaxError):
    def __str__(self):
        context, line, column = self.args
        return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)

class JsonMissingValue(JsonSyntaxError):
    label = 'Missing Value'

class JsonMissingOpening(JsonSyntaxError):
    label = 'Missing Opening'

class JsonMissingClosing(JsonSyntaxError):
    label = 'Missing Closing'

class JsonMissingComma(JsonSyntaxError):
    label = 'Missing Comma'

class JsonTrailingComma(JsonSyntaxError):
    label = 'Trailing Comma'


def parse(json_text):
    try:
        j = json_parser.parse(json_text)
    except UnexpectedToken as ut:
        exc_class = ut.match_examples(json_parser.parse, {
            JsonMissingValue: ['{"foo": }'],
            JsonMissingOpening: ['{"foo": ]}',
                                 '{"foor": }}'],
            JsonMissingClosing: ['{"foo": [}',
                                 '{',
                                 '{"a": 1',
                                 '[1'],
            JsonMissingComma: ['[1 2]',
                               '[false 1]',
                               '["b" 1]',
                               '{"a":true 1:4}',
                               '{"a":1 1:4}',
                               '{"a":"b" 1:4}'],
            JsonTrailingComma: ['[,]',
                                '[1,]',
                                '[1,2,]',
                                '{"foo":1,}',
                                '{"foo":false,"bar":true,}']
        })
        if not exc_class:
            raise
        raise exc_class(ut.get_context(json_text), ut.line, ut.column)


def test():
    try:
        parse('{"key":')
    except JsonMissingValue:
        pass

    try:
        parse('{"key": "value"')
    except JsonMissingClosing:
        pass

    try:
        parse('{"key": ] ')
    except JsonMissingOpening:
        pass


if __name__ == '__main__':
    test()



examples/lark.g (+49 -0, new file)

@@ -0,0 +1,49 @@
start: (_item | _NL)*

_item: rule
     | token
     | statement

rule: RULE priority? ":" expansions _NL
token: TOKEN priority? ":" expansions _NL

priority: "." NUMBER

statement: "%ignore" expansions _NL -> ignore
         | "%import" import_args ["->" TOKEN] _NL -> import

import_args: name ("." name)*

?expansions: alias (_VBAR alias)*

?alias: expansion ["->" RULE]

?expansion: expr*

?expr: atom [OP | "~" NUMBER [".." NUMBER]]

?atom: "(" expansions ")"
     | "[" expansions "]" -> maybe
     | STRING ".." STRING -> literal_range
     | name
     | (REGEXP | STRING) -> literal

name: RULE
    | TOKEN

_VBAR: _NL? "|"
OP: /[+*][?]?|[?](?![a-z])/
RULE: /!?[_?]?[a-z][_a-z0-9]*/
TOKEN: /_?[A-Z][_A-Z0-9]*/
STRING: _STRING "i"?
REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
_NL: /(\r?\n)+\s*/

%import common.ESCAPED_STRING -> _STRING
%import common.INT -> NUMBER
%import common.WS_INLINE

COMMENT: "//" /[^\n]/*

%ignore WS_INLINE
%ignore COMMENT

examples/lark_grammar.py (+18 -0, new file)

@@ -0,0 +1,18 @@
from lark import Lark

parser = Lark(open('examples/lark.g'), parser="lalr")

grammar_files = [
    'examples/python2.g',
    'examples/python3.g',
    'examples/lark.g',
    'lark/grammars/common.g',
]

def test():
    for grammar_file in grammar_files:
        tree = parser.parse(open(grammar_file).read())
    print("All grammars parsed successfully")

if __name__ == '__main__':
    test()
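
To try the reference grammar by hand, here is a minimal sketch (not part of the commit; it assumes you run from the repository root, where the paths above resolve):

    from lark import Lark

    # Parse a one-rule grammar using the reference grammar defined in examples/lark.g
    lark_parser = Lark(open('examples/lark.g'), parser='lalr')
    tree = lark_parser.parse('start: "a"+\n')
    print(tree.pretty())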

lark/__init__.py (+1 -1)

@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError
 from .lark import Lark
 from .utils import inline_args
 
-__version__ = "0.5.5"
+__version__ = "0.5.6"

lark/common.py (+33 -3)

@@ -1,7 +1,7 @@
 import re
 import sys
 
-from .utils import get_regexp_width
+from .utils import get_regexp_width, STRING_TYPE
 
 Py36 = (sys.version_info[:2] >= (3, 6))

@@ -17,12 +17,13 @@ class ParseError(Exception):
     pass
 
 class UnexpectedToken(ParseError):
-    def __init__(self, token, expected, seq, index, considered_rules=None):
+    def __init__(self, token, expected, seq, index, considered_rules=None, state=None):
         self.token = token
         self.expected = expected
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.considered_rules = considered_rules
+        self.state = state
 
         try:
             context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])

@@ -36,7 +37,36 @@ class UnexpectedToken(ParseError):
         super(UnexpectedToken, self).__init__(message)
 
+    def match_examples(self, parse_fn, examples):
+        """ Given a parser instance and a dictionary mapping some label with
+            some malformed syntax examples, it'll return the label for the
+            example that bests matches the current error.
+        """
+        assert self.state, "Not supported for this exception"
+
+        candidate = None
+        for label, example in examples.items():
+            assert not isinstance(example, STRING_TYPE)
+
+            for malformed in example:
+                try:
+                    parse_fn(malformed)
+                except UnexpectedToken as ut:
+                    if ut.state == self.state:
+                        if ut.token == self.token:  # Try exact match first
+                            return label
+                        elif not candidate:
+                            candidate = label
+
+        return candidate
+
+    def get_context(self, text, span=10):
+        pos = self.token.pos_in_stream
+        start = max(pos - span, 0)
+        end = pos + span
+        before = text[start:pos].rsplit('\n', 1)[-1]
+        after = text[pos:end].split('\n', 1)[0]
+        return before + after + '\n' + ' ' * len(before) + '^\n'
 ###}
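
These two methods back the new example-driven error reporting (demonstrated in examples/error_reporting_lalr.py above): get_context() renders the failing input with a caret marker, and match_examples() re-parses known-bad snippets and returns the label whose failure lands in the same parser state. A minimal sketch with a made-up toy grammar:

    from lark import Lark, UnexpectedToken

    parser = Lark('start: "a" "b"', parser='lalr')   # toy grammar, illustration only

    try:
        parser.parse('aa')
    except UnexpectedToken as e:
        print(e.get_context('aa'))       # prints the input with a '^' under the bad token
        label = e.match_examples(parser.parse, {'unexpected second a': ['aa']})
        assert label == 'unexpected second a'   # same state and same token: exact match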






lark/grammars/common.lark (+1 -0)

@@ -20,6 +20,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
 //
 // Strings
 //
+//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
 STRING_INNER: ("\\\""|/[^"]/)
 ESCAPED_STRING: "\"" STRING_INNER* "\""




lark/lark.py (+1 -1)

@@ -172,7 +172,7 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
 
-        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
         callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):


lark/lexer.py (+5 -1)

@@ -25,6 +25,8 @@ class UnexpectedInput(LexError):
         self.considered_rules = considered_rules
 
 class Token(Str):
+    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
+
     def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
         self = super(Token, cls).__new__(cls, value)
         self.type = type_

@@ -39,7 +41,7 @@ class Token(Str):
         return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)
 
     def __reduce__(self):
-        return (self.__class__, (self.type, self.pos_in_stream, self.value, self.line, self.column, ))
+        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
 
     def __repr__(self):
         return 'Token(%s, %r)' % (self.type, self.value)

@@ -141,6 +143,8 @@ def _create_unless(tokens):
     for retok in tokens_by_type.get(PatternRE, []):
         unless = [] # {}
         for strtok in tokens_by_type.get(PatternStr, []):
+            if strtok.priority > retok.priority:
+                continue
             s = strtok.pattern.value
             m = re.match(retok.pattern.to_regexp(), s)
             if m and m.group(0) == s:
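
The __reduce__ fix matters for pickling: the old tuple passed pos_in_stream where Token.__new__ expects value, so unpickled tokens came back corrupted. A quick round-trip check (a sketch, not part of the commit):

    import pickle
    from lark.lexer import Token

    t = Token('NAME', 'foo', 5, line=1, column=2)
    t2 = pickle.loads(pickle.dumps(t))
    assert t2 == 'foo' and t2.type == 'NAME' and t2.pos_in_stream == 5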


lark/load_grammar.py (+47 -28)

@@ -14,7 +14,7 @@ from .parsers.lalr_parser import UnexpectedToken
 from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule
 
-from .tree import Tree as T, Transformer, InlineTransformer, Visitor
+from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST
 
 __path__ = os.path.dirname(__file__)
 IMPORT_PATHS = [os.path.join(__path__, 'grammars')]

@@ -122,7 +122,7 @@ RULES = {
     'statement': ['ignore', 'import'],
     'ignore': ['_IGNORE expansions _NL'],
     'import': ['_IMPORT import_args _NL',
-               '_IMPORT import_args _TO TOKEN'],
+               '_IMPORT import_args _TO TOKEN _NL'],
     'import_args': ['_import_args'],
     '_import_args': ['name', '_import_args _DOT name'],

@@ -145,14 +145,14 @@ class EBNF_to_BNF(InlineTransformer):
         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
-        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
         self.new_rules.append((new_name, tree, self.rule_options))
         self.rules_by_expr[expr] = t
         return t
 
     def expr(self, rule, op, *args):
         if op.value == '?':
-            return T('expansions', [rule, T('expansion', [])])
+            return ST('expansions', [rule, ST('expansion', [])])
         elif op.value == '+':
             # a : b c+ d
             #   -->

@@ -165,7 +165,7 @@ class EBNF_to_BNF(InlineTransformer):
             # a : b _c? d
             # _c : _c c | c;
             new_name = self._add_recurse_rule('star', rule)
-            return T('expansions', [new_name, T('expansion', [])])
+            return ST('expansions', [new_name, ST('expansion', [])])
         elif op.value == '~':
             if len(args) == 1:
                 mn = mx = int(args[0])

@@ -173,7 +173,7 @@ class EBNF_to_BNF(InlineTransformer):
                 mn, mx = map(int, args)
             if mx < mn:
                 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
-            return T('expansions', [T('expansion', [rule] * n) for n in range(mn, mx+1)])
+            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
         assert False, op

@@ -183,7 +183,7 @@ class SimplifyRule_Visitor(Visitor):
     def _flatten(tree):
         while True:
             to_expand = [i for i, child in enumerate(tree.children)
-                         if isinstance(child, T) and child.data == tree.data]
+                         if isinstance(child, Tree) and child.data == tree.data]
             if not to_expand:
                 break
             tree.expand_kids_by_index(*to_expand)

@@ -203,9 +203,9 @@
         self._flatten(tree)
 
         for i, child in enumerate(tree.children):
-            if isinstance(child, T) and child.data == 'expansions':
+            if isinstance(child, Tree) and child.data == 'expansions':
                 tree.data = 'expansions'
-                tree.children = [self.visit(T('expansion', [option if i==j else other
+                tree.children = [self.visit(ST('expansion', [option if i==j else other
                                                             for j, other in enumerate(tree.children)]))
                                  for option in set(child.children)]
                 break

@@ -217,7 +217,7 @@
         if rule.data == 'expansions':
             aliases = []
             for child in tree.children[0].children:
-                aliases.append(T('alias', [child, alias_name]))
+                aliases.append(ST('alias', [child, alias_name]))
             tree.data = 'expansions'
             tree.children = aliases

@@ -239,7 +239,7 @@ class RuleTreeToText(Transformer):
 
 class CanonizeTree(InlineTransformer):
     def maybe(self, expr):
-        return T('expr', [expr, Token('OP', '?', -1)])
+        return ST('expr', [expr, Token('OP', '?', -1)])
 
     def tokenmods(self, *args):
         if len(args) == 1:

@@ -353,7 +353,7 @@ def _literal_to_pattern(literal):
 
 class PrepareLiterals(InlineTransformer):
     def literal(self, literal):
-        return T('pattern', [_literal_to_pattern(literal)])
+        return ST('pattern', [_literal_to_pattern(literal)])
 
     def range(self, start, end):
         assert start.type == end.type == 'STRING'

@@ -361,13 +361,13 @@ class PrepareLiterals(InlineTransformer):
         end = end.value[1:-1]
         assert len(start) == len(end) == 1, (start, end, len(start), len(end))
         regexp = '[%s-%s]' % (start, end)
-        return T('pattern', [PatternRE(regexp)])
+        return ST('pattern', [PatternRE(regexp)])
 
 class SplitLiterals(InlineTransformer):
     def pattern(self, p):
         if isinstance(p, PatternStr) and len(p.value)>1:
-            return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
-        return T('pattern', [p])
+            return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
+        return ST('pattern', [p])
 
 class TokenTreeToPattern(Transformer):
     def pattern(self, ps):

@@ -375,6 +375,7 @@ class TokenTreeToPattern(Transformer):
         return p
 
     def expansion(self, items):
+        assert items
         if len(items) == 1:
             return items[0]
         if len({i.flags for i in items}) > 1:

@@ -402,18 +403,20 @@
         assert len(args) == 2
         return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
 
+    def alias(self, t):
+        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
+
 def _interleave(l, item):
     for e in l:
         yield e
-        if isinstance(e, T):
+        if isinstance(e, Tree):
             if e.data in ('literal', 'range'):
                 yield item
         elif is_terminal(e):
             yield item
 
 def _choice_of_rules(rules):
-    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
+    return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
 
 class Grammar:
     def __init__(self, rule_defs, token_defs, ignore):

@@ -440,9 +443,9 @@ class Grammar:
                 if r == start:
                     exp.children = [expr] + exp.children
             for exp in tree.find_data('expr'):
-                exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
+                exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr)))
 
-            _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
+            _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
             rule_defs.append(('__ignore', _ignore_tree, None))
 
         # Convert all tokens to rules

@@ -455,6 +458,9 @@ class Grammar:
                     exp.children[i] = Token(sym.type, new_terminal_names[sym])
 
         for name, (tree, priority) in term_defs: # TODO transfer priority to rule?
+            if any(tree.find_data('alias')):
+                raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
+
             if name.startswith('_'):
                 options = RuleOptions(filter_out=True, priority=-priority)
             else:

@@ -481,6 +487,11 @@
         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
+        for name, (token_tree, priority) in token_defs:
+            for t in token_tree.find_data('expansion'):
+                if not t.children:
+                    raise GrammarError("Tokens cannot be empty (%s)" % name)
+
         tokens = [TokenDef(name, transformer.transform(token_tree), priority)
                   for name, (token_tree, priority) in token_defs]

@@ -516,7 +527,7 @@
         for expansion, alias in expansions:
             if alias and name.startswith('_'):
-                raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
+                raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
 
             rule = Rule(name, expansion, alias, options)
             compiled_rules.append(rule)

@@ -579,7 +590,7 @@ class GrammarLoader:
         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
-        callback = ParseTreeBuilder(rules, T).create_callback()
+        callback = ParseTreeBuilder(rules, ST).create_callback()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')

@@ -595,14 +606,22 @@
         except UnexpectedInput as e:
             raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
         except UnexpectedToken as e:
-            if e.expected == ['_COLON']:
-                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
-            elif e.expected == ['RULE']:
-                raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column))
+            context = e.get_context(grammar_text)
+            error = e.match_examples(self.parser.parse, {
+                'Unclosed parenthesis': ['a: (\n'],
+                'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
+                'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
+                'Alias expects lowercase name': ['a: -> "a"\n'],
+                'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
+                'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
+                'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
+                '%import expects a name': ['%import "a"\n'],
+                '%ignore expects a value': ['%ignore %import\n'],
+            })
+            if error:
+                raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
             elif 'STRING' in e.expected:
-                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
-            elif e.expected == ['_OR']:
-                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
+                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
             raise
 
         # Extract grammar items


lark/parse_tree_builder.py (+18 -4)

@@ -57,6 +57,19 @@ class ChildFilter:
         self.node_builder = node_builder
         self.to_include = to_include
 
+    def __call__(self, children):
+        filtered = []
+        for i, to_expand in self.to_include:
+            if to_expand:
+                filtered += children[i].children
+            else:
+                filtered.append(children[i])
+
+        return self.node_builder(filtered)
+
+class ChildFilterLALR(ChildFilter):
+    "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
+
     def __call__(self, children):
         filtered = []
         for i, to_expand in self.to_include:

@@ -73,21 +86,22 @@
 def _should_expand(sym):
     return not is_terminal(sym) and sym.startswith('_')
 
-def maybe_create_child_filter(expansion, filter_out):
+def maybe_create_child_filter(expansion, filter_out, ambiguous):
     to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out]
 
     if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
-        return partial(ChildFilter, to_include)
+        return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
 
 
 class Callback(object):
     pass
 
 class ParseTreeBuilder:
-    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False):
+    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
         self.tree_class = tree_class
         self.propagate_positions = propagate_positions
         self.always_keep_all_tokens = keep_all_tokens
+        self.ambiguous = ambiguous
 
         self.rule_builders = list(self._init_builders(rules))

@@ -107,7 +121,7 @@ class ParseTreeBuilder:
             wrapper_chain = filter(None, [
                 create_token and partial(CreateToken, create_token),
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
-                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out),
+                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous),
                 self.propagate_positions and PropagatePositions,
             ])
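
For context, a child filter splices "inlined" rules (those whose names start with an underscore) into their parent's children and drops filtered-out symbols. A standalone toy illustration of the idea (not lark's actual classes):

    class FakeTree:
        def __init__(self, data, children):
            self.data, self.children = data, children

    def child_filter(to_include, node_builder, children):
        filtered = []
        for i, to_expand in to_include:
            if to_expand:
                filtered += children[i].children   # splice the inlined rule's children
            else:
                filtered.append(children[i])
        return node_builder(filtered)

    kids = ['a', FakeTree('_inlined', ['b', 'c']), 'd']
    assert child_filter([(0, False), (1, True), (2, False)], list, kids) == ['a', 'b', 'c', 'd']

The split into ChildFilter and ChildFilterLALR exists because, as the docstring notes, only LALR may assume the parse tree contains no duplicated subtrees; an ambiguous Earley forest can share children between alternatives, so the general path must not modify them in place.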




lark/parser_frontends.py (+2 -2)

@@ -15,9 +15,9 @@ class WithLexer:
 
     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
-        d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
+        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
+        self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
 
     def lex(self, text):
         stream = self.lexer.lex(text)


lark/parsers/earley.py (+3 -3)

@@ -145,16 +145,16 @@ class Column:
 
 class Parser:
     def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
-        self.analysis = GrammarAnalyzer(parser_conf)
+        analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
 
-        self.FIRST = self.analysis.FIRST
+        self.FIRST = analysis.FIRST
         self.postprocess = {}
         self.predictions = {}
         for rule in parser_conf.rules:
             self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
-            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
 
         self.term_matcher = term_matcher




lark/parsers/grammar_analysis.py (+4 -1)

@@ -5,6 +5,8 @@ from ..grammar import Rule
 
 
 class RulePtr(object):
+    __slots__ = ('rule', 'index')
+
     def __init__(self, rule, index):
         assert isinstance(rule, Rule)
         assert index <= len(rule.expansion)

@@ -134,7 +136,8 @@ class GrammarAnalyzer(object):
             if not is_terminal(new_r):
                 yield new_r
 
-        _ = list(bfs([rule], _expand_rule))
+        for _ in bfs([rule], _expand_rule):
+            pass
 
         return fzset(init_ptrs)
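
The bfs change runs the traversal purely for its side effects (judging from the surrounding code, _expand_rule populates init_ptrs as it yields), so building a throwaway list gains nothing. An equivalent drain idiom, for illustration only:

    from collections import deque

    def drain(iterable):
        # Consume an iterator for its side effects without storing the results
        deque(iterable, maxlen=0)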




lark/parsers/lalr_parser.py (+3 -4)

@@ -2,7 +2,6 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-
 from ..common import UnexpectedToken
 
 from .lalr_analysis import LALR_Analyzer, Shift

@@ -11,11 +10,12 @@ class Parser:
     def __init__(self, parser_conf):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
-        self.analysis = analysis = LALR_Analyzer(parser_conf)
+        analysis = LALR_Analyzer(parser_conf)
         analysis.compute_lookahead()
         callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
                      for rule in parser_conf.rules}
 
+        self._parse_table = analysis.parse_table
         self.parser_conf = parser_conf
         self.parser = _Parser(analysis.parse_table, callbacks)
         self.parse = self.parser.parse

@@ -46,8 +46,7 @@
                 return states[state][key]
             except KeyError:
                 expected = states[state].keys()
-
-                raise UnexpectedToken(token, expected, seq, i)
+                raise UnexpectedToken(token, expected, seq, i, state=state)
 
         def reduce(rule):
             size = len(rule.expansion)


lark/parsers/resolve_ambig.py (+1 -5)

@@ -9,11 +9,7 @@ from ..tree import Tree, Visitor_NoRecurse
 # Author: Erez Sh
 
 def _compare_rules(rule1, rule2):
-    c = -compare( len(rule1.expansion), len(rule2.expansion))
-    if rule1.origin.startswith('__'):   # XXX hack! We should set priority in parser, not here
-        c = -c
-    return c
-
+    return -compare( len(rule1.expansion), len(rule2.expansion))
 
 def _sum_priority(tree):
     p = 0


lark/tools/standalone.py (+1 -1)

@@ -126,7 +126,7 @@ def _get_token_type(token_type):
 
 class ParserAtoms:
     def __init__(self, parser):
-        self.parse_table = parser.analysis.parse_table
+        self.parse_table = parser._parse_table
 
     def print_python(self):
         print('class ParseTable: pass')


lark/tree.py (+26 -0)

@@ -99,6 +99,8 @@ class Tree(object):
         self.data = data
         self.children = children
 
+class SlottedTree(Tree):
+    __slots__ = 'data', 'children', 'rule'
+
 
 ###{standalone

@@ -172,6 +174,30 @@ class Visitor_NoRecurse(Visitor):
         return tree
 
 
+from functools import wraps
+def visit_children_decor(func):
+    @wraps(func)
+    def inner(cls, tree):
+        values = cls.visit_children(tree)
+        return func(cls, values)
+    return inner
+
+class Interpreter(object):
+
+    def visit(self, tree):
+        return getattr(self, tree.data)(tree)
+
+    def visit_children(self, tree):
+        return [self.visit(child) if isinstance(child, Tree) else child
+                for child in tree.children]
+
+    def __getattr__(self, name):
+        return self.__default__
+
+    def __default__(self, tree):
+        return self.visit_children(tree)
+
+
 class Transformer_NoRecurse(Transformer):
     def transform(self, tree):
         subtrees = list(tree.iter_subtrees())
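
Unlike Transformer and Visitor, the new Interpreter does not recurse on its own: each handler receives the raw subtree, and branches are visited only when it calls visit_children (or is wrapped with visit_children_decor). A small sketch with a hypothetical subclass:

    from lark.tree import Tree, Interpreter

    class CountLeaves(Interpreter):
        def __default__(self, tree):
            # visit_children passes non-Tree children through unchanged
            return sum(c if isinstance(c, int) else 1 for c in self.visit_children(tree))

    t = Tree('root', [Tree('x', ['a', 'b']), 'c'])
    assert CountLeaves().visit(t) == 3   # three leaves: 'a', 'b', 'c'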


tests/test_parser.py (+65 -9)

@@ -187,17 +187,22 @@ def _make_full_earley_test(LEXER):
             l.parse(program)
 
 
-        def test_earley_scanless3(self):
-            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
+        # XXX Fails for scanless mode
+        # XXX Decided not to fix, because
+        #     a) It's a subtle bug
+        #     b) Scanless is intended for deprecation
+        #
+        # def test_earley_scanless3(self):
+        #     "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
 
-            grammar = """
-            start: A A
-            A: "a"+
-            """
+        #     grammar = """
+        #     start: A A
+        #     A: "a"+
+        #     """
 
-            l = Lark(grammar, parser='earley', lexer=LEXER)
-            res = l.parse("aaa")
-            self.assertEqual(res.children, ['aa', 'a'])
+        #     l = Lark(grammar, parser='earley', lexer=LEXER)
+        #     res = l.parse("aaa")
+        #     self.assertEqual(res.children, ['aa', 'a'])
 
         def test_earley_scanless4(self):
             grammar = """

@@ -293,6 +298,39 @@ def _make_full_earley_test(LEXER):
 
             self.assertEqual(res, expected)
 
+        def test_explicit_ambiguity2(self):
+            grammar = r"""
+            start: NAME+
+            NAME: /\w+/
+            %ignore " "
+            """
+            text = """cat"""
+
+            parser = Lark(grammar, start='start', ambiguity='explicit')
+            tree = parser.parse(text)
+            self.assertEqual(tree.data, '_ambig')
+
+            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
+            self.assertEqual(combinations, {
+                ('cat',),
+                ('ca', 't'),
+                ('c', 'at'),
+                ('c', 'a', 't')
+            })
+
+        def test_term_ambig_resolve(self):
+            grammar = r"""
+            !start: NAME+
+            NAME: /\w+/
+            %ignore " "
+            """
+            text = """foo bar"""
+
+            parser = Lark(grammar)
+            tree = parser.parse(text)
+            self.assertEqual(tree.children, ['foo', 'bar'])
+

@@ -822,6 +860,12 @@ def _make_parser_test(LEXER, PARSER):
             """
             self.assertRaises( GrammarError, _Lark, g)
 
+        def test_alias_in_terminal(self):
+            g = """start: TERM
+                TERM: "a" -> alias
+                """
+            self.assertRaises( GrammarError, _Lark, g)
+
         @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
         def test_line_and_column(self):
             g = r"""!start: "A" bc "D"

@@ -1129,6 +1173,18 @@ def _make_parser_test(LEXER, PARSER):
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
 
+        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
+        def test_priority_vs_embedded(self):
+            g = """
+            A.2: "a"
+            WORD: ("a".."z")+
+
+            start: (A | WORD)+
+            """
+            l = _Lark(g)
+            t = l.parse('abc')
+            self.assertEqual(t.children, ['a', 'bc'])
+            self.assertEqual(t.children[0].type, 'A')








tests/test_trees.py (+40 -1)

@@ -5,7 +5,7 @@ from unittest import TestCase
 import copy
 import pickle
 
-from lark.tree import Tree
+from lark.tree import Tree, Interpreter, visit_children_decor
 
 
 class TestTrees(TestCase):

@@ -21,6 +21,45 @@ class TestTrees(TestCase):
         assert pickle.loads(data) == s
 
+    def test_interp(self):
+        t = Tree('a', [Tree('b', []), Tree('c', []), 'd'])
+
+        class Interp1(Interpreter):
+            def a(self, tree):
+                return self.visit_children(tree) + ['e']
+
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp1().visit(t), list('BCde'))
+
+        class Interp2(Interpreter):
+            @visit_children_decor
+            def a(self, values):
+                return values + ['e']
+
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp2().visit(t), list('BCde'))
+
+        class Interp3(Interpreter):
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp3().visit(t), list('BCd'))
+
 
 if __name__ == '__main__':
     unittest.main()


