
Merge branch 'master' into dyn_earley

Erez Shinan, 7 years ago · parent commit 2d0f2af0aa
15 changed files with 262 additions and 51 deletions
  1. README.md (+19 / -2)
  2. examples/python3.g (+5 / -4)
  3. lark/__init__.py (+1 / -1)
  4. lark/common.py (+14 / -5)
  5. lark/lark.py (+8 / -4)
  6. lark/lexer.py (+15 / -7)
  7. lark/load_grammar.py (+16 / -6)
  8. lark/parse_tree_builder.py (+29 / -3)
  9. lark/parser_frontends.py (+2 / -2)
  10. lark/parsers/earley.py (+39 / -6)
  11. lark/parsers/grammar_analysis.py (+6 / -5)
  12. lark/parsers/lalr_parser.py (+4 / -3)
  13. lark/tree.py (+8 / -1)
  14. lark/utils.py (+28 / -2)
  15. tests/test_parser.py (+68 / -0)

README.md (+19 / -2)

@@ -125,7 +125,8 @@ Lark has no dependencies.
 
 ### Projects using Lark
 
-- [mappyfile](https://github.com/geographika/mappyfile) - A pure Python MapFile parser for working with MapServer
+- [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration
+- [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer
 
 Using Lark? Send me a message and I'll add your project!
 
@@ -251,6 +252,22 @@ Lark offers both Earley and LALR(1), which means you can choose between the most
 
 Lark uses the [MIT license](LICENSE).
 
+## Contribute
+
+Lark is currently accepting pull-requests.
+
+There are many ways you can help the project:
+
+* Improve the performance of Lark's parsing algorithm
+* Implement macros for grammars (important for grammar composition)
+* Write new grammars for Lark's library
+* Write & improve the documentation
+* Write a blog post introducing Lark to your audience
+
+If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process.
+
 ## Contact
 
-If you have any questions or want to contribute, you can email me at erezshin at gmail com.
+If you have any questions or want my assistance, you can email me at erezshin at gmail com.
+
+I'm also available for contract work.

examples/python3.g (+5 / -4)

@@ -116,11 +116,12 @@ AWAIT: "await"
 | atom_expr "." NAME -> getattr
 | atom
 
-?atom: "(" [yield_expr|testlist_comp] ")"
-     | "[" [testlist_comp] "]"
-     | "{" [dictorsetmaker] "}"
+?atom: "(" [yield_expr|testlist_comp] ")" -> tuple
+     | "[" [testlist_comp] "]" -> list
+     | "{" [dictorsetmaker] "}" -> dict
      | NAME -> var
-     | number | string+ | "..."
+     | number | string+
+     | "..." -> ellipsis
      | "None" -> const_none
      | "True" -> const_true
      | "False" -> const_false

lark/__init__.py (+1 / -1)

@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError
 from .lark import Lark
 from .utils import inline_args
 
-__version__ = "0.2.7"
+__version__ = "0.2.10"

lark/common.py (+14 / -5)

@@ -1,4 +1,5 @@
 import re
+import sre_parse
 
 class GrammarError(Exception):
     pass
@@ -40,7 +41,7 @@ class LexerConf:
 
 class ParserConf:
     def __init__(self, rules, callback, start):
-        assert all(len(r)==3 for r in rules)
+        assert all(len(r) == 4 for r in rules)
         self.rules = rules
         self.callback = callback
         self.start = start
@@ -57,9 +58,9 @@ class Pattern(object):
 
     # Pattern Hashing assumes all subclasses have a different priority!
     def __hash__(self):
-        return hash((self.priority, self.value))
+        return hash((type(self), self.value))
     def __eq__(self, other):
-        return self.priority == other.priority and self.value == other.value
+        return type(self) == type(other) and self.value == other.value
 
     def _get_flags(self):
         if self.flags:
@@ -71,13 +72,21 @@ class PatternStr(Pattern):
     def to_regexp(self):
         return self._get_flags() + re.escape(self.value)
 
-    priority = 0
+    @property
+    def min_width(self):
+        return len(self.value)
+    max_width = min_width
 
 class PatternRE(Pattern):
     def to_regexp(self):
         return self._get_flags() + self.value
 
-    priority = 1
+    @property
+    def min_width(self):
+        return sre_parse.parse(self.to_regexp()).getwidth()[0]
+    @property
+    def max_width(self):
+        return sre_parse.parse(self.to_regexp()).getwidth()[1]
 
 class TokenDef(object):
     def __init__(self, name, pattern):
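Background for the new `min_width`/`max_width` properties above: the standard library's `sre_parse.parse(...).getwidth()` returns the minimum and maximum number of characters a regexp can match, which is what `PatternRE` now exposes and what the lexer uses to reject zero-width tokens and to sort tokens. A standalone sketch, not part of the commit:

```python
import sre_parse

def regexp_width(regexp):
    # (min, max) number of characters the pattern can match -- what
    # PatternRE.min_width / max_width compute from to_regexp()
    return sre_parse.parse(regexp).getwidth()

print(regexp_width(r'a|bc'))    # (1, 2)
lo, _hi = regexp_width(r'\d+')
print(lo)                       # 1 -- lo == 0 would be a zero-width token, which the lexer rejects
```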


lark/lark.py (+8 / -4)

@@ -39,6 +39,7 @@ class LarkOptions(object):
         postlex - Lexer post-processing (Default: None)
         start - The start symbol (Default: start)
         profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False)
+        propagate_positions - Experimental. Don't use yet.
     """
     __doc__ += OPTIONS_DOC
     def __init__(self, options_dict):
@@ -55,14 +56,13 @@ class LarkOptions(object):
         self.start = o.pop('start', 'start')
         self.profile = o.pop('profile', False)
         self.ambiguity = o.pop('ambiguity', 'auto')
+        self.propagate_positions = o.pop('propagate_positions', False)
 
         assert self.parser in ('earley', 'lalr', None)
 
         if self.parser == 'earley' and self.transformer:
             raise ValueError('Cannot specify an auto-transformer when using the Earley algorithm.'
                              'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)')
-        if self.keep_all_tokens:
-            raise NotImplementedError("keep_all_tokens: Not implemented yet!")
 
         if o:
             raise ValueError("Unknown options: %s" % o.keys())
@@ -119,7 +119,7 @@ class Lark:
 
         assert isinstance(grammar, STRING_TYPE)
 
-        if self.options.cache_grammar or self.options.keep_all_tokens:
+        if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")
 
         assert not self.options.profile, "Feature temporarily disabled"
@@ -142,8 +142,12 @@ class Lark:
         assert self.options.parser == 'earley'
         assert self.options.ambiguity in ('resolve', 'explicit', 'auto')
 
+        # Parse the grammar file and compose the grammars (TODO)
         self.grammar = load_grammar(grammar, source)
+
+        # Compile the EBNF grammar into BNF
         tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start)
+
         self.ignore_tokens = self.grammar.extra['ignore']
 
         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
@@ -162,7 +166,7 @@ class Lark:
 
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
-        self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
+        self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
        rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
        if self.profiler:
            for f in dir(callback):


lark/lexer.py (+15 / -7)

@@ -1,7 +1,6 @@
 ## Lexer Implementation
 
 import re
-import sre_parse
 
 from .utils import Str, classify
 from .common import is_terminal, PatternStr, PatternRE, TokenDef
@@ -120,8 +119,7 @@ class Lexer(object):
             except:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
 
-            width = sre_parse.parse(t.pattern.to_regexp()).getwidth()
-            if width[0] == 0:
+            if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
 
         token_names = {t.name for t in tokens}
@@ -133,7 +131,7 @@ class Lexer(object):
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]
 
-        tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True)
+        tokens.sort(key=lambda x:x.pattern.max_width, reverse=True)
 
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
@@ -155,17 +153,27 @@ class Lexer(object):
                 if m:
                     value = m.group(0)
                     type_ = type_from_index[m.lastindex]
-                    if type_ not in ignore_types:
+                    to_yield = type_ not in ignore_types
+
+                    if to_yield:
                         t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
+                        end_col = t.column + len(value)
                         if t.type in self.callback:
                             t = self.callback[t.type](t)
-                        yield t
 
                     if type_ in newline_types:
                         newlines = value.count(self.newline_char)
                         if newlines:
                             line += newlines
-                            col_start_pos = lex_pos + value.rindex(self.newline_char)
+                            last_newline_index = value.rindex(self.newline_char) + 1
+                            col_start_pos = lex_pos + last_newline_index
+                            end_col = len(value) - last_newline_index
+
+                    if to_yield:
+                        t.end_line = line
+                        t.end_col = end_col
+                        yield t
+
                    lex_pos += len(value)
                    break
                else:
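A worked example of the end-position bookkeeping added above, assuming `newline_char` is `'\n'`: for a token value that spans a newline, the end column is counted from the character after the last newline; for a single-line token it is simply `t.column + len(value)`.

```python
value = 'abc\ndef'          # a matched token value that spans two lines
newline_char = '\n'

last_newline_index = value.rindex(newline_char) + 1   # 4
end_col = len(value) - last_newline_index             # 3: the token ends 3 characters into its last line
print(last_newline_index, end_col)                    # 4 3
```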


lark/load_grammar.py (+16 / -6)

@@ -75,6 +75,7 @@ TOKENS = {
     '_TO': '->',
     '_IGNORE': r'%ignore',
     '_IMPORT': r'%import',
+    'NUMBER': '\d+',
 }
 
 RULES = {
@@ -82,7 +83,8 @@ RULES = {
     '_list': ['_item', '_list _item'],
     '_item': ['rule', 'token', 'statement', '_NL'],
 
-    'rule': ['RULE _COLON expansions _NL'],
+    'rule': ['RULE _COLON expansions _NL',
+             'RULE _DOT NUMBER _COLON expansions _NL'],
     'expansions': ['alias',
                    'expansions _OR alias',
                    'expansions _NL _OR alias'],
@@ -313,7 +315,7 @@ class PrepareLiterals(InlineTransformer):
 class SplitLiterals(InlineTransformer):
     def pattern(self, p):
         if isinstance(p, PatternStr) and len(p.value)>1:
-            return T('expansion', [T('pattern', [PatternStr(ch)]) for ch in p.value])
+            return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
         return T('pattern', [p])
 
 class TokenTreeToPattern(Transformer):
@@ -470,21 +472,29 @@ class Grammar:
 
 
 class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False):
+    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
         self.create_token = create_token  # used for scanless postprocessing
+        self.priority = priority
 
         self.filter_out = filter_out  # remove this rule from the tree
                                       # used for "token"-rules in scanless
     @classmethod
-    def from_rule(cls, name, expansions):
+    def from_rule(cls, name, *x):
+        if len(x) > 1:
+            priority, expansions = x
+            priority = int(priority)
+        else:
+            expansions ,= x
+            priority = None
+
         keep_all_tokens = name.startswith('!')
         name = name.lstrip('!')
         expand1 = name.startswith('?')
         name = name.lstrip('?')
 
-        return name, expansions, cls(keep_all_tokens, expand1)
+        return name, expansions, cls(keep_all_tokens, expand1, priority=priority)
@@ -605,7 +615,7 @@ class GrammarLoader:
             raise GrammarError("Token '%s' defined more than once" % name)
         token_names.add(name)
 
-        rules = [RuleOptions.from_rule(name, x) for name, x in rule_defs]
+        rules = [RuleOptions.from_rule(*x) for x in rule_defs]
 
        rule_names = set()
        for name, _x, _o in rules:
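The new `'RULE _DOT NUMBER _COLON expansions _NL'` production is what allows the `name.NUMBER:` syntax for declaring a rule priority, exercised by the new `test_earley_prioritization` test further down. A short illustration mirroring that test (rule names are arbitrary):

```python
from lark import Lark

# Both alternatives match the same input; the higher-priority rule (b.2) wins
# when the Earley parser resolves the ambiguity.
grammar = """
start: a | b
a.1: "a"
b.2: "a"
"""

parser = Lark(grammar, parser='earley', lexer='standard')
print(parser.parse("a").children[0].data)   # 'b'
```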


lark/parse_tree_builder.py (+29 / -3)

@@ -1,4 +1,5 @@
 from .common import is_terminal, GrammarError
+from .utils import suppress
 from .lexer import Token
 
 class Callback(object):
@@ -42,10 +43,32 @@ def create_rule_handler(expansion, usermethod, keep_all_tokens, filter_out):
     # else, if no filtering required..
     return usermethod
 
+def propagate_positions_wrapper(f):
+    def _f(args):
+        res = f(args)
+
+        if args:
+            for a in args:
+                with suppress(AttributeError):
+                    res.line = a.line
+                    res.column = a.column
+                break
+
+            for a in reversed(args):
+                with suppress(AttributeError):
+                    res.end_line = a.end_line
+                    res.end_col = a.end_col
+                break
+
+        return res
+
+    return _f
+
 
 class ParseTreeBuilder:
-    def __init__(self, tree_class):
+    def __init__(self, tree_class, propagate_positions=False, keep_all_tokens=False):
         self.tree_class = tree_class
+        self.propagate_positions = propagate_positions
+        self.always_keep_all_tokens = keep_all_tokens
 
     def _create_tree_builder_function(self, name):
         tree_class = self.tree_class
@@ -66,7 +89,7 @@ class ParseTreeBuilder:
                 filter_out.add(origin)
 
         for origin, (expansions, options) in rules.items():
-            keep_all_tokens = options.keep_all_tokens if options else False
+            keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
             expand1 = options.expand1 if options else False
             create_token = options.create_token if options else False
 
@@ -92,11 +115,14 @@ class ParseTreeBuilder:
 
                 alias_handler = create_rule_handler(expansion, f, keep_all_tokens, filter_out)
 
+                if self.propagate_positions:
+                    alias_handler = propagate_positions_wrapper(alias_handler)
+
                 callback_name = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
                 if hasattr(callback, callback_name):
                     raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
                 setattr(callback, callback_name, alias_handler)
 
-                new_rules.append(( _origin, expansion, callback_name ))
+                new_rules.append(( _origin, expansion, callback_name, options ))
 
         return new_rules, callback
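A standalone sketch of what `propagate_positions_wrapper` does, using plain stand-in objects rather than lark's own classes: the wrapped callback copies `line`/`column` from the first child that carries them and `end_line`/`end_col` from the last.

```python
from contextlib import suppress

class Node:
    pass

def make_node(args):
    # stand-in for a tree-building callback
    return Node()

def propagate_positions_wrapper(f):
    def _f(args):
        res = f(args)
        if args:
            for a in args:
                with suppress(AttributeError):
                    res.line = a.line
                    res.column = a.column
                break
            for a in reversed(args):
                with suppress(AttributeError):
                    res.end_line = a.end_line
                    res.end_col = a.end_col
                break
        return res
    return _f

first, last = Node(), Node()
first.line, first.column = 1, 0
last.end_line, last.end_col = 3, 7

node = propagate_positions_wrapper(make_node)([first, last])
print(node.line, node.column, node.end_line, node.end_col)   # 1 0 3 7
```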

lark/parser_frontends.py (+2 / -2)

@@ -131,7 +131,7 @@ class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
-        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
 
         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
         self.parser = earley.Parser(rules,
@@ -158,7 +158,7 @@ class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf)
 
-        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
+        rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]
 
         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
         self.parser = earley.Parser(rules,


lark/parsers/earley.py (+39 / -6)

@@ -29,6 +29,9 @@ class Derivation(Tree):
         Tree.__init__(self, 'drv', items or [])
         self.rule = rule
 
+    def _pretty_label(self):    # Nicer pretty for debugging the parser
+        return self.rule.origin if self.rule else self.data
+
 END_TOKEN = EndToken()
 
 class Item(object):
@@ -106,8 +109,11 @@ class Column:
                 new_tree = old_tree.copy()
                 new_tree.rule = old_tree.rule
                 old_tree.set('_ambig', [new_tree])
+                old_tree.rule = None    # No longer a 'drv' node
+
                 if item.tree.children[0] is old_tree:   # XXX a little hacky!
-                    raise ParseError("Infinite recursion in grammar!")
+                    raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule)
+
                 old_tree.children.append(item.tree)
             else:
                 self.completed[item] = item
@@ -218,7 +224,13 @@ class ApplyCallbacks(Transformer_NoRecurse):
         return Tree(rule.origin, children)
 
 def _compare_rules(rule1, rule2):
-    assert rule1.origin == rule2.origin
+    if rule1.options and rule2.options:
+        if rule1.options.priority is not None and rule2.options.priority is not None:
+            assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2)
+            return -compare(rule1.options.priority, rule2.options.priority)
+
+    if rule1.origin != rule2.origin:
+        return 0
     c = compare( len(rule1.expansion), len(rule2.expansion))
     if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
         c = -c
@@ -228,6 +240,20 @@ def _compare_drv(tree1, tree2):
     if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
         return compare(tree1, tree2)
 
+    try:
+        rule1, rule2 = tree1.rule, tree2.rule
+    except AttributeError:
+        # Probably trees that don't take part in this parse (better way to distinguish?)
+        return compare(tree1, tree2)
+
+    # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
+    # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
+    # computationally inefficient. So we handle it here.
+    if tree1.data == '_ambig':
+        _resolve_ambig(tree1)
+    if tree2.data == '_ambig':
+        _resolve_ambig(tree2)
+
     c = _compare_rules(tree1.rule, tree2.rule)
     if c:
         return c
@@ -241,12 +267,19 @@ def _compare_drv(tree1, tree2):
     return compare(len(tree1.children), len(tree2.children))
 
 
+def _resolve_ambig(tree):
+    assert tree.data == '_ambig'
+
+    best = min(tree.children, key=cmp_to_key(_compare_drv))
+    assert best.data == 'drv'
+    tree.set('drv', best.children)
+    tree.rule = best.rule   # needed for applying callbacks
+
+    assert tree.data != '_ambig'
+
 class ResolveAmbig(Visitor_NoRecurse):
     def _ambig(self, tree):
-        best = min(tree.children, key=cmp_to_key(_compare_drv))
-        assert best.data == 'drv'
-        tree.set('drv', best.children)
-        tree.rule = best.rule   # needed for applying callbacks
+        _resolve_ambig(tree)
 
 
 # RULES = [
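A note on the comparison convention used by `_compare_rules` and `_resolve_ambig` above: `compare` is a three-way comparison (-1/0/1), and ambiguity resolution picks the *minimum* derivation via `cmp_to_key`, so returning `-compare(priority1, priority2)` makes the higher-priority rule sort first. A tiny sketch with a hypothetical `compare` helper:

```python
from functools import cmp_to_key

def compare(a, b):
    # classic three-way comparison: -1, 0 or 1
    return (a > b) - (a < b)

def by_priority(p1, p2):
    return -compare(p1, p2)   # higher priority compares as "smaller", so min() selects it

print(min([1, 3, 2], key=cmp_to_key(by_priority)))   # 3
```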


lark/parsers/grammar_analysis.py (+6 / -5)

@@ -7,10 +7,11 @@ class Rule(object):
         origin : a symbol
         expansion : a list of symbols
     """
-    def __init__(self, origin, expansion, alias=None):
+    def __init__(self, origin, expansion, alias=None, options=None):
         self.origin = origin
         self.expansion = expansion
         self.alias = alias
+        self.options = options
 
     def __repr__(self):
         return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
@@ -111,12 +112,12 @@ class GrammarAnalyzer(object):
         self.debug = debug
         rule_tuples = list(rule_tuples)
         rule_tuples.append(('$root', [start_symbol, '$end']))
-        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
+        rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples]
 
         self.rules = set()
-        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
-        for origin, exp, alias in rule_tuples:
-            r = Rule( origin, exp, alias )
+        self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples}
+        for origin, exp, alias, options in rule_tuples:
+            r = Rule( origin, exp, alias, options )
             self.rules.add(r)
             self.rules_by_origin[origin].append(r)




lark/parsers/lalr_parser.py (+4 / -3)

@@ -9,6 +9,7 @@ from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT
 
 class Parser(object):
     def __init__(self, parser_conf):
+        assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
         self.analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
         self.analysis.compute_lookahead()
         self.callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
@@ -34,7 +35,7 @@ class Parser(object):
 
         raise UnexpectedToken(token, expected, seq, i)
 
-    def reduce(rule, size):
+    def reduce(rule, size, end=False):
         if size:
             s = value_stack[-size:]
             del state_stack[-size:]
@@ -44,7 +45,7 @@ class Parser(object):
 
         res = self.callbacks[rule](s)
 
-        if len(state_stack) == 1 and rule.origin == self.analysis.start_symbol:
+        if end and len(state_stack) == 1 and rule.origin == self.analysis.start_symbol:
             return res
 
         _action, new_state = get_action(rule.origin)
@@ -73,7 +74,7 @@ class Parser(object):
         while True:
             _action, rule = get_action('$end')
             assert _action == 'reduce'
-            res = reduce(*rule)
+            res = reduce(*rule, end=True)
            if res:
                assert state_stack == [self.analysis.init_state_idx] and not value_stack, len(state_stack)
                return res


lark/tree.py (+8 / -1)

@@ -10,11 +10,14 @@ class Tree(object):
     def __repr__(self):
         return 'Tree(%s, %s)' % (self.data, self.children)
 
+    def _pretty_label(self):
+        return self.data
+
     def _pretty(self, level, indent_str):
         if len(self.children) == 1 and not isinstance(self.children[0], Tree):
             return [ indent_str*level, self.data, '\t', '%s' % self.children[0], '\n']
 
-        l = [ indent_str*level, self.data, '\n' ]
+        l = [ indent_str*level, self._pretty_label(), '\n' ]
         for n in self.children:
             if isinstance(n, Tree):
                 l += n._pretty(level+1, indent_str)
@@ -62,10 +65,14 @@ class Tree(object):
             yield c
 
     def iter_subtrees(self):
+        visited = set()
         q = [self]
 
         while q:
             subtree = q.pop()
+            if id(subtree) in visited:
+                continue    # already been here from another branch
+            visited.add(id(subtree))
             yield subtree
             q += [c for c in subtree.children if isinstance(c, Tree)]
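Why `iter_subtrees` now tracks visited ids: with ambiguity, derivations can share subtree objects, so the tree is really a DAG and a naive walk would yield the shared node once per parent. A minimal sketch with a toy `Tree` stand-in:

```python
class Tree:
    def __init__(self, data, children=()):
        self.data = data
        self.children = list(children)

    def iter_subtrees(self):
        visited = set()
        q = [self]
        while q:
            subtree = q.pop()
            if id(subtree) in visited:
                continue    # already reached from another branch
            visited.add(id(subtree))
            yield subtree
            q += [c for c in subtree.children if isinstance(c, Tree)]

shared = Tree('leaf')
root = Tree('root', [Tree('a', [shared]), Tree('b', [shared])])
print([t.data for t in root.iter_subtrees()])
# ['root', 'b', 'leaf', 'a'] -- the shared 'leaf' is yielded only once
```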




lark/utils.py (+28 / -2)

@@ -1,6 +1,7 @@
 import functools
 import types
 from collections import deque
+from contextlib import contextmanager
 
 class fzset(frozenset):
     def __repr__(self):
@@ -63,11 +64,17 @@ def inline_args(f):
         def _f_builtin(_self, args):
             return f(*args)
         return _f_builtin
-    else:
-        @functools.wraps(f)
+    elif isinstance(f, types.MethodType):
+        @functools.wraps(f.__func__)
         def _f(self, args):
             return f.__func__(self, *args)
         return _f
+    else:
+        @functools.wraps(f.__call__.__func__)
+        def _f(self, args):
+            return f.__call__.__func__(self, *args)
+        return _f
 
 
 try:
@@ -82,5 +89,24 @@ except NameError:
         return -1
 
 
+try:
+    from contextlib import suppress     # Python 3
+except ImportError:
+    @contextmanager
+    def suppress(*excs):
+        '''Catch and dismiss the provided exception
+
+        >>> x = 'hello'
+        >>> with suppress(IndexError):
+        ...     x = x[10]
+        >>> x
+        'hello'
+        '''
+        try:
+            yield
+        except excs:
+            pass







tests/test_parser.py (+68 / -0)

@@ -380,6 +380,20 @@ def _make_parser_test(LEXER, PARSER):
         x = g.parse('Hello HelloWorld')
         self.assertSequenceEqual(x.children, ['HelloWorld'])
 
+    def test_token_collision2(self):
+        # NOTE: This test reveals a bug in token reconstruction in Scanless Earley
+        # I probably need to re-write grammar transformation
+
+        g = _Lark("""
+                !start: "starts"
+
+                %import common.LCASE_LETTER
+                """)
+
+        x = g.parse("starts")
+        self.assertSequenceEqual(x.children, ['starts'])
+
+
     # def test_string_priority(self):
     #     g = _Lark("""start: (A | /a?bb/)+
     #                  A: "a" """)
@@ -539,6 +553,12 @@ def _make_parser_test(LEXER, PARSER):
         g.parse("+2e-9")
         self.assertRaises(ParseError, g.parse, "+2e-9e")
 
+    def test_keep_all_tokens(self):
+        l = _Lark("""start: "a"+ """, keep_all_tokens=True)
+        tree = l.parse('aaa')
+        self.assertEqual(tree.children, ['a', 'a', 'a'])
+
+
     def test_token_flags(self):
         l = _Lark("""!start: "a"i+
                   """
@@ -569,6 +589,14 @@ def _make_parser_test(LEXER, PARSER):
         tree = l.parse('AB,a')
         self.assertEqual(tree.children, ['AB'])
 
+    def test_token_flags3(self):
+        l = _Lark("""!start: ABC+
+                  ABC: "abc"i
+                  """
+                  )
+        tree = l.parse('aBcAbC')
+        self.assertEqual(tree.children, ['aBc', 'AbC'])
+
     def test_token_flags2(self):
         g = """!start: ("a"i | /a/ /b/?)+
            """
@@ -577,6 +605,46 @@ def _make_parser_test(LEXER, PARSER):
         self.assertEqual(tree.children, ['a', 'A'])
 
 
+    def test_reduce_cycle(self):
+        """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
+        It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
+        """
+
+        l = _Lark("""
+            term: A
+                | term term
+
+            A: "a"
+
+        """, start='term')
+
+        tree = l.parse("aa")
+        self.assertEqual(len(tree.children), 2)
+
+
+    @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
+    def test_earley_prioritization(self):
+        "Tests effect of priority on result"
+
+        grammar = """
+        start: a | b
+        a.1: "a"
+        b.2: "a"
+        """
+
+        l = Lark(grammar, parser='earley', lexer='standard')
+        res = l.parse("a")
+        self.assertEqual(res.children[0].data, 'b')
+
+        grammar = """
+        start: a | b
+        a.2: "a"
+        b.1: "a"
+        """
+
+        l = Lark(grammar, parser='earley', lexer='standard')
+        res = l.parse("a")
+        self.assertEqual(res.children[0].data, 'a')







