
Merge branch 'master' into true_lalr2

tag: gm/2021-09-23T00Z/github.com--lark-parser-lark/0.8.0
Erez Shinan, 5 years ago
commit d10182c253
16 changed files with 176 additions and 54 deletions:

  1. .gitignore                     (+1, -0)
  2. README.md                      (+2, -2)
  3. lark/__init__.py               (+1, -1)
  4. lark/lexer.py                  (+8, -4)
  5. lark/load_grammar.py           (+19, -11)
  6. lark/parser_frontends.py       (+5, -2)
  7. lark/parsers/earley.py         (+6, -1)
  8. lark/parsers/earley_forest.py  (+1, -1)
  9. lark/parsers/xearley.py        (+2, -2)
  10. lark/tools/serialize.py       (+39, -0)
  11. lark/tree.py                  (+25, -24)
  12. lark/utils.py                 (+2, -2)
  13. readthedocs.yml               (+10, -0)
  14. tests/__main__.py             (+1, -0)
  15. tests/test_parser.py          (+37, -4)
  16. tests/test_trees.py           (+17, -0)

.gitignore  (+1, -0)

@@ -4,6 +4,7 @@
 /lark_parser.egg-info/**
 tags
 .vscode
+.idea
 .ropeproject
 .cache
 /dist


README.md  (+2, -2)

@@ -72,7 +72,7 @@ Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like ba

 ![fruitflies.png](examples/fruitflies.png)

-See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
+See more [examples here](https://github.com/lark-parser/lark/tree/master/examples)

@@ -95,7 +95,7 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
 - Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark)
 - And much more!

-See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/Features)
+See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)


 ### Comparison to other libraries


lark/__init__.py  (+1, -1)

@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
 from .lexer import Token
 from .lark import Lark

-__version__ = "0.7.2"
+__version__ = "0.7.4"

lark/lexer.py  (+8, -4)

@@ -8,7 +8,6 @@ from .exceptions import UnexpectedCharacters, LexError
 ###{standalone

 class Pattern(Serialize):
-    __serialize_fields__ = 'value', 'flags'

     def __init__(self, value, flags=()):
         self.value = value
@@ -41,6 +40,8 @@ class Pattern(Serialize):


 class PatternStr(Pattern):
+    __serialize_fields__ = 'value', 'flags'
+
     type = "str"
     def to_regexp(self):
@@ -52,6 +53,8 @@ class PatternStr(Pattern):
         max_width = min_width

 class PatternRE(Pattern):
+    __serialize_fields__ = 'value', 'flags', '_width'
+
     type = "re"

     def to_regexp(self):
@@ -98,7 +101,7 @@ class Token(Str):

         self.type = type_
         self.pos_in_stream = pos_in_stream
-        self.value = Str(value)
+        self.value = value
         self.line = line
         self.column = column
         self.end_line = end_line
@@ -265,13 +268,14 @@ def build_mres(terminals, match_whole=False):
     return _build_mres(terminals, len(terminals), match_whole)

 def _regexp_has_newline(r):
-    """Expressions that may indicate newlines in a regexp:
+    r"""Expressions that may indicate newlines in a regexp:
     - newlines (\n)
     - escaped newline (\\n)
     - anything but ([^...])
     - any-char (.) when the flag (?s) exists
+    - spaces (\s)
     """
-    return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
+    return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)

 class Lexer(object):
     """Lexer interface

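The added `\\s` check is the substantive fix here: a terminal such as /d\s/ can consume a newline, so the lexer must recompute line counts for it (exercised by test_lexer_detect_newline_tokens in tests/test_parser.py below). A standalone sketch of the updated predicate, with the logic copied verbatim from the hunk:

    def _regexp_has_newline(r):
        r"""True if the regexp may match a newline: \n, \\n, \s, [^...], or . under (?s)."""
        return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)

    assert _regexp_has_newline(r'd\s')      # \s can match '\n' -- the newly detected case
    assert _regexp_has_newline('[^a-z]')    # a negated class may match '\n'
    assert not _regexp_has_newline('go')    # a plain literal cannot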

lark/load_grammar.py  (+19, -11)

@@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
-from .utils import classify, suppress, dedup_list
+from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken

 from .tree import Tree, SlottedTree as ST
@@ -351,7 +351,10 @@ def _fix_escaping(s):
     for n in i:
         w += n
         if n == '\\':
-            n2 = next(i)
+            try:
+                n2 = next(i)
+            except StopIteration:
+                raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)
             if n2 == '\\':
                 w += '\\\\'
             elif n2 not in 'uxnftr':
@@ -451,9 +454,9 @@ class PrepareSymbols(Transformer_InPlace):
         if isinstance(v, Tree):
             return v
         elif v.type == 'RULE':
-            return NonTerminal(v.value)
+            return NonTerminal(Str(v.value))
         elif v.type == 'TERMINAL':
-            return Terminal(v.value, filter_out=v.startswith('_'))
+            return Terminal(Str(v.value), filter_out=v.startswith('_'))
         assert False

 def _choice_of_rules(rules):
@@ -511,12 +514,12 @@ class Grammar:

         simplify_rule = SimplifyRule_Visitor()
         compiled_rules = []
-        for i, rule_content in enumerate(rules):
+        for rule_content in rules:
             name, tree, options = rule_content
             simplify_rule.visit(tree)
             expansions = rule_tree_to_text.transform(tree)

-            for expansion, alias in expansions:
+            for i, (expansion, alias) in enumerate(expansions):
                 if alias and name.startswith('_'):
                     raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
@@ -538,7 +541,7 @@ class Grammar:
         for dups in duplicates.values():
             if len(dups) > 1:
                 if dups[0].expansion:
-                    raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
+                    raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups))

                 # Empty rule; assert all other attributes are equal
                 assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)
@@ -605,7 +608,9 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
            _, tree, _ = imported_rules[symbol]
        except KeyError:
            raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
-       return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL'))
+
+       return _find_used_symbols(tree)
+

    def get_namespace_name(name):
        try:
@@ -682,6 +687,11 @@ class PrepareGrammar(Transformer_InPlace):
         return name


+def _find_used_symbols(tree):
+    assert tree.data == 'expansions'
+    return {t for x in tree.find_data('expansion')
+            for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
+
 class GrammarLoader:
     def __init__(self):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
@@ -843,9 +853,7 @@ class GrammarLoader:
            rule_names.add(name)

        for name, expansions, _o in rules:
-           used_symbols = {t for x in expansions.find_data('expansion')
-                           for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
-           for sym in used_symbols:
+           for sym in _find_used_symbols(expansions):
                if sym.type == 'TERMINAL':
                    if sym not in terminal_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))

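The reworded "Rules defined twice" error deserves a note: optionals ([] and ?) are expanded into multiple alternatives behind the scenes, so a rule can collide with itself without any visible duplication in the source grammar. A hedged illustration (this exact grammar is mine, not from the commit; it may trigger the error because `"x"?` expands into an "x" branch and an empty branch):

    from lark import Lark

    try:
        Lark('start: "x"? | "x"')   # the expanded "x" branch collides with the explicit one
    except Exception as e:
        print(e)   # Rules defined twice: ... (Might happen due to colliding expansion of optionals: [] or ?)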

lark/parser_frontends.py  (+5, -2)

@@ -118,7 +118,7 @@ class LALR_ContextualLexer(LALR_WithLexer):

 class LALR_CustomLexer(LALR_WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(self.lexer_conf)
+        self.lexer = lexer_cls(lexer_conf)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -139,7 +139,8 @@ class Earley(WithLexer):
         self.init_traditional_lexer()

         resolve_ambiguity = options.ambiguity == 'resolve'
-        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
+        debug = options.debug if options else False
+        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug)

     def match(self, term, token):
         return term.name == token.type
@@ -152,10 +153,12 @@ class XEarley(_ParserFrontend):

         self._prepare_match(lexer_conf)
         resolve_ambiguity = options.ambiguity == 'resolve'
+        debug = options.debug if options else False
         self.parser = xearley.Parser(parser_conf,
                                      self.match,
                                      ignore=lexer_conf.ignore,
                                      resolve_ambiguity=resolve_ambiguity,
+                                     debug=debug,
                                      **kw
                                      )

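The one-character fix in LALR_CustomLexer (lexer_conf instead of self.lexer_conf, which WithLexer.__init__ has not set yet at that point) is what makes passing a lexer class usable at all. A usage sketch, mirroring the pass-through CustomLexer added to tests/test_parser.py below (the class name here is hypothetical):

    from lark import Lark
    from lark.lexer import Lexer, TraditionalLexer

    class PassThroughLexer(Lexer):    # same shape as the test suite's CustomLexer
        def __init__(self, lexer_conf):
            self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore,
                                          user_callbacks=lexer_conf.callbacks)
        def lex(self, *args, **kwargs):
            return self.lexer.lex(*args, **kwargs)

    parser = Lark('start: "a"+', parser='lalr', lexer=PassThroughLexer)
    print(parser.parse('aaa'))   # Tree(start, []) -- anonymous string tokens are filtered out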


lark/parsers/earley.py  (+6, -1)

@@ -20,10 +20,11 @@ from .earley_common import Item, TransitiveItem
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor

 class Parser:
-    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True):
+    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False):
         analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
+        self.debug = debug

         self.FIRST = analysis.FIRST
         self.NULLABLE = analysis.NULLABLE
@@ -296,6 +297,10 @@ class Parser:
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
         solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
+        if self.debug:
+            from .earley_forest import ForestToPyDotVisitor
+            debug_walker = ForestToPyDotVisitor()
+            debug_walker.visit(solutions[0], "sppf.png")

         if not solutions:
             expected_tokens = [t.expect for t in to_scan]

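With this wiring plus the frontend changes above, debug=True on the Lark constructor now reaches the Earley engine, which renders the first solution's SPPF to "sppf.png" via the pydot-based forest visitor. A hedged usage sketch (assumes the optional pydot dependency is installed):

    from lark import Lark

    parser = Lark('start: "a"+', parser='earley', debug=True)
    parser.parse('aaa')   # side effect: writes sppf.png to the working directory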

lark/parsers/earley_forest.py  (+1, -1)

@@ -122,7 +122,7 @@ class PackedNode(ForestNode):
         ambiguously. Hence, we use the sort order to identify
         the order in which ambiguous children should be considered.
         """
-        return self.is_empty, -self.priority, -self.rule.order
+        return self.is_empty, -self.priority, self.rule.order

     def __iter__(self):
         return iter([self.left, self.right])

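The sign flip means packed nodes now sort by ascending rule order after the emptiness and priority comparisons, so earlier-defined alternatives are considered first, as the docstring describes. A tiny illustration of how the tuple key sorts:

    # (is_empty, -priority, rule.order) -- lexicographically smaller tuples sort first:
    keys = [(False, 0, 2), (False, 0, 0), (True, 0, 1)]
    print(sorted(keys))   # [(False, 0, 0), (False, 0, 2), (True, 0, 1)]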

lark/parsers/xearley.py  (+2, -2)

@@ -24,8 +24,8 @@ from .earley_forest import SymbolNode


 class Parser(BaseParser):
-    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False):
-        BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity)
+    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False):
+        BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug)
         self.ignore = [Terminal(t) for t in ignore]
         self.complete_lex = complete_lex



lark/tools/serialize.py  (+39, -0, new file)

@@ -0,0 +1,39 @@
import codecs
import sys
import json

from lark import Lark
from lark.grammar import RuleOptions, Rule
from lark.lexer import TerminalDef

import argparse

argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #description='''Lark Serialization Tool -- Stores Lark's internal state & LALR analysis as a convenient JSON file''')

argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file')
argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)')
argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")', nargs='+')
argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")')


def serialize(infile, outfile, lexer, start):
    lark_inst = Lark(infile, parser="lalr", lexer=lexer, start=start)    # TODO contextual

    data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
    outfile.write('{\n')
    outfile.write('  "data": %s,\n' % json.dumps(data))
    outfile.write('  "memo": %s\n' % json.dumps(memo))
    outfile.write('}\n')


def main():
    if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv:
        print("Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file")
        print("")
        argparser.print_help()
    else:
        args = argparser.parse_args()
        serialize(args.grammar_file, args.out, args.lexer, args.start)

if __name__ == '__main__':
    main()

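A hedged usage sketch for the new tool (the grammar filename is illustrative). The output is a JSON object with exactly the two keys serialize() writes:

    # $ python -m lark.tools.serialize my_grammar.lark -o parser.json
    import json

    with open('parser.json') as f:
        payload = json.load(f)
    print(sorted(payload))   # ['data', 'memo']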
lark/tree.py  (+25, -24)

@@ -56,30 +56,6 @@ class Tree(object):

     def __hash__(self):
         return hash((self.data, tuple(self.children)))
-###}
-
-    def expand_kids_by_index(self, *indices):
-        "Expand (inline) children at the given indices"
-        for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
-            kid = self.children[i]
-            self.children[i:i+1] = kid.children
-
-    def find_pred(self, pred):
-        "Find all nodes where pred(tree) == True"
-        return filter(pred, self.iter_subtrees())
-
-    def find_data(self, data):
-        "Find all nodes where tree.data == data"
-        return self.find_pred(lambda t: t.data == data)
-
-    def scan_values(self, pred):
-        for c in self.children:
-            if isinstance(c, Tree):
-                for t in c.scan_values(pred):
-                    yield t
-            else:
-                if pred(c):
-                    yield c

     def iter_subtrees(self):
         # TODO: Re-write as a more efficient version
@@ -102,6 +78,31 @@ class Tree(object):
                 yield x
                 seen.add(id(x))

+    def find_pred(self, pred):
+        "Find all nodes where pred(tree) == True"
+        return filter(pred, self.iter_subtrees())
+
+    def find_data(self, data):
+        "Find all nodes where tree.data == data"
+        return self.find_pred(lambda t: t.data == data)
+
+###}
+
+    def expand_kids_by_index(self, *indices):
+        "Expand (inline) children at the given indices"
+        for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
+            kid = self.children[i]
+            self.children[i:i+1] = kid.children
+
+    def scan_values(self, pred):
+        for c in self.children:
+            if isinstance(c, Tree):
+                for t in c.scan_values(pred):
+                    yield t
+            else:
+                if pred(c):
+                    yield c
+
     def iter_subtrees_topdown(self):
         stack = [self]
         while stack:

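No behaviour changes here: find_pred and find_data move inside the ###{standalone block (presumably so generated standalone parsers ship them), while expand_kids_by_index and scan_values move out of it. A quick sketch of the relocated helpers:

    from lark import Tree, Token

    t = Tree('start', [Tree('a', [Token('WORD', 'x')]), Tree('b', [Token('WORD', 'y')])])
    assert [s.data for s in t.find_data('a')] == ['a']        # now inside the standalone block
    assert list(t.scan_values(lambda v: v == 'y')) == ['y']   # now outside it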

lark/utils.py  (+2, -2)

@@ -160,7 +160,7 @@ def smart_decorator(f, create_decorator):

     elif isinstance(f, partial):
         # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
-        return create_decorator(f.__func__, True)
+        return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))

     else:
         return create_decorator(f.__func__.__call__, True)
@@ -172,7 +172,7 @@ import sre_parse
 import sre_constants
 def get_regexp_width(regexp):
     try:
-        return sre_parse.parse(regexp).getwidth()
+        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
     except sre_constants.error:
         raise ValueError(regexp)

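The old partial branch assumed f.__func__, which functools.partial objects don't have. The new code wraps the partial's underlying function and discards the bound self that smart_decorator's callers prepend (this is what enables the test_partial case in tests/test_trees.py below). A minimal sketch of the mechanism; create_decorator here is a stand-in, not the real factory:

    from functools import partial, wraps

    def create_decorator(f, with_self):          # stand-in for the real factory
        def g(*args, **kw):
            return f(*args, **kw)
        return g

    p = partial(lambda s: s + '!')
    wrapped = wraps(p.func)(create_decorator(lambda *args, **kw: p(*args[1:], **kw), True))
    print(wrapped(object(), 'test'))   # 'test!' -- args[0] (the would-be self) is dropped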

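getwidth() returns a (min, max) pair; converting it to a list of plain ints presumably keeps the value round-trippable through the new JSON serialization. For example:

    import sre_parse

    print([int(x) for x in sre_parse.parse(r'a\d?').getwidth()])   # [1, 2]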

readthedocs.yml  (+10, -0, new file)

@@ -0,0 +1,10 @@
version: 2

mkdocs:
  configuration: mkdocs.yml
  fail_on_warning: false

formats: all

python:
  version: 3.5

tests/__main__.py  (+1, -0)

@@ -21,6 +21,7 @@ from .test_parser import (
         TestCykStandard,
         TestLalrContextual,
         TestEarleyDynamic,
+        TestLalrCustom,

         # TestFullEarleyStandard,
         TestFullEarleyDynamic,


tests/test_parser.py  (+37, -4)

@@ -22,7 +22,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
 from lark.tree import Tree
 from lark.visitors import Transformer, Transformer_InPlace, v_args
 from lark.grammar import Rule
-from lark.lexer import TerminalDef
+from lark.lexer import TerminalDef, Lexer, TraditionalLexer

 __path__ = os.path.dirname(__file__)
 def _read(n, *args):
@@ -431,12 +431,22 @@ def _make_full_earley_test(LEXER):
     _TestFullEarley.__name__ = _NAME
     globals()[_NAME] = _TestFullEarley

+class CustomLexer(Lexer):
+    """
+    Purpose of this custom lexer is to test the integration,
+    so it uses the traditionalparser as implementation without custom lexing behaviour.
+    """
+    def __init__(self, lexer_conf):
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+    def lex(self, *args, **kwargs):
+        return self.lexer.lex(*args, **kwargs)
+
 def _make_parser_test(LEXER, PARSER):
+    lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
     def _Lark(grammar, **kwargs):
-        return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
+        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
     def _Lark_open(gfilename, **kwargs):
-        return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
+        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
 class _TestParser(unittest.TestCase):
     def test_basic1(self):
         g = _Lark("""start: a+ b a* "b" a*
@@ -1532,7 +1542,7 @@ def _make_parser_test(LEXER, PARSER):
         parser = _Lark(grammar)


-    @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")
+    @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
     def test_serialize(self):
         grammar = """
         start: _ANY b "C"
@@ -1558,6 +1568,28 @@ def _make_parser_test(LEXER, PARSER):
         self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
         self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))

+    def test_lexer_detect_newline_tokens(self):
+        # Detect newlines in regular tokens
+        g = _Lark(r"""start: "go" tail*
+        !tail : SA "@" | SB "@" | SC "@" | SD "@"
+        SA : "a" /\n/
+        SB : /b./s
+        SC : "c" /[^a-z]/
+        SD : "d" /\s/
+        """)
+        a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
+        self.assertEqual(a.line, 2)
+        self.assertEqual(b.line, 3)
+        self.assertEqual(c.line, 4)
+        self.assertEqual(d.line, 5)
+
+        # Detect newlines in ignored tokens
+        for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
+            g = _Lark('''!start: "a" "a"
+                         %ignore {}'''.format(re))
+            a, b = g.parse('a\na').children
+            self.assertEqual(a.line, 1)
+            self.assertEqual(b.line, 2)
+

 _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
@@ -1572,6 +1604,7 @@ _TO_TEST = [
     ('dynamic_complete', 'earley'),
     ('standard', 'lalr'),
     ('contextual', 'lalr'),
+    ('custom', 'lalr'),
     # (None, 'earley'),
 ]



tests/test_trees.py  (+17, -0)

@@ -4,6 +4,7 @@ import unittest
 from unittest import TestCase
 import copy
 import pickle
+import functools

 from lark.tree import Tree
 from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard
@@ -146,6 +147,22 @@ class TestTrees(TestCase):
         res = T().transform(t)
         self.assertEqual(res, 2.9)

+    def test_partial(self):
+
+        tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])])
+
+        def test(prefix, s, postfix):
+            return prefix + s.upper() + postfix
+
+        @v_args(inline=True)
+        class T(Transformer):
+            a = functools.partial(test, "@", postfix="!")
+            b = functools.partial(lambda s: s + "!")
+
+        res = T().transform(tree)
+        assert res.children == ["@TEST1!", "test2!"]
+
+
     def test_discard(self):
         class MyTransformer(Transformer):
             def a(self, args):

