
Merge branch 'master' into js2py2

Erez Shinan, 7 years ago
commit 9a11b46afc

8 changed files with 91 additions and 76 deletions
1. README.md (+4 -4)
2. docs/json_tutorial.md (+8 -7)
3. examples/json_parser.py (+2 -2)
4. lark/__init__.py (+1 -1)
5. lark/load_grammar.py (+50 -54)
6. lark/parser_frontends.py (+2 -2)
7. lark/parsers/earley.py (+8 -6)
8. tests/test_parser.py (+16 -0)

README.md (+4 -4)

@@ -154,10 +154,10 @@ These features may be implemented some day:

| Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem
|:-----|:-------------|:------------|:----------|:---------
-| **Lark - LALR(1)** | 4.2s | 1.1s | 0.4M | 0.3M |
-| PyParsing | 32s | 4.1s | 0.4M | 0.2M |
-| funcparserlib | 11s | 1.9s | 0.5M | 0.3M |
-| Parsimonious | | 7s | | 1.4M |
+| **Lark - LALR(1)** | 4.7s | 1.2s | 70M | 134M |
+| PyParsing | 32s | 3.5s | 443M | 225M |
+| funcparserlib | 8.5s | 1.3s | 483M | 293M |
+| Parsimonious | | 5.7s | | 1545M |

Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more details on how the comparison was made.
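
The **Lark - LALR(1)** row corresponds to constructing the parser with `parser='lalr'`. A minimal sketch of that configuration, using a toy grammar in current Lark syntax (the benchmark itself parses a large JSON file with the tutorial's grammar):

```python
from lark import Lark

# Toy stand-in for the benchmarked LALR(1) configuration; the real
# measurement uses the JSON grammar from the tutorial linked above.
parser = Lark(r"""
    start: WORD "," WORD
    WORD: /[a-z]+/
    %import common.WS
    %ignore WS
""", parser='lalr')

print(parser.parse("hello, world"))
```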



docs/json_tutorial.md (+8 -7)

@@ -327,7 +327,7 @@ class TreeToJson(Transformer):
    true = lambda self, _: True
    false = lambda self, _: False

-json_parser = Lark(json_grammar, start='value')
+json_parser = Lark(json_grammar, start='value', lexer='standard')

if __name__ == '__main__':
    with open(sys.argv[1]) as f:
@@ -419,12 +419,13 @@ I measured memory consumption using a little script called [memusg](https://gist

| Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem
|:-----|:-------------|:------------|:----------|:---------
-| Lark - Earley | 36s | 4.3s | 6.2M | 1.2M |
-| Lark - LALR(1) | 7s | 1.3s | 0.6M | 0.3M |
-| Lark - LALR(1) tree-less | 4.2s | 1.1s | 0.4M | 0.3M |
-| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 4.1s | 0.4M | 0.2M |
-| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 11s | 1.9s | 0.5M | 0.3M |
-| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 7s | ? | 1.4M |
+| Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M |
+| Lark - LALR(1) | 8s | 1.53s | 453M | 266M |
+| Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M |
+| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
+| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
+| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M |


I added a few other parsers for comparison. PyParsing and funcparserlib fare pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1).
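
The three Lark rows map to three parser configurations. A sketch of how they differ, with a compact JSON grammar in current Lark syntax (the tutorial's 0.2.x grammar is spelled slightly differently):

```python
from lark import Lark

json_grammar = r"""
    ?value: dict | list | string | number
          | "true" | "false" | "null"
    list : "[" [value ("," value)*] "]"
    dict : "{" [pair ("," pair)*] "}"
    pair : string ":" value
    string : ESCAPED_STRING
    number : SIGNED_NUMBER
    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS
"""

# The configurations behind the Lark rows in the table above:
earley = Lark(json_grammar, start='value', lexer='standard')  # Earley (with lexer)
lalr = Lark(json_grammar, start='value', parser='lalr')       # LALR(1)
# "LALR(1) tree-less" additionally passes transformer=TreeToJson() to Lark,
# so the output is built directly and no parse tree is kept in memory.
```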



examples/json_parser.py (+2 -2)

@@ -47,7 +47,7 @@ class TreeToJson(Transformer):
    true = lambda self, _: True
    false = lambda self, _: False

-# json_parser = Lark(json_grammar, parser='earley')
+# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
# def parse(x):
#     return TreeToJson().transform(json_parser.parse(x))

@@ -72,7 +72,7 @@ def test():
    assert j == json.loads(test_json)

if __name__ == '__main__':
-    test()
+    # test()
    with open(sys.argv[1]) as f:
        print(parse(f.read()))
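
With `test()` commented out, the `__main__` block does nothing but read and parse the given file, which is what makes the example usable for timing. A rough sketch of such a measurement (assumes it is run from the `examples/` directory; `big.json` is a placeholder input):

```python
import time

from json_parser import parse  # assumption: the example's parse() function

with open('big.json') as f:    # placeholder benchmark input
    text = f.read()

t0 = time.time()
parse(text)
print('parsed in %.2fs' % (time.time() - t0))
```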


lark/__init__.py (+1 -1)

@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError
from .lark import Lark
from .utils import inline_args

-__version__ = "0.2.2"
+__version__ = "0.2.3"

lark/load_grammar.py (+50 -54)

@@ -324,7 +324,7 @@ class TokenTreeToPattern(Transformer):
    def expansion(self, items):
        if len(items) == 1:
            return items[0]
-        if len(set(i.flags for i in items)) > 1:
+        if len({i.flags for i in items}) > 1:
            raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags)

@@ -348,60 +348,64 @@ def _interleave(l, item):
    elif is_terminal(e):
        yield item

+def _choice_of_rules(rules):
+    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
+
class Grammar:
    def __init__(self, rule_defs, token_defs, extra):
        self.token_defs = token_defs
        self.rule_defs = rule_defs
        self.extra = extra

-    def compile(self, lexer=False, start=None):
-        if not lexer:
-            rule_defs = deepcopy(self.rule_defs)
-
-            # XXX VERY HACKY!! There must be a better way..
-            ignore_tokens = [('_'+name, t) for name, t in self.token_defs if name in self.extra['ignore']]
-            if ignore_tokens:
-                self.token_defs = [('_'+name if name in self.extra['ignore'] else name,t) for name,t in self.token_defs]
-                ignore_names = [t[0] for t in ignore_tokens]
-                expr = Token('RULE', '__ignore')
-                for r, tree, _o in rule_defs:
-                    for exp in tree.find_data('expansion'):
-                        exp.children = list(_interleave(exp.children, expr))
-                        if r == start:
-                            exp.children = [expr] + exp.children
-                    for exp in tree.find_data('expr'):
-                        exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
-
-                x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
-                _ignore_tree = T('expr', [T('expansions', x), Token('OP', '?')])
-                rule_defs.append(('__ignore', _ignore_tree, None))
-            # End of "ignore" section
-
-            rule_defs += [(name, tree, RuleOptions(keep_all_tokens=True)) for name, tree in self.token_defs]
-            token_defs = []
-
-            tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)}
-            new_rule_defs = []
-            for name, tree, options in rule_defs:
-                if name in tokens_to_convert:
-                    if name.startswith('_'):
-                        options = RuleOptions.new_from(options, filter_out=True)
-                    else:
-                        options = RuleOptions.new_from(options, create_token=name)
-                    name = tokens_to_convert[name]
-                    inner = Token('RULE', name + '_inner')
-                    new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None))
-                    name = inner
+    def _prepare_scanless_grammar(self, start):
+        # XXX Pretty hacky! There should be a better way to write this method..
+
+        rule_defs = deepcopy(self.rule_defs)
+        term_defs = self.token_defs
+
+        # Implement the "%ignore" feature without a lexer..
+        terms_to_ignore = {name:'__'+name for name in self.extra['ignore']}
+        if terms_to_ignore:
+            assert set(terms_to_ignore) <= {name for name, t in term_defs}
+            term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
+            expr = Token('RULE', '__ignore')
+            for r, tree, _o in rule_defs:
+                for exp in tree.find_data('expansion'):
+                    exp.children = list(_interleave(exp.children, expr))
+                    if r == start:
+                        exp.children = [expr] + exp.children
+                for exp in tree.find_data('expr'):
+                    exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
+
+            _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
+            rule_defs.append(('__ignore', _ignore_tree, None))
+
+        # Convert all tokens to rules
+        new_terminal_names = {name: '__token_'+name for name, tree in term_defs}
+
+        for name, tree, options in rule_defs:
+            for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
+                for i, sym in enumerate(exp.children):
+                    if sym in new_terminal_names:
+                        exp.children[i] = Token(sym.type, new_terminal_names[sym])
+
+        for name, tree in term_defs:
+            if name.startswith('_'):
+                options = RuleOptions(filter_out=True)
+            else:
+                options = RuleOptions(keep_all_tokens=True, create_token=name)

-                else:
-                    for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
-                        for i, sym in enumerate(exp.children):
-                            if sym in tokens_to_convert:
-                                exp.children[i] = Token(sym.type, tokens_to_convert[sym])
+            name = new_terminal_names[name]
+            inner_name = name + '_inner'
+            rule_defs.append((name, _choice_of_rules([inner_name]), None))
+            rule_defs.append((inner_name, tree, options))

-                new_rule_defs.append((name, tree, options))
+        return [], rule_defs

-            rule_defs = new_rule_defs
+    def compile(self, lexer=False, start=None):
+        if not lexer:
+            token_defs, rule_defs = self._prepare_scanless_grammar(start)
        else:
            token_defs = list(self.token_defs)
            rule_defs = self.rule_defs
@@ -473,14 +477,6 @@ class RuleOptions:

        self.filter_out = filter_out  # remove this rule from the tree
                                      # used for "token"-rules in scanless

-    @classmethod
-    def new_from(cls, options, **kw):
-        return cls(
-            keep_all_tokens=options and options.keep_all_tokens,
-            expand1=options and options.expand1,
-            **kw)

    @classmethod
    def from_rule(cls, name, expansions):
        keep_all_tokens = name.startswith('!')
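
The new `_choice_of_rules` helper replaces two hand-built tree literals in the old `compile`. A standalone sketch of the shape it produces, using a stand-in `Tree` class and plain strings (the real helper builds lark's `T` nodes and wraps each name in `Token('RULE', name)`):

```python
class Tree:
    # Minimal stand-in for lark's tree type.
    def __init__(self, data, children):
        self.data = data
        self.children = children
    def __repr__(self):
        return 'Tree(%r, %r)' % (self.data, self.children)

def _choice_of_rules(rules):
    # A choice ('expansions') between single-symbol alternatives.
    return Tree('expansions', [Tree('expansion', [name]) for name in rules])

# e.g. the __ignore rule becomes a choice between the renamed ignored
# terminals ('__WS' and '__COMMENT' are hypothetical names here):
print(_choice_of_rules(['__WS', '__COMMENT']))
```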


lark/parser_frontends.py (+2 -2)

@@ -27,7 +27,7 @@ class LALR(WithLexer):
        self.parser = lalr_parser.Parser(parser_conf)

    def parse(self, text):
-        tokens = list(self.lex(text))
+        tokens = self.lex(text)
        return self.parser.parse(tokens)

class LALR_ContextualLexer:

@@ -160,7 +160,7 @@ class Earley(WithLexer):
        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
-        tokens = list(self.lex(text))
+        tokens = self.lex(text)
        return self.parser.parse(tokens)

def get_frontend(parser, lexer):
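
Dropping `list(...)` means `parse` now hands the lexer's output to the parser as-is, so a generator lexer can stream tokens instead of materializing them all up front (the earley.py change below makes the parser side safe for this). A toy illustration, not lark's actual API:

```python
def lex(text):
    # Toy generator lexer: yields tokens one at a time.
    for tok in text.split():
        yield tok

def parse(tokens):
    # Consumes any iterable; never needs len(tokens) or tokens[i].
    return sum(1 for _ in tokens)

print(parse(lex("a b c")))  # 3 -- no intermediate token list is built
```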


lark/parsers/earley.py (+8 -6)

@@ -98,7 +98,7 @@ class Column:
        for item in items:

            if item.is_complete:
-                # XXX TODO Potential bug: What happens if there's ambiguity in an empty rule?
+                # XXX Potential bug: What happens if there's ambiguity in an empty rule?
                if item.rule.expansion and item in self.completed:
                    old_tree = self.completed[item].tree
                    if old_tree.data != 'ambig':
@@ -110,7 +110,7 @@
                    old_tree.children.append(item.tree)
                else:
                    self.completed[item] = item
-                    self.to_reduce.append(item)
+                self.to_reduce.append(item)
            else:
                if item not in added:
                    added.add(item)
@@ -168,7 +168,7 @@ class Parser:
            to_scan = cur_set.to_scan.get_news()
            for item in to_scan:
                if item.expect.match(token):
-                    next_set.add([item.advance(stream[i])])
+                    next_set.add([item.advance(token)])

            if not next_set and token is not END_TOKEN:
                expect = {i.expect for i in cur_set.to_scan}
@@ -181,10 +181,12 @@
        column0.add(predict(start, column0))

        cur_set = column0
-        for i, char in enumerate(stream):
-            _, cur_set = process_column(i, char, cur_set)
+        i = 0
+        for token in stream:
+            _, cur_set = process_column(i, token, cur_set)
+            i += 1

-        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)
+        last_set, _ = process_column(i, END_TOKEN, cur_set)

        # Parse ended. Now build a parse tree
        solutions = [n.tree for n in last_set.to_reduce
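
Two fixes here: in `Column.add`, `self.to_reduce.append(item)` moves out of the `else` branch, so an item that completes a second time (as with the repeated empty rule in the new test below) is still queued for reduction; and the main loop drops `stream[i]` and `len(stream)` in favor of the loop variable and a manual counter, so any iterable of tokens works, matching the frontend change above. A sketch of the counter pattern with a stand-in `process_column`:

```python
def parse(stream):
    # `stream` may be a generator: len(stream) and stream[i] are unavailable.
    def process_column(i, token):
        pass  # stand-in for the real Earley column processing

    i = 0
    for token in stream:
        process_column(i, token)
        i += 1
    process_column(i, 'END_TOKEN')  # `i` now plays the role of len(stream)
    return i

print(parse(iter(['a', 'b', 'c'])))  # 3
```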


tests/test_parser.py (+16 -0)

@@ -19,6 +19,7 @@ logging.basicConfig(level=logging.INFO)
from lark.lark import Lark
from lark.common import GrammarError, ParseError
from lark.lexer import LexError
+from lark.tree import Tree

__path__ = os.path.dirname(__file__)
def _read(n, *args):

@@ -104,6 +105,21 @@ class TestEarley(unittest.TestCase):
        res = l.parse("aaa")
        self.assertEqual(res.children, ['aaa'])

+    def test_earley_repeating_empty(self):
+        # This was a sneaky bug!
+
+        grammar = """
+        !start: "a" empty empty "b"
+        empty: empty2
+        empty2:
+        """
+
+        parser = Lark(grammar, parser='earley', lexer=None)
+        res = parser.parse('ab')
+
+        empty_tree = Tree('empty', [Tree('empty2', [])])
+        self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
+
def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
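
The new test's `assertSequenceEqual` relies on `Tree` comparing by structure, so the two separately derived `empty` subtrees can both be checked against one template. In isolation:

```python
from lark.tree import Tree  # the same import the test adds

# Trees compare structurally, not by identity:
t1 = Tree('empty', [Tree('empty2', [])])
t2 = Tree('empty', [Tree('empty2', [])])
assert t1 == t2
```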

