
Improved lexer, added profiler option to Lark

Commit 8b9c5801da
Erez Shinan, 7 years ago
5 changed files with 101 additions and 40 deletions
1. README.md (+10, -8)
2. lark/__init__.py (+1, -0)
3. lark/lark.py (+46, -4)
4. lark/lexer.py (+37, -28)
5. lark/tests/test_parser.py (+7, -0)

README.md (+10, -8)

@@ -1,13 +1,13 @@
-# Lark - a modern pure-Python parsing library
+# Lark - a modern parsing library
 
-Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power.
+Lark is a modern general-purpose parsing library for Python.
 
-Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
+Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:
 
-- Earley : Parses all context-free grammars (even ambiguous ones)!
+- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
 - LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries.
 
-Both algorithms are pure-python implementations and can be used interchangably (aside for algorithmic restrictions).
+Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details.
 
 Lark can automagically build an AST from your grammar, without any more code on your part.
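The choice between the two algorithms above maps to the `parser` option handled in lark/lark.py further down. A minimal sketch, assuming the engines are registered in ENGINE_DICT under 'earley' and 'lalr' as the README implies; the toy grammar is illustrative, written in the `NAME: "literal"` style used by the test added in this commit:

```python
from lark import Lark

# Hypothetical two-token grammar, for illustration only.
grammar = '''start: A B
A: "a"
B: "b"
'''

earley_parser = Lark(grammar)               # default: parser='earley'
lalr_parser = Lark(grammar, parser='lalr')  # LALR(1): faster, but LR grammars only

print(earley_parser.parse('ab'))
print(lalr_parser.parse('ab'))
```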

@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])
 
 Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.
 
-To learn more about Lark:
-- Learn how to parse json at the [tutorial](/docs/json_tutorial.md)
+## Learn more about using Lark
 
-## Features
+- Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
+- Browse the [examples](/examples), which include a calculator and a Python-code parser.
+
+## List of Features
 
 - EBNF grammar with a little extra
 - Earley & LALR(1)


lark/__init__.py (+1, -0)

@@ -0,0 +1 @@
+from .lark import Lark, Transformer

lark/lark.py (+46, -4)

@@ -39,6 +39,7 @@ class LarkOptions(object):
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
+        self.profile = o.pop('profile', False)  # XXX new
 
         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
@@ -50,6 +51,30 @@ class LarkOptions(object):
             raise ValueError("Unknown options: %s" % o.keys())
 
 
+import time
+from collections import defaultdict
+class Profiler:
+    def __init__(self):
+        self.total_time = defaultdict(float)
+        self.cur_section = '__init__'
+        self.last_enter_time = time.time()
+
+    def enter_section(self, name):
+        cur_time = time.time()
+        self.total_time[self.cur_section] += cur_time - self.last_enter_time
+        self.last_enter_time = cur_time
+        self.cur_section = name
+
+    def make_wrapper(self, name, f):
+        def _f(*args, **kwargs):
+            last_section = self.cur_section
+            self.enter_section(name)
+            try:
+                return f(*args, **kwargs)
+            finally:
+                self.enter_section(last_section)
+
+        return _f
+
+
 class Lark:
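For a feel of the Profiler mechanics added in this hunk, here is a tiny standalone exercise of its two entry points (assuming `Profiler` is in scope, e.g. via `from lark.lark import Profiler` at this commit; the section names are made up):

```python
import time

p = Profiler()  # starts timing in section '__init__'

# make_wrapper bills the wrapped call to its own section, then
# restores whatever section was active before the call.
slow = p.make_wrapper('slow_stuff', lambda: time.sleep(0.1))

p.enter_section('main')   # flushes elapsed time into '__init__'
slow()                    # ~0.1s billed to 'slow_stuff'
time.sleep(0.05)          # billed to 'main'...
p.enter_section('idle')   # ...once the section is left

print({name: round(secs, 2) for name, secs in p.total_time.items()})
# roughly: {'__init__': 0.0, 'main': 0.05, 'slow_stuff': 0.1}
```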
@@ -82,6 +107,8 @@ class Lark:
         if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")
 
+        self.profiler = Profiler() if self.options.profile else None
+
         self.tokens, self.rules = load_grammar(grammar)
 
         self.lexer = self._build_lexer()
@@ -90,6 +117,9 @@ class Lark:
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         self.parser = self._build_parser()
 
+        if self.profiler: self.profiler.enter_section('outside_lark')
+
 
     def _create_unless_callback(self, strs):
         def f(t):
             if t in strs:
@@ -105,8 +135,6 @@ class Lark:
             for flag in flags:
                 if flag == 'ignore':
                     ignore_tokens.append(name)
-                elif flag == 'newline':
-                    pass    # TODO
                 elif isinstance(flag, tuple) and flag[0] == 'unless':
                     _, strs = flag
                     callbacks[name] = self._create_unless_callback(strs)
@@ -119,6 +147,10 @@
 
     def _build_parser(self):
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
+        if self.profiler:
+            for f in dir(callback):
+                if not f.startswith('__'):
+                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
         return self.parser_engine.build_parser(rules, callback, self.options.start)


@@ -133,6 +165,16 @@
 
     def parse(self, text):
         assert not self.options.only_lex
-        l = list(self.lex(text))
-        return self.parser.parse(l)
+
+        if self.profiler:
+            self.profiler.enter_section('lex')
+            l = list(self.lex(text))
+            self.profiler.enter_section('parse')
+            try:
+                return self.parser.parse(l)
+            finally:
+                self.profiler.enter_section('outside_lark')
+        else:
+            l = list(self.lex(text))
+            return self.parser.parse(l)
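Taken together, the lark.py changes let a parse be broken down by phase. A minimal sketch of how the new `profile` option might be used (the grammar is illustrative; the section names come from the `enter_section` calls in the diff above):

```python
from lark import Lark

parser = Lark('''start: A A A
A: "a"
''', profile=True)

tree = parser.parse('aaa')

# parse() re-enters 'outside_lark' on the way out, so by now the time
# spent lexing and parsing has been flushed into total_time.
for section, seconds in sorted(parser.profiler.total_time.items()):
    print('%-12s %.6fs' % (section, seconds))
# expected sections: __init__, lex, parse, outside_lark, plus
# 'transformer' for the wrapped tree-builder callbacks.
```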


lark/lexer.py (+37, -28)

@@ -1,5 +1,7 @@
 ## Lexer Implementation
 
+import re
+
 from .utils import Str
 
 class LexError(Exception):
@@ -13,13 +15,6 @@ class Token(Str):
         inst.value = value
         return inst
 
-# class Token(object):
-#     def __init__(self, type, value, lexpos):
-#         self.type = type
-#         self.value = value
-#         self.lexpos = lexpos
-
-
     def __repr__(self):
         return 'Token(%s, %s)' % (self.type, self.value)

@@ -29,12 +24,11 @@ class Regex:
         self.flags = flags
 
 
-import re
 LIMIT = 50 # Stupid named groups limit in python re
 class Lexer(object):
     def __init__(self, tokens, callbacks, ignore=()):
         self.ignore = ignore
         self.newline_char = '\n'
         tokens = list(tokens)
 
         # Sanitization
         token_names = {t[0] for t in tokens}
@@ -49,42 +43,57 @@
         self.tokens = tokens
         self.callbacks = callbacks
 
-        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        self.token_types = list(token_names)
+        self.type_index = {name:i for i,name in enumerate(self.token_types)}
+
+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.ignore_types = [self.type_index[t] for t in ignore]
 
-        self.mres = []
-        self.name_from_index = []
-        x = list(tokens)
-        while x:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
-            self.mres.append(mre)
-            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
-            x = x[LIMIT:]
+        self.mres = self._build_mres(tokens, len(tokens))
+
+    def _build_mres(self, tokens, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries less and less groups until it's successful.
+        mres = []
+        while tokens:
+            try:
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(tokens, max_size//2)
+
+            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
+            tokens = tokens[max_size:]
+        return mres
 
     def lex(self, stream):
         lex_pos = 0
         line = 1
         col_start_pos = 0
+        newline_types = list(self.newline_types)
+        ignore_types = list(self.ignore_types)
         while True:
-            i = 0
-            for mre in self.mres:
+            for mre, type_from_index in self.mres:
                 m = mre.match(stream, lex_pos)
                 if m:
                     value = m.group(0)
-                    type_ = self.name_from_index[i][m.lastindex]
-                    if type_ not in self.ignore:
-                        t = Token(type_, value, lex_pos)
+                    type_num = type_from_index[m.lastindex]
+                    if type_num not in ignore_types:
+                        t = Token(self.token_types[type_num], value, lex_pos)
                         t.line = line
                         t.column = lex_pos - col_start_pos
                         if t.type in self.callbacks:
                             t = self.callbacks[t.type](t)
                         yield t
-                    newlines = value.count(self.newline_char)
-                    if newlines:
-                        line += newlines
-                        col_start_pos = lex_pos + value.rindex(self.newline_char)
+
+                    if type_num in newline_types:
+                        newlines = value.count(self.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                     lex_pos += len(value)
                     break
-                i += 1
             else:
                 if lex_pos < len(stream):
                     context = stream[lex_pos:lex_pos+5]
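The key new piece above is `_build_mres`: older CPython versions cap a regex at 100 named groups and, as the comment notes, report it via a bare `AssertionError`, so the token patterns are compiled into several smaller alternations that are tried in order. A standalone sketch of the same chunking idea, independent of this commit's classes; it uses `m.lastgroup` for brevity where the commit maps `m.lastindex` to a type number, and the 300 token names are illustrative:

```python
import re

# 300 (name, pattern) pairs: far past the historical 100-group limit.
tokens = [('A%d' % i, str(i)) for i in range(300)]

def build_mres(tokens, max_size=50):
    """Compile tokens into a list of alternation regexes, halving the
    chunk size whenever the group limit is hit (AssertionError on old
    CPython; newer versions raised the limit, so re.error is caught too)."""
    mres = []
    while tokens:
        try:
            mre = re.compile('|'.join('(?P<%s>%s)' % t for t in tokens[:max_size]))
        except (AssertionError, re.error):
            if max_size <= 1:
                raise  # a genuinely bad pattern, not the group limit
            return build_mres(tokens, max_size // 2)
        mres.append(mre)
        tokens = tokens[max_size:]
    return mres

def first_match(mres, text, pos):
    # Try each chunk in order; the first chunk that matches wins, which
    # preserves the overall priority order of the token list.
    for mre in mres:
        m = mre.match(text, pos)
        if m:
            return m.lastgroup, m.group(0)
    return None

mres = build_mres(tokens)
print(len(mres), first_match(mres, '42', 0))
# -> 6 ('A4', '4'): regex alternation is leftmost-first, and pattern '4'
#    precedes '42' inside its chunk, just as in the Lexer above.
```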


lark/tests/test_parser.py (+7, -0)

@@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
             x = g.parse('a')
             self.assertEqual(x.data, "b")
 
+        def test_lexer_token_limit(self):
+            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
+            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
+            g = _Lark("""start: %s
+                    %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
+
 
     _NAME = "Test" + PARSER.capitalize()
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser

