Browse Source

Merge branch 'master' into 1.0a

gm/2021-09-23T00Z/github.com--lark-parser-lark/1.0b
Erez Sh 3 years ago
parent
commit
9c975d932c
11 changed files with 179 additions and 133 deletions
  1. +4
    -4
      docs/json_tutorial.md
  2. +2
    -2
      lark-stubs/lark.pyi
  3. +13
    -0
      lark/common.py
  4. +9
    -6
      lark/exceptions.py
  5. +8
    -8
      lark/lark.py
  6. +72
    -58
      lark/lexer.py
  7. +33
    -28
      lark/parse_tree_builder.py
  8. +9
    -10
      lark/parser_frontends.py
  9. +2
    -2
      lark/parsers/lalr_parser.py
  10. +7
    -15
      lark/utils.py
  11. +20
    -0
      tests/test_parser.py

+ 4
- 4
docs/json_tutorial.md View File

@@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist
| Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M |
| Lark - LALR(1) | 8s | 1.53s | 453M | 266M |
| Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M |
| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M |
| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? | 1545M |


I added a few other parsers for comparison. PyParsing and funcparserlib fare pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1).
@@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective.

This is the end of the tutorial. I hope you liked it and learned a little about Lark.

To see what else you can do with Lark, check out the [examples](examples).
To see what else you can do with Lark, check out the [examples](/examples).

For questions or any other subject, feel free to email me at erezshin at gmail dot com.


+ 2
- 2
lark-stubs/lark.pyi View File

@@ -33,7 +33,7 @@ class LarkOptions:
regex: bool
debug: bool
keep_all_tokens: bool
propagate_positions: Union[bool, str]
propagate_positions: Union[bool, Callable]
maybe_placeholders: bool
lexer_callbacks: Dict[str, Callable[[Token], Token]]
cache: Union[bool, str]
@@ -77,7 +77,7 @@ class Lark:
regex: bool = False,
debug: bool = False,
keep_all_tokens: bool = False,
propagate_positions: Union[bool, str] = False,
propagate_positions: Union[bool, Callable] = False,
maybe_placeholders: bool = False,
lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
cache: Union[bool, str] = False,


+ 13
- 0
lark/common.py View File

@@ -1,3 +1,5 @@
from copy import deepcopy

from .utils import Serialize
from .lexer import TerminalDef

@@ -24,6 +26,17 @@ class LexerConf(Serialize):
def _deserialize(self):
self.terminals_by_name = {t.name: t for t in self.terminals}

def __deepcopy__(self, memo=None):
return type(self)(
deepcopy(self.terminals, memo),
self.re_module,
deepcopy(self.ignore, memo),
deepcopy(self.postlex, memo),
deepcopy(self.callbacks, memo),
deepcopy(self.g_regex_flags, memo),
deepcopy(self.skip_validation, memo),
deepcopy(self.use_bytes, memo),
)


class ParserConf(Serialize):


+ 9
- 6
lark/exceptions.py View File

@@ -127,6 +127,8 @@ class UnexpectedInput(LarkError):

class UnexpectedEOF(ParseError, UnexpectedInput):
def __init__(self, expected, state=None, terminals_by_name=None):
super(UnexpectedEOF, self).__init__()

self.expected = expected
self.state = state
from .lexer import Token
@@ -136,7 +138,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
self.column = -1
self._terminals_by_name = terminals_by_name

super(UnexpectedEOF, self).__init__()

def __str__(self):
message = "Unexpected end-of-input. "
@@ -147,6 +148,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
terminals_by_name=None, considered_rules=None):
super(UnexpectedCharacters, self).__init__()

# TODO considered_tokens and allowed can be figured out using state
self.line = line
self.column = column
@@ -165,7 +168,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
self.char = seq[lex_pos]
self._context = self.get_context(seq)

super(UnexpectedCharacters, self).__init__()

def __str__(self):
message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column)
@@ -188,6 +190,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
"""

def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
super(UnexpectedToken, self).__init__()
# TODO considered_rules and expected can be figured out using state
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
@@ -202,7 +206,6 @@ class UnexpectedToken(ParseError, UnexpectedInput):
self._terminals_by_name = terminals_by_name
self.token_history = token_history

super(UnexpectedToken, self).__init__()

@property
def accepts(self):
@@ -229,10 +232,10 @@ class VisitError(LarkError):
"""

def __init__(self, rule, obj, orig_exc):
self.obj = obj
self.orig_exc = orig_exc

message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)

self.obj = obj
self.orig_exc = orig_exc

###}

+ 8
- 8
lark/lark.py View File

@@ -39,7 +39,7 @@ class LarkOptions(Serialize):
Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
propagate_positions
Propagates (line, column, end_line, end_column) attributes into all tree branches.
Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees.
Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
maybe_placeholders
When ``True``, the ``[]`` operator returns ``None`` when not matched.

@@ -157,7 +157,7 @@ class LarkOptions(Serialize):
assert_config(self.parser, ('earley', 'lalr', 'cyk', None))

if self.parser == 'earley' and self.transformer:
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.'
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

if o:
@@ -443,11 +443,11 @@ class Lark(Serialize):
d = f
else:
d = pickle.load(f)
memo = d['memo']
memo_json = d['memo']
data = d['data']

assert memo
memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
assert memo_json
memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
options = dict(data['options'])
if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
@@ -504,11 +504,11 @@ class Lark(Serialize):

Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
"""
package = FromPackageLoader(package, search_paths)
full_path, text = package(None, grammar_path)
package_loader = FromPackageLoader(package, search_paths)
full_path, text = package_loader(None, grammar_path)
options.setdefault('source_path', full_path)
options.setdefault('import_paths', [])
options['import_paths'].append(package)
options['import_paths'].append(package_loader)
return cls(text, **options)

def __repr__(self):


+ 72
- 58
lark/lexer.py View File

@@ -124,20 +124,20 @@ class Token(str):

def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
try:
self = super(Token, cls).__new__(cls, value)
inst = super(Token, cls).__new__(cls, value)
except UnicodeDecodeError:
value = value.decode('latin1')
self = super(Token, cls).__new__(cls, value)
self.type = type_
self.start_pos = start_pos
self.value = value
self.line = line
self.column = column
self.end_line = end_line
self.end_column = end_column
self.end_pos = end_pos
return self
inst = super(Token, cls).__new__(cls, value)
inst.type = type_
inst.start_pos = start_pos if start_pos is not None else pos_in_stream
inst.value = value
inst.line = line
inst.column = column
inst.end_line = end_line
inst.end_column = end_column
inst.end_pos = end_pos
return inst

def update(self, type_=None, value=None):
return Token.new_borrow_pos(
@@ -200,15 +200,13 @@ class LineCounter:


class UnlessCallback:
def __init__(self, mres):
self.mres = mres
def __init__(self, scanner):
self.scanner = scanner

def __call__(self, t):
for mre, type_from_index in self.mres:
m = mre.match(t.value)
if m:
t.type = type_from_index[m.lastindex]
break
res = self.scanner.match(t.value, 0)
if res:
_value, t.type = res
return t


@@ -223,6 +221,11 @@ class CallChain:
return self.callback2(t) if self.cond(t2) else t2


def _get_match(re_, regexp, s, flags):
m = re_.match(regexp, s, flags)
if m:
return m.group(0)

def _create_unless(terminals, g_regex_flags, re_, use_bytes):
tokens_by_type = classify(terminals, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
@@ -234,40 +237,54 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
if strtok.priority > retok.priority:
continue
s = strtok.pattern.value
m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
if m and m.group(0) == s:
if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
unless.append(strtok)
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback


def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
postfix = '$' if match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if use_bytes:
pattern = pattern.encode('latin-1')
try:
mre = re_.compile(pattern, g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
terminals = terminals[max_size:]
return mres
new_terminals = [t for t in terminals if t not in embedded_strs]
return new_terminals, callback


def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)

class Scanner:
def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
self.terminals = terminals
self.g_regex_flags = g_regex_flags
self.re_ = re_
self.use_bytes = use_bytes
self.match_whole = match_whole

self.allowed_types = {t.name for t in self.terminals}

self._mres = self._build_mres(terminals, len(terminals))

def _build_mres(self, terminals, max_size):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries fewer and fewer groups until it's successful.
postfix = '$' if self.match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if self.use_bytes:
pattern = pattern.encode('latin-1')
try:
mre = self.re_.compile(pattern, self.g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return self._build_mres(terminals, max_size//2)

mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
terminals = terminals[max_size:]
return mres

def match(self, text, pos):
for mre, type_from_index in self._mres:
m = mre.match(text, pos)
if m:
return m.group(0), type_from_index[m.lastindex]


def _regexp_has_newline(r):
@@ -327,9 +344,9 @@ class TraditionalLexer(Lexer):
self.use_bytes = conf.use_bytes
self.terminals_by_name = conf.terminals_by_name

self._mres = None
self._scanner = None

def _build(self):
def _build_scanner(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
assert all(self.callback.values())

@@ -340,19 +357,16 @@ class TraditionalLexer(Lexer):
else:
self.callback[type_] = f

self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

@property
def mres(self):
if self._mres is None:
self._build()
return self._mres
def scanner(self):
if self._scanner is None:
self._build_scanner()
return self._scanner

def match(self, text, pos):
for mre, type_from_index in self.mres:
m = mre.match(text, pos)
if m:
return m.group(0), type_from_index[m.lastindex]
return self.scanner.match(text, pos)

def lex(self, state, parser_state):
with suppress(EOFError):
@@ -364,7 +378,7 @@ class TraditionalLexer(Lexer):
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
allowed = self.scanner.allowed_types - self.ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,


+ 33
- 28
lark/parse_tree_builder.py View File

@@ -22,54 +22,59 @@ class ExpandSingleChild:


class PropagatePositions:
def __init__(self, node_builder):
def __init__(self, node_builder, node_filter=None):
self.node_builder = node_builder
self.node_filter = node_filter

def __call__(self, children):
res = self.node_builder(children)

# local reference to Tree.meta reduces number of presence checks
if isinstance(res, Tree):
res_meta = res.meta
# Calculate positions while the tree is streaming, according to the rule:
# - nodes start at the start of their first child's container,
# and end at the end of their last child's container.
# Containers are nodes that take up space in text, but have been inlined in the tree.

src_meta = self._pp_get_meta(children)
if src_meta is not None:
res_meta.line = src_meta.line
res_meta.column = src_meta.column
res_meta.start_pos = src_meta.start_pos
res_meta.empty = False
res_meta = res.meta

src_meta = self._pp_get_meta(reversed(children))
if src_meta is not None:
res_meta.end_line = src_meta.end_line
res_meta.end_column = src_meta.end_column
res_meta.end_pos = src_meta.end_pos
res_meta.empty = False
first_meta = self._pp_get_meta(children)
if first_meta is not None:
if not hasattr(res_meta, 'line'):
# Only set the position if meta wasn't already set (it may already be set, probably because the rule has been inlined, e.g. `?rule`)
res_meta.line = getattr(first_meta, 'container_line', first_meta.line)
res_meta.column = getattr(first_meta, 'container_column', first_meta.column)
res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos)
res_meta.empty = False

res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line)
res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column)

last_meta = self._pp_get_meta(reversed(children))
if last_meta is not None:
if not hasattr(res_meta, 'end_line'):
res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)
res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos)
res_meta.empty = False

res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)

return res

def _pp_get_meta(self, children):
for c in children:
if self.node_filter is not None and not self.node_filter(c):
continue
if isinstance(c, Tree):
if not c.meta.empty:
return c.meta
elif isinstance(c, Token):
return c

class PropagatePositions_IgnoreWs(PropagatePositions):
def _pp_get_meta(self, children):
for c in children:
if isinstance(c, Tree):
if not c.meta.empty:
return c.meta
elif isinstance(c, Token):
if c and not c.isspace(): # Disregard whitespace-only tokens
return c


def make_propagate_positions(option):
if option == "ignore_ws":
return PropagatePositions_IgnoreWs
if callable(option):
return partial(PropagatePositions, node_filter=option)
elif option is True:
return PropagatePositions
elif option is False:


+ 9
- 10
lark/parser_frontends.py View File

@@ -39,8 +39,7 @@ class MakeParsingFrontend:
lexer_conf.lexer_type = self.lexer_type
return ParsingFrontend(lexer_conf, parser_conf, options)

@classmethod
def deserialize(cls, data, memo, lexer_conf, callbacks, options):
def deserialize(self, data, memo, lexer_conf, callbacks, options):
parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
parser_conf.callbacks = callbacks
@@ -92,26 +91,26 @@ class ParsingFrontend(Serialize):
def _verify_start(self, start=None):
if start is None:
start = self.parser_conf.start
if len(start) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
start_decls = self.parser_conf.start
if len(start_decls) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
start ,= start_decls
elif start not in self.parser_conf.start:
raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
return start

def parse(self, text, start=None, on_error=None):
start = self._verify_start(start)
chosen_start = self._verify_start(start)
stream = text if self.skip_lexer else LexerThread(self.lexer, text)
kw = {} if on_error is None else {'on_error': on_error}
return self.parser.parse(stream, start, **kw)
return self.parser.parse(stream, chosen_start, **kw)
def parse_interactive(self, text=None, start=None):
start = self._verify_start(start)
chosen_start = self._verify_start(start)
if self.parser_conf.parser_type != 'lalr':
raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
stream = text if self.skip_lexer else LexerThread(self.lexer, text)
return self.parser.parse_interactive(stream, start)
return self.parser.parse_interactive(stream, chosen_start)


def get_frontend(parser, lexer):


+ 2
- 2
lark/parsers/lalr_parser.py View File

@@ -178,8 +178,8 @@ class _Parser(object):
for token in state.lexer.lex(state):
state.feed_token(token)

token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(token, True)
end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(end_token, True)
except UnexpectedInput as e:
try:
e.interactive_parser = InteractiveParser(self, state, state.lexer)


+ 7
- 15
lark/utils.py View File

@@ -61,14 +61,13 @@ class Serialize(object):
fields = getattr(self, '__serialize_fields__')
res = {f: _serialize(getattr(self, f), memo) for f in fields}
res['__type__'] = type(self).__name__
postprocess = getattr(self, '_serialize', None)
if postprocess:
postprocess(res, memo)
if hasattr(self, '_serialize'):
self._serialize(res, memo)
return res

@classmethod
def deserialize(cls, data, memo):
namespace = getattr(cls, '__serialize_namespace__', {})
namespace = getattr(cls, '__serialize_namespace__', [])
namespace = {c.__name__:c for c in namespace}

fields = getattr(cls, '__serialize_fields__')
@@ -82,9 +81,10 @@ class Serialize(object):
setattr(inst, f, _deserialize(data[f], namespace, memo))
except KeyError as e:
raise KeyError("Cannot find key for class", cls, e)
postprocess = getattr(inst, '_deserialize', None)
if postprocess:
postprocess()

if hasattr(inst, '_deserialize'):
inst._deserialize()

return inst


@@ -198,14 +198,6 @@ def dedup_list(l):
return [x for x in l if not (x in dedup or dedup.add(x))]


def compare(a, b):
if a == b:
return 0
elif a > b:
return 1
return -1


class Enumerator(Serialize):
def __init__(self):
self.enums = {}


+ 20
- 0
tests/test_parser.py View File

@@ -94,6 +94,26 @@ class TestParsers(unittest.TestCase):
r = g.parse('a')
self.assertEqual( r.children[0].meta.line, 1 )

def test_propagate_positions2(self):
g = Lark("""start: a
a: b
?b: "(" t ")"
!t: "t"
""", propagate_positions=True)

start = g.parse("(t)")
a ,= start.children
t ,= a.children
assert t.children[0] == "t"

assert t.meta.column == 2
assert t.meta.end_column == 3

assert start.meta.column == a.meta.column == 1
assert start.meta.end_column == a.meta.end_column == 4



def test_expand1(self):

g = Lark("""start: a


Loading…
Cancel
Save