
Merge pull request #610 from lark-parser/dev

Erez Shinan committed 4 years ago (committed by GitHub)
commit 24331eb186
7 changed files with 84 additions and 79 deletions
  1. examples/python_parser.py (+8, -12)
  2. lark/common.py (+4, -2)
  3. lark/lark.py (+7, -7)
  4. lark/lexer.py (+39, -24)
  5. lark/load_grammar.py (+5, -5)
  6. lark/parser_frontends.py (+18, -26)
  7. tests/test_parser.py (+3, -3)

examples/python_parser.py (+8, -12)

@@ -26,6 +26,13 @@ python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
 python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs)
 python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)

+try:
+    xrange
+except NameError:
+    chosen_parser = python_parser3
+else:
+    chosen_parser = python_parser2
+

 def _read(fn, *args):
     kwargs = {'encoding': 'iso-8859-1'}
@@ -42,24 +49,13 @@ def _get_lib_path():
     return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0]

 def test_python_lib():
-
     path = _get_lib_path()

     start = time.time()
     files = glob.glob(path+'/*.py')
     for f in files:
         print( f )
-        try:
-            # print list(python_parser.lex(_read(os.path.join(path, f)) + '\n'))
-            try:
-                xrange
-            except NameError:
-                python_parser3.parse(_read(os.path.join(path, f)) + '\n')
-            else:
-                python_parser2.parse(_read(os.path.join(path, f)) + '\n')
-        except:
-            print ('At %s' % f)
-            raise
+        chosen_parser.parse(_read(os.path.join(path, f)) + '\n')

     end = time.time()
     print( "test_python_lib (%d files), time: %s secs"%(len(files), end-start) )


lark/common.py (+4, -2)

@@ -7,12 +7,14 @@ class LexerConf(Serialize):
     __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
     __serialize_namespace__ = TerminalDef,

-    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
-        self.tokens = tokens
+    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
+        self.tokens = tokens    # TODO should be terminals
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
         self.g_regex_flags = g_regex_flags
+        self.re_module = re_module
+        self.skip_validation = skip_validation

     def _deserialize(self):
         self.callbacks = {} # TODO
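
For orientation, here is a minimal, self-contained sketch (not lark's actual class) of how the reworked config is meant to be filled in: the caller now passes the regex implementation explicitly (the stdlib `re` or the third-party `regex` module) and may opt out of terminal validation, e.g. when loading a pre-validated, serialized parser. `LexerConfSketch` is an illustrative name.

    import re

    # Illustrative stand-in that mirrors the fields LexerConf gains in this PR.
    class LexerConfSketch:
        def __init__(self, tokens, re_module, ignore=(), postlex=None,
                     callbacks=None, g_regex_flags=0, skip_validation=False):
            self.tokens = tokens                    # terminal definitions
            self.re_module = re_module              # `re` or the third-party `regex` module
            self.ignore = ignore
            self.postlex = postlex
            self.callbacks = callbacks or {}
            self.g_regex_flags = g_regex_flags
            self.skip_validation = skip_validation  # True when loading a pre-validated parser

    conf = LexerConfSketch(tokens=[], re_module=re, ignore=('WS',))
    print(conf.re_module.compile(r'\d+').match('42').group())   # -> 42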


lark/lark.py (+7, -7)

@@ -166,11 +166,11 @@ class Lark(Serialize):
         use_regex = self.options.regex
         if use_regex:
             if regex:
-                self.re = regex
+                re_module = regex
             else:
                 raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
         else:
-            self.re = re
+            re_module = re

         # Some, but not all file-like objects have a 'name' attribute
         try:
@@ -243,7 +243,7 @@ class Lark(Serialize):
         assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source, self.re)
+        self.grammar = load_grammar(grammar, self.source, re_module)

         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -276,7 +276,7 @@ class Lark(Serialize):
             if hasattr(t, term.name):
                 lexer_callbacks[term.name] = getattr(t, term.name)

-        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
+        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)

         if self.options.parser:
             self.parser = self._build_parser()
@@ -304,7 +304,7 @@ class Lark(Serialize):
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])
@@ -331,11 +331,11 @@ class Lark(Serialize):
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
-        self.re = regex if self.options.regex else re
+        re_module = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = '<deserialized>'
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module)
         return self

     @classmethod
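
The `re_module` selection above boils down to a small helper. A standalone sketch, assuming the optional `regex` import is guarded at module level as it is in lark (`pick_re_module` itself is a made-up name, not lark API):

    import re

    try:
        import regex        # optional third-party module; may not be installed
    except ImportError:
        regex = None

    def pick_re_module(use_regex):
        """Return the regex implementation to store on the lexer config."""
        if use_regex:
            if regex:
                return regex
            raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
        return re

    print(pick_re_module(False).__name__)   # -> re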


lark/lexer.py (+39, -24)

@@ -6,6 +6,7 @@ from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

 ###{standalone
+from copy import copy

 class Pattern(Serialize):

@@ -88,7 +89,6 @@ class TerminalDef(Serialize):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)


-
 class Token(Str):
     __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

@@ -294,35 +294,39 @@ class Lexer(object):

 class TraditionalLexer(Lexer):

-    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, conf):
+        terminals = list(conf.tokens)
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

-        terminals = list(terminals)
+        self.re = conf.re_module

-        self.re = re_
-        # Sanitization
-        for t in terminals:
-            try:
-                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
-            except self.re.error:
-                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
+        if not conf.skip_validation:
+            # Sanitization
+            for t in terminals:
+                try:
+                    self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
+                except self.re.error:
+                    raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

-            if t.pattern.min_width == 0:
-                raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
+                if t.pattern.min_width == 0:
+                    raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

-        assert set(ignore) <= {t.name for t in terminals}
+        assert set(conf.ignore) <= {t.name for t in terminals}

         # Init
         self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
-        self.ignore_types = list(ignore)
+        self.ignore_types = list(conf.ignore)

         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
-        self.user_callbacks = user_callbacks
-        self.build(g_regex_flags)
+        self.user_callbacks = conf.callbacks
+        self.g_regex_flags = conf.g_regex_flags
+
+        self._mres = None
+        # self.build(g_regex_flags)

-    def build(self, g_regex_flags=0):
-        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
+    def _build(self):
+        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
         assert all(self.callback.values())

         for type_, f in self.user_callbacks.items():
@@ -332,7 +336,13 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self.mres = build_mres(terminals, g_regex_flags, self.re)
+        self._mres = build_mres(terminals, self.g_regex_flags, self.re)
+
+    @property
+    def mres(self):
+        if self._mres is None:
+            self._build()
+        return self._mres

     def match(self, stream, pos):
         for mre, type_from_index in self.mres:
@@ -348,13 +358,15 @@

 class ContextualLexer(Lexer):

-    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
-        self.re = re_
+    def __init__(self, conf, states, always_accept=()):
+        terminals = list(conf.tokens)
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t

+        trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)
+
         lexer_by_tokens = {}
         self.lexers = {}
         for state, accepts in states.items():
@@ -362,14 +374,17 @@ class ContextualLexer(Lexer):
             try:
                 lexer = lexer_by_tokens[key]
             except KeyError:
-                accepts = set(accepts) | set(ignore) | set(always_accept)
+                accepts = set(accepts) | set(conf.ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+                lexer_conf = copy(trad_conf)
+                lexer_conf.tokens = state_tokens
+                lexer = TraditionalLexer(lexer_conf)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+        assert trad_conf.tokens is terminals
+        self.root_lexer = TraditionalLexer(trad_conf)

     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
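
The `_mres`/`mres` split defers regexp compilation until the lexer is first used, which keeps deserialized parsers cheap to construct. Stripped of lark specifics, the pattern is just a lazily built cache; a generic sketch, where `_build` stands in for lark's `build_mres`:

    import re

    class LazyLexerSketch:
        """Generic lazy-compilation pattern, mirroring TraditionalLexer._mres/mres."""
        def __init__(self, patterns, re_module=re, flags=0):
            self.patterns = patterns
            self.re_module = re_module
            self.flags = flags
            self._mres = None               # nothing compiled yet

        def _build(self):
            # Stand-in for build_mres(): compile every pattern exactly once.
            self._mres = [self.re_module.compile(p, self.flags) for p in self.patterns]

        @property
        def mres(self):
            if self._mres is None:          # first access triggers the build
                self._build()
            return self._mres

    lexer = LazyLexerSketch([r'\d+', r'\w+'])
    print(lexer.mres[0].match('123').group())   # compiled on this first access -> 123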


lark/load_grammar.py (+5, -5)

@@ -755,19 +755,19 @@ def _find_used_symbols(tree):
             for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

 class GrammarLoader:
-    def __init__(self, re_):
-        self.re = re_
+    def __init__(self, re_module):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
         rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
         callback = ParseTreeBuilder(rules, ST).create_callback()
-        lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
+        lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])

         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

         self.canonize_tree = CanonizeTree()
+        self.re_module = re_module

     def load_grammar(self, grammar_text, grammar_name='<?>'):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
@@ -863,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re_module, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

             term_defs += new_td


lark/parser_frontends.py (+18, -26)

@@ -62,18 +62,18 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,

-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+    def __init__(self, lexer_conf, parser_conf, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
-        self.re = re_

     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex, re_):
+    def deserialize(cls, data, memo, callbacks, postlex, re_module):
         inst = super(WithLexer, cls).deserialize(data, memo)
-        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.lexer_conf.re_module = re_module
+        inst.lexer_conf.skip_validation=True
         inst.init_lexer()
         return inst

@@ -89,18 +89,17 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)

     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf)

 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+    def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.re = re_
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)

         self.init_lexer()

-    def init_lexer(self):
+    def init_lexer(self, **kw):
         raise NotImplementedError()

 class LALR_TraditionalLexer(LALR_WithLexer):
@@ -111,12 +110,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
     def init_lexer(self):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
-                                     re_=self.re,
-                                     ignore=self.lexer_conf.ignore,
-                                     always_accept=always_accept,
-                                     user_callbacks=self.lexer_conf.callbacks,
-                                     g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)


     def parse(self, text, start=None):
@@ -129,11 +123,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}

 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
-        self.lexer = lexer_cls(lexer_conf, re_=re_)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
+        self.lexer = lexer_cls(lexer_conf)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)


 def tokenize_text(text):
@@ -146,8 +140,8 @@ def tokenize_text(text):
             yield Token('CHAR', ch, line=line, column=i - col_start_pos)

 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+    def __init__(self, lexer_conf, parser_conf, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
         self.init_traditional_lexer()

         resolve_ambiguity = options.ambiguity == 'resolve'
@@ -159,9 +153,7 @@ class Earley(WithLexer):


 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
-        self.re = re_
-
+    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start

@@ -193,7 +185,7 @@ class XEarley(_ParserFrontend):
             if width == 0:
                 raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

-            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
         return self._parse(text, start)
@@ -206,8 +198,8 @@ class XEarley_CompleteLex(XEarley):

 class CYK(WithLexer):

-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+    def __init__(self, lexer_conf, parser_conf, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
         self.init_traditional_lexer()

         self._analysis = GrammarAnalyzer(parser_conf)
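
The `deserialize` change is where `skip_validation` pays off: a restored `LexerConf` was already validated when the parser was first built, and the regex module itself is never serialized, so the frontend re-attaches it and disables re-validation before rebuilding the lexer. A standalone sketch of that flow (`restore_lexer_conf` and the `SimpleNamespace` stand-in are illustrative, not lark API):

    import re
    from copy import copy
    from types import SimpleNamespace

    def restore_lexer_conf(saved_conf, re_module):
        """Illustrative: what WithLexer.deserialize does to the restored config."""
        conf = copy(saved_conf)
        conf.re_module = re_module       # the module object is never part of the serialized data
        conf.skip_validation = True      # terminals were validated before the parser was saved
        return conf

    saved = SimpleNamespace(tokens=[], ignore=(), g_regex_flags=0)
    conf = restore_lexer_conf(saved, re)
    print(conf.skip_validation, conf.re_module.__name__)   # -> True re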


tests/test_parser.py (+3, -3)

@@ -6,7 +6,7 @@ import unittest
 import logging
 import os
 import sys
-from copy import deepcopy
+from copy import copy, deepcopy
 try:
     from cStringIO import StringIO as cStringIO
 except ImportError:
@@ -553,8 +553,8 @@ class CustomLexer(Lexer):
        Purpose of this custom lexer is to test the integration,
        so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
-    def __init__(self, lexer_conf, re_):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
+    def __init__(self, lexer_conf):
+        self.lexer = TraditionalLexer(copy(lexer_conf))
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
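
Under the new interface a custom lexer receives only the `LexerConf`, with no separate `re_` argument. A minimal usage sketch, assuming the 0.10-era API in which a `Lexer` subclass is passed via the `lexer=` option of the LALR parser, mirroring the test's `CustomLexer` (the class and grammar here are illustrative):

    from copy import copy
    from lark import Lark
    from lark.lexer import Lexer, TraditionalLexer

    class PassThroughLexer(Lexer):
        """Delegates to TraditionalLexer; only the constructor signature changed in this PR."""
        def __init__(self, lexer_conf):
            self.lexer = TraditionalLexer(copy(lexer_conf))

        def lex(self, *args, **kwargs):
            return self.lexer.lex(*args, **kwargs)

    parser = Lark('start: "a"+', parser='lalr', lexer=PassThroughLexer)
    print(parser.parse('aaa'))   # prints the resulting parse tree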



