From 7af3de208d2dceac1bb1dabb7ba6b1dfbada2615 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Mon, 13 Feb 2017 14:33:22 +0200
Subject: [PATCH] Moved lexing responsibility to parser frontend

---
 lark/common.py           | 13 ++++++++
 lark/lark.py             | 67 +++++++++++++++++++++-------------------
 lark/load_grammar.py     | 12 +++----
 lark/parser_frontends.py | 55 +++++++++++++++++++++++----------
 4 files changed, 92 insertions(+), 55 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index 8c82019..702c3cc 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -25,3 +25,16 @@ class UnexpectedToken(ParseError):
 
 def is_terminal(sym):
     return sym.isupper() or sym[0] == '$'
+
+class LexerConf:
+    def __init__(self, tokens, ignore, postlex):
+        self.tokens = tokens
+        self.ignore = ignore
+        self.postlex = postlex
+
+class ParserConf:
+    def __init__(self, rules, callback, start):
+        self.rules = rules
+        self.callback = callback
+        self.start = start
+
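LexerConf and ParserConf are plain value objects that split configuration cleanly along the lexer/parser boundary, so every frontend can be constructed from the same two arguments. A minimal standalone sketch of the pattern (the DummyFrontend class and the placeholder values are illustrative, not part of this commit):

    class LexerConf:
        def __init__(self, tokens, ignore, postlex):
            self.tokens = tokens    # token definitions the lexer should compile
            self.ignore = ignore    # names of tokens to drop from the stream
            self.postlex = postlex  # optional post-lexing processor, or None

    class ParserConf:
        def __init__(self, rules, callback, start):
            self.rules = rules        # grammar rules
            self.callback = callback  # parse-tree building callbacks
            self.start = start        # name of the start symbol

    class DummyFrontend:
        # Illustrative frontend: takes both configs and owns text -> result.
        def __init__(self, lexer_conf, parser_conf):
            self.lexer_conf = lexer_conf
            self.parser_conf = parser_conf

        def parse(self, text):
            # A real frontend lexes according to lexer_conf, then parses
            # according to parser_conf; here we just split on whitespace.
            return text.split()

    frontend = DummyFrontend(LexerConf([], ['WS'], None),
                             ParserConf({}, None, 'start'))
    print(frontend.parse("hello world"))  # -> ['hello', 'world']
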
diff --git a/lark/lark.py b/lark/lark.py
index 39e2231..f2fd601 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -5,7 +5,7 @@ import os
 from .utils import STRING_TYPE, inline_args
 from .load_grammar import load_grammar
 from .tree import Tree, Transformer
-from .common import GrammarError
+from .common import GrammarError, LexerConf, ParserConf
 from .lexer import Lexer
 from .parse_tree_builder import ParseTreeBuilder
 
@@ -105,45 +105,46 @@ class Lark:
 
         assert isinstance(grammar, STRING_TYPE)
 
-        if self.options.cache_grammar:
+        if self.options.cache_grammar or self.options.keep_all_tokens:
             raise NotImplementedError("Not available yet")
 
+        assert not self.options.profile, "Feature temporarily disabled"
         self.profiler = Profiler() if self.options.profile else None
 
-        self.tokens, self.rules = load_grammar(grammar)
+        tokens, self.rules = load_grammar(grammar)
+
+        self.ignore_tokens = []
+        for tokendef, flags in tokens:
+            for flag in flags:
+                if flag == 'ignore':
+                    self.ignore_tokens.append(tokendef.name)
+                else:
+                    raise GrammarError("No such flag: %s" % flag)
+
+        self.lexer_conf = LexerConf([t[0] for t in tokens], self.ignore_tokens, self.options.postlex)
 
         if not self.options.only_lex:
-            self.parser_engine = ENGINE_DICT[self.options.parser]()
-            self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
             self.parser = self._build_parser()
-
-        self.lexer = self._build_lexer()
+        else:
+            self.lexer = self._build_lexer()
 
         if self.profiler: self.profiler.enter_section('outside_lark')
 
     __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
 
     def _build_lexer(self):
-        ignore_tokens = []
-        tokens = []
-        for tokendef, flags in self.tokens:
-            for flag in flags:
-                if flag == 'ignore':
-                    ignore_tokens.append(tokendef.name)
-                else:
-                    raise GrammarError("No such flag: %s" % flag)
-
-            tokens.append(tokendef)
-
-        return Lexer(tokens, ignore=ignore_tokens)
+        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore)
 
     def _build_parser(self):
+        self.parser_class = ENGINE_DICT[self.options.parser]
+        self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
         if self.profiler:
             for f in dir(callback):
-                if not f.startswith('__'):
+                if not (f.startswith('__') and f.endswith('__')):
                     setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
 
-        return self.parser_engine.build_parser(rules, callback, self.options.start)
+        parser_conf = ParserConf(rules, callback, self.options.start)
+
+        return self.parser_class(self.lexer_conf, parser_conf)
 
     def lex(self, text):
@@ -156,15 +157,17 @@ class Lark:
 
     def parse(self, text):
         assert not self.options.only_lex
-        if self.profiler:
-            self.profiler.enter_section('lex')
-            l = list(self.lex(text))
-            self.profiler.enter_section('parse')
-            try:
-                return self.parser.parse(l)
-            finally:
-                self.profiler.enter_section('outside_lark')
-        else:
-            l = list(self.lex(text))
-            return self.parser.parse(l)
+        return self.parser.parse(text)
+
+        # if self.profiler:
+        #     self.profiler.enter_section('lex')
+        #     l = list(self.lex(text))
+        #     self.profiler.enter_section('parse')
+        #     try:
+        #         return self.parser.parse(l)
+        #     finally:
+        #         self.profiler.enter_section('outside_lark')
+        # else:
+        #     l = list(self.lex(text))
+        #     return self.parser.parse(l)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index a3a4aca..78990e0 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -6,7 +6,7 @@ from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
 from .parsers.lalr_parser import UnexpectedToken
-from .common import is_terminal, GrammarError
+from .common import is_terminal, GrammarError, LexerConf, ParserConf
 from .tree import Tree as T, Transformer, InlineTransformer, Visitor
 
@@ -279,11 +279,12 @@ class ExtractAnonTokens(InlineTransformer):
 class GrammarLoader:
     def __init__(self):
         tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
-        self.lexer = Lexer(tokens, ignore=['WS', 'COMMENT'])
 
         d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
-        self.parser = LALR().build_parser(rules, callback, 'start')
+        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
+        parser_conf = ParserConf(rules, callback, 'start')
+        self.parser = LALR(lexer_conf, parser_conf)
 
         self.simplify_tree = SimplifyTree()
         self.simplify_rule = SimplifyRule_Visitor()
@@ -291,12 +292,9 @@ class GrammarLoader:
 
     def load_grammar(self, grammar_text):
         try:
-            token_stream = list(self.lexer.lex(grammar_text+"\n"))
+            tree = self.simplify_tree.transform( self.parser.parse(grammar_text+'\n') )
         except UnexpectedInput as e:
             raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
-
-        try:
-            tree = self.simplify_tree.transform( self.parser.parse(token_stream) )
         except UnexpectedToken as e:
             if '_COLON' in e.expected:
                 raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
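Because the frontend now raises both the lexer's UnexpectedInput and the parser's UnexpectedToken from a single parse() call, load_grammar's two try blocks above collapse into one try with two handlers. A standalone sketch of that consolidated pattern (the exception classes and parse stub are stand-ins, not Lark's real ones):

    class UnexpectedInput(Exception): pass   # stand-in for the lexer error
    class UnexpectedToken(Exception): pass   # stand-in for the parser error
    class GrammarError(Exception): pass      # domain error shown to the user

    def parse(text):
        # Stub frontend call: fails at the "parser" stage for demonstration.
        raise UnexpectedToken("demo failure")

    def load_grammar(grammar_text):
        # One call site; each failure stage maps to a readable domain error.
        try:
            return parse(grammar_text + "\n")
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input: %s" % e)
        except UnexpectedToken as e:
            raise GrammarError("Unexpected token: %s" % e)

    try:
        load_grammar("start: rule")
    except GrammarError as e:
        print(e)  # -> Unexpected token: demo failure
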
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index f9ae809..fc016a0 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,32 +1,55 @@
+from .lexer import Lexer
 from .parsers.lalr_analysis import GrammarAnalyzer
 from .common import is_terminal
 from .parsers import lalr_parser, earley
 
-class LALR:
-    def build_parser(self, rules, callback, start):
-        ga = GrammarAnalyzer(rules, start)
-        ga.analyze()
-        return lalr_parser.Parser(ga, callback)
+class WithLexer:
+    def __init__(self, lexer_conf):
+        self.lexer_conf = lexer_conf
+        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)
 
-class Earley:
-    @staticmethod
-    def _process_expansion(x):
-        return [{'literal': s} if is_terminal(s) else s for s in x]
+    def lex(self, text):
+        stream = self.lexer.lex(text)
+        if self.lexer_conf.postlex:
+            return self.lexer_conf.postlex.process(stream)
+        else:
+            return stream
 
-    def build_parser(self, rules, callback, start):
-        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
-        return EarleyParser(earley.Parser(rules, start))
+class LALR(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
 
-class EarleyParser:
-    def __init__(self, parser):
-        self.parser = parser
+        analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        analyzer.analyze()
+        self.parser = lalr_parser.Parser(analyzer, parser_conf.callback)
 
     def parse(self, text):
-        res = self.parser.parse(text)
+        tokens = list(self.lex(text))
+        return self.parser.parse(tokens)
+
+class Earley(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
+
+        rules = [{'name':n,
+                  'symbols': self._process_expansion(x),
+                  'postprocess': getattr(parser_conf.callback, a)}
+                 for n,x,a in parser_conf.rules]
+
+        self.parser = earley.Parser(rules, parser_conf.start)
+
+    def parse(self, text):
+        tokens = list(self.lex(text))
+        res = self.parser.parse(tokens)
         assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
         return res[0]
 
+    @staticmethod
+    def _process_expansion(x):
+        return [{'literal': s} if is_terminal(s) else s for s in x]
+
+
 ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
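Both frontends now share the (lexer_conf, parser_conf) constructor, so Lark._build_parser can pick an engine with a plain dict lookup, and anything exposing a process(stream) method can be plugged in as a post-lexer through LexerConf.postlex. A small sketch of both hooks (FilterTokens and build_frontend are hypothetical names, not part of the commit; real Lark tokens also carry a .type attribute, which plain strings stand in for here):

    # Hypothetical post-lexer: any object with a .process(stream) method
    # satisfies the duck-typed interface WithLexer.lex() calls when
    # lexer_conf.postlex is set.
    class FilterTokens:
        def __init__(self, drop):
            self.drop = drop  # set of token values to remove from the stream

        def process(self, stream):
            for tok in stream:
                if tok not in self.drop:
                    yield tok

    # Engine dispatch in the style of Lark._build_parser: ENGINE_DICT maps an
    # engine name to a frontend class with the shared two-config signature.
    def build_frontend(engine_dict, name, lexer_conf, parser_conf):
        return engine_dict[name](lexer_conf, parser_conf)

    postlex = FilterTokens(drop={'WS'})
    print(list(postlex.process(['hello', 'WS', 'world'])))  # -> ['hello', 'world']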