
Moved lexing responsibility to parser frontend

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan committed 7 years ago
parent
commit
7af3de208d
4 changed files with 92 additions and 55 deletions
  1. lark/common.py            +13  -0
  2. lark/lark.py              +35  -32
  3. lark/load_grammar.py       +5  -7
  4. lark/parser_frontends.py  +39  -16
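
Taken together, the change replaces the old two-step flow, in which Lark built a Lexer itself and handed a token list to an engine obtained from build_parser(), with two plain configuration objects, LexerConf and ParserConf, that are passed to a parser frontend; the frontend then owns lexing. A toy model of the new shape (illustrative only, not lark's real classes; the toy lexer just splits on whitespace):

    # Toy model of the architecture introduced in this commit.
    class LexerConf:
        def __init__(self, tokens, ignore, postlex):
            self.tokens, self.ignore, self.postlex = tokens, ignore, postlex

    class ParserConf:
        def __init__(self, rules, callback, start):
            self.rules, self.callback, self.start = rules, callback, start

    class ToyFrontend:
        # Receives both configs; callers now pass raw text, not token lists.
        def __init__(self, lexer_conf, parser_conf):
            self.lexer_conf = lexer_conf
            self.parser_conf = parser_conf

        def lex(self, text):
            # Stand-in for lark's Lexer: split on whitespace, drop ignored names.
            return [t for t in text.split() if t not in self.lexer_conf.ignore]

        def parse(self, text):
            tokens = self.lex(text)   # lexing happens inside the frontend
            return (self.parser_conf.start, tokens)

    frontend = ToyFrontend(LexerConf([], ['COMMENT'], None),
                           ParserConf([], None, 'start'))
    print(frontend.parse('a COMMENT b'))   # -> ('start', ['a', 'b'])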

lark/common.py  +13 -0

@@ -25,3 +25,16 @@ class UnexpectedToken(ParseError):
 def is_terminal(sym):
     return sym.isupper() or sym[0] == '$'
 
+
+class LexerConf:
+    def __init__(self, tokens, ignore, postlex):
+        self.tokens = tokens
+        self.ignore = ignore
+        self.postlex = postlex
+
+class ParserConf:
+    def __init__(self, rules, callback, start):
+        self.rules = rules
+        self.callback = callback
+        self.start = start
+

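LexerConf and ParserConf are deliberately plain value containers. A hypothetical construction, mirroring how lark.py and load_grammar.py wire them up below (the empty token and rule lists are placeholders):

    from lark.common import LexerConf, ParserConf

    # Placeholder values: in the real code, `tokens` comes from load_grammar()
    # and `rules`/`callback` from ParseTreeBuilder.create_tree_builder().
    lexer_conf = LexerConf(tokens=[], ignore=['WS', 'COMMENT'], postlex=None)
    parser_conf = ParserConf(rules=[], callback=None, start='start')
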
lark/lark.py  +35 -32

@@ -5,7 +5,7 @@ import os
 from .utils import STRING_TYPE, inline_args
 from .load_grammar import load_grammar
 from .tree import Tree, Transformer
-from .common import GrammarError
+from .common import GrammarError, LexerConf, ParserConf
 
 from .lexer import Lexer
 from .parse_tree_builder import ParseTreeBuilder
@@ -105,45 +105,46 @@ class Lark:
 
         assert isinstance(grammar, STRING_TYPE)
 
-        if self.options.cache_grammar:
+        if self.options.cache_grammar or self.options.keep_all_tokens:
             raise NotImplementedError("Not available yet")
 
         assert not self.options.profile, "Feature temporarily disabled"
         self.profiler = Profiler() if self.options.profile else None
 
-        self.tokens, self.rules = load_grammar(grammar)
+        tokens, self.rules = load_grammar(grammar)
+        self.ignore_tokens = []
+        for tokendef, flags in tokens:
+            for flag in flags:
+                if flag == 'ignore':
+                    self.ignore_tokens.append(tokendef.name)
+                else:
+                    raise GrammarError("No such flag: %s" % flag)
+
+        self.lexer_conf = LexerConf([t[0] for t in tokens], self.ignore_tokens, self.options.postlex)
 
         if not self.options.only_lex:
-            self.parser_engine = ENGINE_DICT[self.options.parser]()
-            self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
             self.parser = self._build_parser()
-
-        self.lexer = self._build_lexer()
+        else:
+            self.lexer = self._build_lexer()
 
         if self.profiler: self.profiler.enter_section('outside_lark')
 
     __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
 
     def _build_lexer(self):
-        ignore_tokens = []
-        tokens = []
-        for tokendef, flags in self.tokens:
-            for flag in flags:
-                if flag == 'ignore':
-                    ignore_tokens.append(tokendef.name)
-                else:
-                    raise GrammarError("No such flag: %s" % flag)
-
-            tokens.append(tokendef)
-
-        return Lexer(tokens, ignore=ignore_tokens)
+        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore)
 
     def _build_parser(self):
+        self.parser_class = ENGINE_DICT[self.options.parser]
+        self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
         if self.profiler:
             for f in dir(callback):
-                if not f.startswith('__'):
+                if not (f.startswith('__') and f.endswith('__')):
                     setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
-        return self.parser_engine.build_parser(rules, callback, self.options.start)
+        parser_conf = ParserConf(rules, callback, self.options.start)
+
+        return self.parser_class(self.lexer_conf, parser_conf)
 
 
     def lex(self, text):
@@ -156,15 +157,17 @@ class Lark:
     def parse(self, text):
         assert not self.options.only_lex
 
-        if self.profiler:
-            self.profiler.enter_section('lex')
-            l = list(self.lex(text))
-            self.profiler.enter_section('parse')
-            try:
-                return self.parser.parse(l)
-            finally:
-                self.profiler.enter_section('outside_lark')
-        else:
-            l = list(self.lex(text))
-            return self.parser.parse(l)
+        return self.parser.parse(text)
+
+        # if self.profiler:
+        #     self.profiler.enter_section('lex')
+        #     l = list(self.lex(text))
+        #     self.profiler.enter_section('parse')
+        #     try:
+        #         return self.parser.parse(l)
+        #     finally:
+        #         self.profiler.enter_section('outside_lark')
+        # else:
+        #     l = list(self.lex(text))
+        #     return self.parser.parse(l)
 
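The user-facing API is unchanged: Lark.parse() still takes raw text, but the text now travels straight to the frontend, which lexes internally (the profiler path is parked as comments until it is reworked). A usage sketch against this 0.5-era API; the grammar is illustrative:

    from lark import Lark

    # parser='lalr' selects the LALR frontend via ENGINE_DICT.
    l = Lark('start: "a"+', parser='lalr')
    tree = l.parse('aaa')   # raw text in; the frontend lexes it itself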

lark/load_grammar.py  +5 -7

@@ -6,7 +6,7 @@ from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
 from .parsers.lalr_parser import UnexpectedToken
-from .common import is_terminal, GrammarError
+from .common import is_terminal, GrammarError, LexerConf, ParserConf
 
 from .tree import Tree as T, Transformer, InlineTransformer, Visitor

@@ -279,11 +279,12 @@ class ExtractAnonTokens(InlineTransformer):
 class GrammarLoader:
     def __init__(self):
         tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
-        self.lexer = Lexer(tokens, ignore=['WS', 'COMMENT'])
 
         d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
-        self.parser = LALR().build_parser(rules, callback, 'start')
+        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
+        parser_conf = ParserConf(rules, callback, 'start')
+        self.parser = LALR(lexer_conf, parser_conf)
 
         self.simplify_tree = SimplifyTree()
         self.simplify_rule = SimplifyRule_Visitor()
@@ -291,12 +292,9 @@
 
     def load_grammar(self, grammar_text):
         try:
-            token_stream = list(self.lexer.lex(grammar_text+"\n"))
+            tree = self.simplify_tree.transform( self.parser.parse(grammar_text+'\n') )
         except UnexpectedInput as e:
             raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
-
-        try:
-            tree = self.simplify_tree.transform( self.parser.parse(token_stream) )
         except UnexpectedToken as e:
             if '_COLON' in e.expected:
                 raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))

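Because the frontend now lexes inside parse(), load_grammar() can no longer catch lexer errors around a separate lexing step; instead a single try wraps parser.parse(), with one except clause per failure layer. The same pattern, sketched with stand-in exception classes:

    class UnexpectedInput(Exception): pass    # stand-ins for lark's exceptions
    class UnexpectedToken(Exception): pass

    def load(parse, text):
        try:
            return parse(text + '\n')
        except UnexpectedInput:               # lexer-level failure
            raise SyntaxError('unexpected input')
        except UnexpectedToken:               # parser-level failure
            raise SyntaxError('unexpected token')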

lark/parser_frontends.py  +39 -16

@@ -1,32 +1,55 @@
 from .lexer import Lexer
 from .parsers.lalr_analysis import GrammarAnalyzer
 
 from .common import is_terminal
 from .parsers import lalr_parser, earley
 
-class LALR:
-    def build_parser(self, rules, callback, start):
-        ga = GrammarAnalyzer(rules, start)
-        ga.analyze()
-        return lalr_parser.Parser(ga, callback)
+class WithLexer:
+    def __init__(self, lexer_conf):
+        self.lexer_conf = lexer_conf
+        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)
 
-class Earley:
-    @staticmethod
-    def _process_expansion(x):
-        return [{'literal': s} if is_terminal(s) else s for s in x]
+    def lex(self, text):
+        stream = self.lexer.lex(text)
+        if self.lexer_conf.postlex:
+            return self.lexer_conf.postlex.process(stream)
+        else:
+            return stream
 
-    def build_parser(self, rules, callback, start):
-        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
-        return EarleyParser(earley.Parser(rules, start))
+class LALR(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
 
-class EarleyParser:
-    def __init__(self, parser):
-        self.parser = parser
+        analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        analyzer.analyze()
+        self.parser = lalr_parser.Parser(analyzer, parser_conf.callback)
 
     def parse(self, text):
-        res = self.parser.parse(text)
+        tokens = list(self.lex(text))
+        return self.parser.parse(tokens)
+
+class Earley(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
+
+        rules = [{'name':n,
+                  'symbols': self._process_expansion(x),
+                  'postprocess': getattr(parser_conf.callback, a)}
+                 for n,x,a in parser_conf.rules]
+
+        self.parser = earley.Parser(rules, parser_conf.start)
+
+    def parse(self, text):
+        tokens = list(self.lex(text))
+        res = self.parser.parse(tokens)
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
+    @staticmethod
+    def _process_expansion(x):
+        return [{'literal': s} if is_terminal(s) else s for s in x]
+
 
 
 ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }

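Note that WithLexer.lex() assumes nothing about the postlex object beyond a process(stream) method, so any stream transformer can be plugged in. A hypothetical postlex satisfying that interface:

    # Any object with a process(stream) method fits the postlex hook above.
    class FilterNewlines:
        def process(self, stream):
            for tok in stream:
                if tok != '\n':   # a real postlex might instead inject synthetic tokens
                    yield tok

    print(list(FilterNewlines().process(iter(['a', '\n', 'b']))))   # -> ['a', 'b']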
