
More refactoring towards standalone

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.2
Erez Shinan 6 years ago
parent commit da1910f5b6
7 changed files with 96 additions and 88 deletions
  1. +6  -3   lark/common.py
  2. +2  -2   lark/lark.py
  3. +55 -58  lark/lexer.py
  4. +20 -22  lark/load_grammar.py
  5. +5  -1   lark/parse_tree_builder.py
  6. +5  -1   lark/parsers/lalr_parser.py
  7. +3  -1   lark/tree.py

+6 -3  lark/common.py

@@ -4,12 +4,18 @@ import sys


 Py36 = (sys.version_info[:2] >= (3, 6))
 
+###{standalone
+def is_terminal(sym):
+    return sym.isupper()
+
 class GrammarError(Exception):
     pass
 
 
 class ParseError(Exception):
     pass
 
+###}
 
 
 class UnexpectedToken(ParseError):
     def __init__(self, token, expected, seq, index):
@@ -32,9 +38,6 @@ class UnexpectedToken(ParseError):






-def is_terminal(sym):
-    return sym.isupper()
 
 
 class LexerConf:
     def __init__(self, tokens, ignore=(), postlex=None):


+2 -2  lark/lark.py

@@ -166,8 +166,8 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
 
-        self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
-        callback = self.parse_tree_builder.apply(self.options.transformer)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+        callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):
                 if not (f.startswith('__') and f.endswith('__')):


+55 -58  lark/lexer.py

@@ -5,6 +5,7 @@ import re
 from .utils import Str, classify
 from .common import is_terminal, PatternStr, PatternRE, TokenDef
 
+###{standalone
 class LexError(Exception):
     pass


@@ -48,10 +49,60 @@ class Token(Str):


     __hash__ = Str.__hash__
 
+
+class Regex:
+    def __init__(self, pattern, flags=()):
+        self.pattern = pattern
+        self.flags = flags
+
+
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+
+
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+    def __init__(self, lexer):
+        self.lexer = lexer
+
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+
+        while True:
+            lexer = self.lexer
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, line_ctr.char_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        lexer = yield t
+
+                    line_ctr.feed(value, type_ in newline_types)
+                    break
+            else:
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                break
+###}



 def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
@@ -182,57 +233,3 @@ class ContextualLexer:
         l.lexer = self.lexers[self.parser_state]
 
 
-
-###{lexer
-
-class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
-        self.char_pos = 0
-        self.line = 1
-        self.column = 0
-        self.line_start_pos = 0
-
-    def feed(self, token, test_newline=True):
-        """Consume a token and calculate the new line & column.
-
-        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
-        """
-        if test_newline:
-            newlines = token.count(self.newline_char)
-            if newlines:
-                self.line += newlines
-                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
-
-        self.char_pos += len(token)
-        self.column = self.char_pos - self.line_start_pos
-
-
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer):
-        self.lexer = lexer
-
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = list(newline_types)
-        ignore_types = list(ignore_types)
-        line_ctr = LineCounter()
-
-        while True:
-            lexer = self.lexer
-            for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, line_ctr.char_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    if type_ not in ignore_types:
-                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                        if t.type in lexer.callback:
-                            t = lexer.callback[t.type](t)
-                        lexer = yield t
-
-                    line_ctr.feed(value, type_ in newline_types)
-                    break
-            else:
-                if line_ctr.char_pos < len(stream):
-                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                break
-###}
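
The relocated `LineCounter` is self-contained, so its bookkeeping is easy to verify in isolation. A quick sketch of feeding it two tokens (the values in the comments follow directly from the `feed` logic above):

```python
lc = LineCounter()
lc.feed('if x')      # no newline: char_pos=4, line=1, column=4
lc.feed('\n    ')    # one newline: char_pos=9, line=2, column=4
assert (lc.line, lc.column) == (2, 4)
```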

+20 -22  lark/load_grammar.py

@@ -128,7 +128,7 @@ RULES = {


 class EBNF_to_BNF(InlineTransformer):
     def __init__(self):
-        self.new_rules = {}
+        self.new_rules = []
         self.rules_by_expr = {}
         self.prefix = 'anon'
         self.i = 0
@@ -141,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer):
         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
-        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
+        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        self.new_rules.append((new_name, tree, self.rule_options))
         self.rules_by_expr[expr] = t
         return t


@@ -390,12 +391,6 @@ def _interleave(l, item):
 def _choice_of_rules(rules):
     return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
 
-def dict_update_safe(d1, d2):
-    for k, v in d2.items():
-        assert k not in d1
-        d1[k] = v
-
 
 class Grammar:
     def __init__(self, rule_defs, token_defs, ignore):
         self.token_defs = token_defs
@@ -468,38 +463,41 @@ class Grammar:
         # =================
         #  Compile Rules
         # =================
-        ebnf_to_bnf = EBNF_to_BNF()
-        simplify_rule = SimplifyRule_Visitor()
 
+        # 1. Pre-process terminals
         transformer = PrepareLiterals()
         if not lexer:
             transformer *= SplitLiterals()
         transformer *= ExtractAnonTokens(tokens)   # Adds to tokens
 
-        rules = {}
+        # 2. Convert EBNF to BNF (and apply step 1)
+        ebnf_to_bnf = EBNF_to_BNF()
+        rules = []
         for name, rule_tree, options in rule_defs:
-            assert name not in rules, name
             ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
             tree = transformer.transform(rule_tree)
-            rules[name] = ebnf_to_bnf.transform(tree), options
+            rules.append((name, ebnf_to_bnf.transform(tree), options))
+        rules += ebnf_to_bnf.new_rules
 
-        dict_update_safe(rules, ebnf_to_bnf.new_rules)
+        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
 
+        # 3. Compile tree to Rule objects
         rule_tree_to_text = RuleTreeToText()
+        simplify_rule = SimplifyRule_Visitor()
 
-        new_rules = []
-        for origin, (tree, options) in rules.items():
+        compiled_rules = []
+        for name, tree, options in rules:
             simplify_rule.visit(tree)
             expansions = rule_tree_to_text.transform(tree)
 
             for expansion, alias in expansions:
-                if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                if alias and name.startswith('_'):
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
 
-                rule = Rule(origin, expansion, alias, options)
-                new_rules.append(rule)
+                rule = Rule(name, expansion, alias, options)
+                compiled_rules.append(rule)
 
-        return tokens, new_rules, self.ignore
+        return tokens, compiled_rules, self.ignore






@@ -557,7 +555,7 @@ class GrammarLoader:


         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
-        callback = ParseTreeBuilder(rules, T).apply()
+        callback = ParseTreeBuilder(rules, T).create_callback()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')
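
Switching from a dict keyed by rule name to an ordered list of (name, tree, options) triples preserves definition order and lets the helper rules generated by EBNF_to_BNF be appended with a plain `+=`; the old `dict_update_safe` guard becomes a single uniqueness assert. The check in isolation, with illustrative data:

```python
rules = [('start', 'tree_a', None),
         ('item', 'tree_b', None),
         ('__anon_star_0', 'tree_c', None)]   # last entry appended by EBNF_to_BNF
# every rule name, hand-written or generated, must be unique
assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
```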


+5 -1  lark/parse_tree_builder.py

@@ -3,6 +3,8 @@ from .utils import suppress
 from .lexer import Token
 from .grammar import Rule
 
+###{standalone
+
 class NodeBuilder:
     def __init__(self, tree_class, name):
         self.tree_class = tree_class
@@ -130,7 +132,7 @@ class ParseTreeBuilder:
             yield rule, wrapper_chain
 
 
-    def apply(self, transformer=None):
+    def create_callback(self, transformer=None):
         callback = Callback()
 
         for rule, wrapper_chain in self.rule_builders:
@@ -152,3 +154,5 @@ class ParseTreeBuilder:
             setattr(callback, internal_callback_name, f)
 
         return callback
+
+###}
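
The rename from `apply` to `create_callback` says what the builder actually does: manufacture the callback object handed to the parser. The call shape, mirroring the two updated call sites (`Lark._build_parser` and the grammar loader above) rather than a standalone snippet:

```python
callback = ParseTreeBuilder(rules, Tree).create_callback()   # was: .apply()
parser_conf = ParserConf(rules, callback, 'start')
```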

+5 -1  lark/parsers/lalr_parser.py

@@ -3,7 +3,7 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
 
-from ..common import ParseError, UnexpectedToken
+from ..common import UnexpectedToken
 
 from .lalr_analysis import LALR_Analyzer, Shift


@@ -20,6 +20,8 @@ class Parser:
         self.parser = _Parser(analysis.parse_table, callbacks)
         self.parse = self.parser.parse
 
+###{standalone
+
 class _Parser:
     def __init__(self, parse_table, callbacks):
         self.states = parse_table.states
@@ -90,3 +92,5 @@ class _Parser:
                 return val
             else:
                 reduce(arg)
+
+###}

+3 -1  lark/tree.py

@@ -7,6 +7,7 @@ from copy import deepcopy


 from .utils import inline_args
 
+###{standalone
 class Tree(object):
     def __init__(self, data, children):
         self.data = data
@@ -33,6 +34,7 @@ class Tree(object):


     def pretty(self, indent_str='  '):
         return ''.join(self._pretty(0, indent_str))
+###}
 
     def expand_kids_by_index(self, *indices):
         for i in sorted(indices, reverse=True):   # reverse so that changing tail won't affect indices
@@ -138,7 +140,7 @@ class TransformerChain(object):


     def __mul__(self, other):
         return TransformerChain(*self.transformers + (other,))
 
 
 class InlineTransformer(Transformer):
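
A small illustration of the `Tree` type now exposed to the standalone build; the expected output assumes the two-space `indent_str` default and the `_pretty` helper referenced above:

```python
t = Tree('start', [Tree('item', []), Tree('item', [])])
print(t.pretty())
# start
#   item
#   item
```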

