@@ -4,12 +4,18 @@ import sys

 Py36 = (sys.version_info[:2] >= (3, 6))

+###{standalone
+
+def is_terminal(sym):
+    return sym.isupper()
+
 class GrammarError(Exception):
     pass

 class ParseError(Exception):
     pass

+###}
+
 class UnexpectedToken(ParseError):
     def __init__(self, token, expected, seq, index):
@@ -32,9 +38,6 @@ class UnexpectedToken(ParseError):

-def is_terminal(sym):
-    return sym.isupper()
-
 class LexerConf:
     def __init__(self, tokens, ignore=(), postlex=None):
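
(The `###{standalone` / `###}` pairs introduced throughout this diff mark regions to be copied into a generated standalone parser. The extraction tool itself is not part of this diff; below is a minimal sketch of how such markers could be consumed, assuming they simply delimit verbatim regions. `extract_sections` is a hypothetical name.)

# Hypothetical sketch only: the real extraction tool is not shown in this diff.
# Assumes '###{name' opens a section and '###}' closes the current one.
def extract_sections(source_lines):
    sections = {}                       # section name -> list of source lines
    current = None
    for line in source_lines:
        stripped = line.strip()
        if stripped.startswith('###{'):
            current = stripped[4:]      # e.g. 'standalone'
            sections.setdefault(current, [])
        elif stripped.startswith('###}'):
            current = None
        elif current is not None:
            sections[current].append(line)
    return {name: ''.join(lines) for name, lines in sections.items()}

demo = "###{standalone\nclass ParseError(Exception):\n    pass\n###}\n"
print(extract_sections(demo.splitlines(keepends=True))['standalone'])
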
@@ -166,8 +166,8 @@ class Lark:

     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
-        self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
-        callback = self.parse_tree_builder.apply(self.options.transformer)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+        callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):
                 if not (f.startswith('__') and f.endswith('__')):
@@ -5,6 +5,7 @@ import re

 from .utils import Str, classify
 from .common import is_terminal, PatternStr, PatternRE, TokenDef

+###{standalone

 class LexError(Exception):
     pass
@@ -48,10 +49,60 @@ class Token(Str):

     __hash__ = Str.__hash__

-class Regex:
-    def __init__(self, pattern, flags=()):
-        self.pattern = pattern
-        self.flags = flags
-
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+    def __init__(self, lexer):
+        self.lexer = lexer
+
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+
+        while True:
+            lexer = self.lexer
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, line_ctr.char_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        lexer = yield t
+                    line_ctr.feed(value, type_ in newline_types)
+                    break
+            else:
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                break
+
+###}

 def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
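
(A note on the generator protocol of `_Lex.lex`, since it is subtle: the loop re-reads `self.lexer` at the top of every iteration, so a consumer can switch token sets between yields by rebinding the `lexer` attribute. That is exactly what `ContextualLexer` does in the next hunk, via `l.lexer = self.lexers[self.parser_state]`. A minimal driving sketch follows; `_DummyLexer` is an illustrative stand-in, not lark API, providing only the `.mres` and `.callback` attributes the loop reads.)

import re

class _DummyLexer:                      # illustrative stand-in, not lark API
    def __init__(self):
        mre = re.compile('(?P<INT>[0-9]+)|(?P<WS>[ ]+)')
        # _Lex expects a list of (compiled_regex, {group_index: token_type}) pairs
        self.mres = [(mre, {i: name for name, i in mre.groupindex.items()})]
        self.callback = {}              # optional per-token-type hooks

l = _Lex(_DummyLexer())
for tok in l.lex('12 34', newline_types=[], ignore_types=['WS']):
    print(tok.type, tok)                # -> INT 12, then INT 34
    # l.lexer = other_lexer             # how a contextual consumer would switch
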
@@ -182,57 +233,3 @@ class ContextualLexer:

             l.lexer = self.lexers[self.parser_state]

-###{lexer
-
-class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
-        self.char_pos = 0
-        self.line = 1
-        self.column = 0
-        self.line_start_pos = 0
-
-    def feed(self, token, test_newline=True):
-        """Consume a token and calculate the new line & column.
-
-        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
-        """
-        if test_newline:
-            newlines = token.count(self.newline_char)
-            if newlines:
-                self.line += newlines
-                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
-
-        self.char_pos += len(token)
-        self.column = self.char_pos - self.line_start_pos
-
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer):
-        self.lexer = lexer
-
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = list(newline_types)
-        ignore_types = list(ignore_types)
-        line_ctr = LineCounter()
-
-        while True:
-            lexer = self.lexer
-            for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, line_ctr.char_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    if type_ not in ignore_types:
-                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                        if t.type in lexer.callback:
-                            t = lexer.callback[t.type](t)
-                        lexer = yield t
-                    line_ctr.feed(value, type_ in newline_types)
-                    break
-            else:
-                if line_ctr.char_pos < len(stream):
-                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                break
-
-###}
@@ -128,7 +128,7 @@ RULES = {

 class EBNF_to_BNF(InlineTransformer):
     def __init__(self):
-        self.new_rules = {}
+        self.new_rules = []
         self.rules_by_expr = {}
         self.prefix = 'anon'
         self.i = 0
@@ -141,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer):

         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
-        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
+        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        self.new_rules.append((new_name, tree, self.rule_options))
         self.rules_by_expr[expr] = t
         return t
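
(Worth noting why the container changed: as a list, `new_rules` keeps generated helper rules in creation order, which pre-3.6 dicts do not guarantee, and duplicate names are now caught by a single assert in `Grammar.compile` further down. A hypothetical minimal illustration of the new entry shape and that check:)

# Each entry is a (name, tree, options) triple; strings stand in for the
# T('expansions', ...) trees built above.
new_rules = []
new_rules.append(('__anon_star_0', '<expansions tree>', None))
new_rules.append(('__anon_plus_1', '<expansions tree>', None))

# Mirrors the collision assert added to Grammar.compile in this diff:
assert len(new_rules) == len({name for name, _t, _o in new_rules}), "Whoops, name collision"
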
@@ -390,12 +391,6 @@ def _interleave(l, item):

 def _choice_of_rules(rules):
     return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])

-def dict_update_safe(d1, d2):
-    for k, v in d2.items():
-        assert k not in d1
-        d1[k] = v
-
 class Grammar:
     def __init__(self, rule_defs, token_defs, ignore):
         self.token_defs = token_defs
@@ -468,38 +463,41 @@ class Grammar:

         # =================
         #  Compile Rules
         # =================

-        ebnf_to_bnf = EBNF_to_BNF()
-        simplify_rule = SimplifyRule_Visitor()
-
+        # 1. Pre-process terminals
         transformer = PrepareLiterals()
         if not lexer:
             transformer *= SplitLiterals()
         transformer *= ExtractAnonTokens(tokens)   # Adds to tokens

-        rules = {}
+        # 2. Convert EBNF to BNF (and apply step 1)
+        ebnf_to_bnf = EBNF_to_BNF()
+        rules = []
         for name, rule_tree, options in rule_defs:
-            assert name not in rules, name
             ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
             tree = transformer.transform(rule_tree)
-            rules[name] = ebnf_to_bnf.transform(tree), options
+            rules.append((name, ebnf_to_bnf.transform(tree), options))

-        dict_update_safe(rules, ebnf_to_bnf.new_rules)
+        rules += ebnf_to_bnf.new_rules
+        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

+        # 3. Compile tree to Rule objects
         rule_tree_to_text = RuleTreeToText()
-        new_rules = []
-        for origin, (tree, options) in rules.items():
+        simplify_rule = SimplifyRule_Visitor()
+
+        compiled_rules = []
+        for name, tree, options in rules:
             simplify_rule.visit(tree)
             expansions = rule_tree_to_text.transform(tree)

             for expansion, alias in expansions:
-                if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                if alias and name.startswith('_'):
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

-                rule = Rule(origin, expansion, alias, options)
-                new_rules.append(rule)
+                rule = Rule(name, expansion, alias, options)
+                compiled_rules.append(rule)

-        return tokens, new_rules, self.ignore
+        return tokens, compiled_rules, self.ignore
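
(After this reshuffle, `compile()` reads as three numbered steps and returns a flat, ordered list of `Rule` objects rather than a dict keyed by origin. A hedged usage sketch follows; it assumes `grammar` is an already-built `Grammar` instance and that `Rule` exposes attributes matching its `Rule(name, expansion, alias, options)` constructor call, neither of which is spelled out in this diff.)

# Internal-API sketch under the assumptions stated above.
tokens, compiled_rules, ignore = grammar.compile(lexer=True)
for rule in compiled_rules:
    print(rule.origin, rule.expansion, rule.alias)   # attribute names assumed
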
@@ -557,7 +555,7 @@ class GrammarLoader:

         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
-        callback = ParseTreeBuilder(rules, T).apply()
+        callback = ParseTreeBuilder(rules, T).create_callback()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, 'start')
@@ -3,6 +3,8 @@ from .utils import suppress

 from .lexer import Token
 from .grammar import Rule

+###{standalone
+
 class NodeBuilder:
     def __init__(self, tree_class, name):
         self.tree_class = tree_class
@@ -130,7 +132,7 @@ class ParseTreeBuilder:

             yield rule, wrapper_chain

-    def apply(self, transformer=None):
+    def create_callback(self, transformer=None):
         callback = Callback()

         for rule, wrapper_chain in self.rule_builders:
@@ -152,3 +154,5 @@ class ParseTreeBuilder:

             setattr(callback, internal_callback_name, f)

         return callback
+
+###}
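
(The `apply` to `create_callback` rename describes the method more honestly: it does not apply anything, it builds and returns a `Callback` namespace carrying one tree-building function per rule, attached via the `setattr` call above. A self-contained sketch of that pattern in plain Python, not lark's actual classes:)

class Callback(object):
    "Plain namespace; mirrors the object create_callback() returns."
    pass

def make_node_builder(rule_name):
    def build(children):
        return (rule_name, children)    # stand-in for Tree(rule_name, children)
    return build

callback = Callback()
for rule_name in ['start', 'expr']:
    setattr(callback, rule_name, make_node_builder(rule_name))

print(callback.expr(['a', 'b']))        # -> ('expr', ['a', 'b'])
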
@@ -3,7 +3,7 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

-from ..common import ParseError, UnexpectedToken
+from ..common import UnexpectedToken
 from .lalr_analysis import LALR_Analyzer, Shift
@@ -20,6 +20,8 @@ class Parser:

         self.parser = _Parser(analysis.parse_table, callbacks)
         self.parse = self.parser.parse

+###{standalone
+
 class _Parser:
     def __init__(self, parse_table, callbacks):
         self.states = parse_table.states
@@ -90,3 +92,5 @@ class _Parser:

                     return val
                 else:
                     reduce(arg)
+
+###}
@@ -7,6 +7,7 @@ from copy import deepcopy

 from .utils import inline_args

+###{standalone

 class Tree(object):
     def __init__(self, data, children):
         self.data = data
@@ -33,6 +34,7 @@ class Tree(object):

     def pretty(self, indent_str='  '):
         return ''.join(self._pretty(0, indent_str))
+###}

     def expand_kids_by_index(self, *indices):
         for i in sorted(indices, reverse=True):   # reverse so that changing tail won't affect indices
@@ -138,7 +140,7 @@ class TransformerChain(object):

     def __mul__(self, other):
         return TransformerChain(*self.transformers + (other,))

 class InlineTransformer(Transformer):