@@ -4,12 +4,18 @@ import sys
 Py36 = (sys.version_info[:2] >= (3, 6))
+###{standalone
+def is_terminal(sym):
+    return sym.isupper()
 class GrammarError(Exception):
     pass
 class ParseError(Exception):
     pass
+###}
 class UnexpectedToken(ParseError):
     def __init__(self, token, expected, seq, index):
@@ -32,9 +38,6 @@ class UnexpectedToken(ParseError):
-def is_terminal(sym):
-    return sym.isupper()
 class LexerConf:
     def __init__(self, tokens, ignore=(), postlex=None):
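Note on the ###{standalone / ###} markers added throughout this patch: they appear to delimit regions meant to be copied into a generated standalone module. Below is a minimal sketch of how such marked regions could be collected; the marker syntax comes from this diff, but the helper itself is a hypothetical illustration, not part of the patch.

```python
import re

# Hypothetical helper (not in this patch): collect every "###{name ... ###}"
# region of a source file, keyed by its name (e.g. 'standalone').
SECTION_RE = re.compile(r'###\{(\w+)\n(.*?)^###\}', re.DOTALL | re.MULTILINE)

def extract_sections(source_text):
    return {name: body for name, body in SECTION_RE.findall(source_text)}

# usage: extract_sections(source_text).get('standalone')  # code between the markers, if any
```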
@@ -166,8 +166,8 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
-        self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
-        callback = self.parse_tree_builder.apply(self.options.transformer)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+        callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):
                 if not (f.startswith('__') and f.endswith('__')):
@@ -5,6 +5,7 @@ import re
 from .utils import Str, classify
 from .common import is_terminal, PatternStr, PatternRE, TokenDef
+###{standalone
 class LexError(Exception):
     pass
@@ -48,10 +49,60 @@ class Token(Str):
     __hash__ = Str.__hash__
+class Regex:
+    def __init__(self, pattern, flags=()):
+        self.pattern = pattern
+        self.flags = flags
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+    def __init__(self, lexer):
+        self.lexer = lexer
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+        while True:
+            lexer = self.lexer
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, line_ctr.char_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        lexer = yield t
+                    line_ctr.feed(value, type_ in newline_types)
+                    break
+            else:
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                break
+###}
 def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
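For orientation, _Lex.lex relies on only two attributes of its lexer object: mres, a list of (compiled_regex, {group_index: token_type}) pairs, and callback, a dict of per-type post-processing hooks; that narrow interface is what lets it serve both Lexer and ContextualLexer. The following hedged, self-contained sketch drives the generator with a hand-built stand-in (the toy regex and token types are invented for illustration):

```python
import re
from types import SimpleNamespace

# Toy stand-in for a Lexer: one combined regex with named groups, plus a map
# from match-group index back to the token type name, as _Lex.lex expects.
mre = re.compile(r'(?P<INT>[0-9]+)|(?P<PLUS>\+)|(?P<WS>[ ]+)')
type_from_index = {index: name for name, index in mre.groupindex.items()}
toy_lexer = SimpleNamespace(mres=[(mre, type_from_index)], callback={})

# WS is declared ignorable, so only INT and PLUS tokens are yielded.
tokens = list(_Lex(toy_lexer).lex('1 + 23', newline_types=[], ignore_types=['WS']))
# tokens is roughly [Token('INT', '1'), Token('PLUS', '+'), Token('INT', '23')]
```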
@@ -182,57 +233,3 @@ class ContextualLexer:
             l.lexer = self.lexers[self.parser_state]
-###{lexer
-class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
-        self.char_pos = 0
-        self.line = 1
-        self.column = 0
-        self.line_start_pos = 0
-    def feed(self, token, test_newline=True):
-        """Consume a token and calculate the new line & column.
-        As an optional optimization, set test_newline=False if token doesn't contain a newline.
-        """
-        if test_newline:
-            newlines = token.count(self.newline_char)
-            if newlines:
-                self.line += newlines
-                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
-        self.char_pos += len(token)
-        self.column = self.char_pos - self.line_start_pos
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer):
-        self.lexer = lexer
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = list(newline_types)
-        ignore_types = list(ignore_types)
-        line_ctr = LineCounter()
-        while True:
-            lexer = self.lexer
-            for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, line_ctr.char_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    if type_ not in ignore_types:
-                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                        if t.type in lexer.callback:
-                            t = lexer.callback[t.type](t)
-                        lexer = yield t
-                    line_ctr.feed(value, type_ in newline_types)
-                    break
-            else:
-                if line_ctr.char_pos < len(stream):
-                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                break
-###}
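A small usage sketch for the LineCounter moved above (values worked out from its feed logic): column restarts after each newline, and test_newline=False is the documented fast path for chunks known to contain no newline.

```python
lc = LineCounter()
lc.feed('hello')                       # no newline
assert (lc.line, lc.column, lc.char_pos) == (1, 5, 5)
lc.feed('\nworld')                     # one newline; column counts from after it
assert (lc.line, lc.column, lc.char_pos) == (2, 5, 11)
lc.feed('123', test_newline=False)     # caller guarantees there is no '\n'
assert (lc.line, lc.column) == (2, 8)
```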
@@ -128,7 +128,7 @@ RULES = {
 class EBNF_to_BNF(InlineTransformer):
     def __init__(self):
-        self.new_rules = {}
+        self.new_rules = []
         self.rules_by_expr = {}
         self.prefix = 'anon'
         self.i = 0
@@ -141,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer):
         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
-        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
+        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        self.new_rules.append((new_name, tree, self.rule_options))
         self.rules_by_expr[expr] = t
         return t
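For context on what is stored here: when the transformer meets a repetition such as item+, it mints a helper rule roughly equivalent to __anon_plus_0: item | __anon_plus_0 item (the expansions/expansion tree built above). Keeping these helpers as ordered (name, tree, options) tuples in a list, rather than as dict entries, lets Grammar.compile simply extend its own rule list and defer duplicate-name detection to the explicit assert added below.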
@@ -390,12 +391,6 @@ def _interleave(l, item):
 def _choice_of_rules(rules):
     return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
-def dict_update_safe(d1, d2):
-    for k, v in d2.items():
-        assert k not in d1
-        d1[k] = v
 class Grammar:
     def __init__(self, rule_defs, token_defs, ignore):
         self.token_defs = token_defs
@@ -468,38 +463,41 @@ class Grammar:
         # =================
         #  Compile Rules
         # =================
-        ebnf_to_bnf = EBNF_to_BNF()
-        simplify_rule = SimplifyRule_Visitor()
+        # 1. Pre-process terminals
         transformer = PrepareLiterals()
         if not lexer:
             transformer *= SplitLiterals()
         transformer *= ExtractAnonTokens(tokens)   # Adds to tokens
-        rules = {}
+        # 2. Convert EBNF to BNF (and apply step 1)
+        ebnf_to_bnf = EBNF_to_BNF()
+        rules = []
         for name, rule_tree, options in rule_defs:
-            assert name not in rules, name
             ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
             tree = transformer.transform(rule_tree)
-            rules[name] = ebnf_to_bnf.transform(tree), options
+            rules.append((name, ebnf_to_bnf.transform(tree), options))
-        dict_update_safe(rules, ebnf_to_bnf.new_rules)
+        rules += ebnf_to_bnf.new_rules
+        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
+        # 3. Compile tree to Rule objects
         rule_tree_to_text = RuleTreeToText()
-        new_rules = []
-        for origin, (tree, options) in rules.items():
+        simplify_rule = SimplifyRule_Visitor()
+        compiled_rules = []
+        for name, tree, options in rules:
             simplify_rule.visit(tree)
             expansions = rule_tree_to_text.transform(tree)
             for expansion, alias in expansions:
-                if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                if alias and name.startswith('_'):
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
-                rule = Rule(origin, expansion, alias, options)
-                new_rules.append(rule)
+                rule = Rule(name, expansion, alias, options)
+                compiled_rules.append(rule)
-        return tokens, new_rules, self.ignore
+        return tokens, compiled_rules, self.ignore
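Net effect of this hunk: the intermediate rules collection is now an ordered list of (name, tree, options) triples rather than a dict, so dict_update_safe is no longer needed; the uniqueness guarantee it provided is carried by the name-collision assert, and the body is annotated as three numbered steps (pre-process terminals, convert EBNF to BNF, compile to Rule objects).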
@@ -557,7 +555,7 @@ class GrammarLoader:
         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
-        callback = ParseTreeBuilder(rules, T).apply()
+        callback = ParseTreeBuilder(rules, T).create_callback()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, 'start')
@@ -3,6 +3,8 @@ from .utils import suppress
 from .lexer import Token
 from .grammar import Rule
+###{standalone
 class NodeBuilder:
     def __init__(self, tree_class, name):
         self.tree_class = tree_class
@@ -130,7 +132,7 @@ class ParseTreeBuilder:
             yield rule, wrapper_chain
-    def apply(self, transformer=None):
+    def create_callback(self, transformer=None):
         callback = Callback()
         for rule, wrapper_chain in self.rule_builders:
@@ -152,3 +154,5 @@ class ParseTreeBuilder:
             setattr(callback, internal_callback_name, f)
         return callback
+###}
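Because apply is renamed here, the two in-tree call sites are updated elsewhere in this patch (Lark._build_parser and GrammarLoader); any out-of-tree caller would need the same one-line change. A hedged sketch, with rules and Tree standing in for whatever the caller already passes:

```python
builder = ParseTreeBuilder(rules, Tree)   # arguments as at the call sites above
callback = builder.create_callback()      # previously: builder.apply()
```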
@@ -3,7 +3,7 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from ..common import ParseError, UnexpectedToken
+from ..common import UnexpectedToken
 from .lalr_analysis import LALR_Analyzer, Shift
@@ -20,6 +20,8 @@ class Parser:
         self.parser = _Parser(analysis.parse_table, callbacks)
         self.parse = self.parser.parse
+###{standalone
 class _Parser:
     def __init__(self, parse_table, callbacks):
         self.states = parse_table.states
@@ -90,3 +92,5 @@ class _Parser:
             return val
         else:
             reduce(arg)
+###}
@@ -7,6 +7,7 @@ from copy import deepcopy
 from .utils import inline_args
+###{standalone
 class Tree(object):
     def __init__(self, data, children):
         self.data = data
@@ -33,6 +34,7 @@ class Tree(object):
     def pretty(self, indent_str=' '):
         return ''.join(self._pretty(0, indent_str))
+###}
     def expand_kids_by_index(self, *indices):
         for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
@@ -138,7 +140,7 @@ class TransformerChain(object):
     def __mul__(self, other):
         return TransformerChain(*self.transformers + (other,))
 class InlineTransformer(Transformer):