This repo contains the code used to mirror other repos, as well as the code that is being mirrored.

"Parses and creates Grammar objects"

import os.path
from itertools import chain
import re
from ast import literal_eval
from copy import deepcopy

from .lexer import Token, UnexpectedInput
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*][?]?|[?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/i?',
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*',
    '_TO': '->',
    '_IGNORE': r'%ignore',
    '_IMPORT': r'%import',
    'NUMBER': r'\d+',
}
RULES = {
    'start': ['_list'],
    '_list': ['_item', '_list _item'],
    '_item': ['rule', 'token', 'statement', '_NL'],

    'rule': ['RULE _COLON expansions _NL',
             'RULE _DOT NUMBER _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'name',
              'literal',
              'range'],

    '?name': ['RULE', 'TOKEN'],

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOT _DOT STRING'],

    'token': ['TOKEN _COLON expansions _NL'],
    'statement': ['ignore', 'import'],
    'ignore': ['_IGNORE expansions _NL'],
    'import': ['_IMPORT import_args _NL',
               '_IMPORT import_args _TO TOKEN'],
    'import_args': ['_import_args'],
    '_import_args': ['name', '_import_args _DOT name'],
    'literal': ['REGEXP', 'STRING'],
}
class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0
        self.rule_options = None

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i==j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten
class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


class CanonizeTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]
class ExtractAnonTokens(InlineTransformer):
    "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them"

    def __init__(self, tokens):
        self.tokens = tokens
        self.token_set = {td.name for td in self.tokens}
        self.token_reverse = {td.pattern: td for td in tokens}
        self.i = 0

    def pattern(self, p):
        value = p.value
        if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

        if isinstance(p, PatternStr):
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[p].name
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set:
                        token_name = '%s%d' % (value.upper(), self.i)
                        try:
                            # Make sure we don't have unicode in our token names
                            token_name.encode('ascii')
                        except UnicodeEncodeError:
                            token_name = 'ANONSTR_%d' % self.i
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
                token_name = '__' + token_name

        elif isinstance(p, PatternRE):
            if p in self.token_reverse:  # Kind of a weird placement
                token_name = self.token_reverse[p].name
            else:
                token_name = 'ANONRE_%d' % self.i
                self.i += 1
        else:
            assert False, p

        if token_name not in self.token_set:
            assert p not in self.token_reverse
            self.token_set.add(token_name)
            tokendef = TokenDef(token_name, p)
            self.token_reverse[p] = tokendef
            self.tokens.append(tokendef)

        return Token('TOKEN', token_name, -1)
def _literal_to_pattern(literal):
    v = literal.value
    if v[-1] in 'i':
        flags = v[-1]
        v = v[:-1]
    else:
        flags = None

    assert v[0] == v[-1] and v[0] in '"/'
    x = v[1:-1].replace("'", r"\'")
    s = literal_eval("u'''%s'''" % x)
    return { 'STRING': PatternStr,
             'REGEXP': PatternRE }[literal.type](s, flags)


class PrepareLiterals(InlineTransformer):
    def literal(self, literal):
        return T('pattern', [_literal_to_pattern(literal)])

    def range(self, start, end):
        assert start.type == end.type == 'STRING'
        start = start.value[1:-1]
        end = end.value[1:-1]
        assert len(start) == len(end) == 1
        regexp = '[%s-%s]' % (start, end)
        return T('pattern', [PatternRE(regexp)])


class SplitLiterals(InlineTransformer):
    def pattern(self, p):
        if isinstance(p, PatternStr) and len(p.value)>1:
            return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
        return T('pattern', [p])
class TokenTreeToPattern(Transformer):
    def pattern(self, ps):
        p ,= ps
        return p

    def expansion(self, items):
        if len(items) == 1:
            return items[0]
        if len({i.flags for i in items}) > 1:
            raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags)

    def expansions(self, exps):
        if len(exps) == 1:
            return exps[0]
        assert all(i.flags is None for i in exps)
        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))

    def expr(self, args):
        inner, op = args
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
def _interleave(l, item):
    for e in l:
        yield e
        if isinstance(e, T):
            if e.data in ('literal', 'range'):
                yield item
        elif is_terminal(e):
            yield item


def _choice_of_rules(rules):
    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])


def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v
class Grammar:
    def __init__(self, rule_defs, token_defs, ignore):
        self.token_defs = token_defs
        self.rule_defs = rule_defs
        self.ignore = ignore

    def _prepare_scanless_grammar(self, start):
        # XXX Pretty hacky! There should be a better way to write this method..

        rule_defs = deepcopy(self.rule_defs)
        term_defs = self.token_defs

        # Implement the "%ignore" feature without a lexer..
        terms_to_ignore = {name:'__'+name for name in self.ignore}
        if terms_to_ignore:
            assert set(terms_to_ignore) <= {name for name, t in term_defs}
            term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]

            expr = Token('RULE', '__ignore')
            for r, tree, _o in rule_defs:
                for exp in tree.find_data('expansion'):
                    exp.children = list(_interleave(exp.children, expr))
                    if r == start:
                        exp.children = [expr] + exp.children

                for exp in tree.find_data('expr'):
                    exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))

            _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
            rule_defs.append(('__ignore', _ignore_tree, None))

        # Convert all tokens to rules
        new_terminal_names = {name: '__token_'+name for name, tree in term_defs}

        for name, tree, options in rule_defs:
            for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
                for i, sym in enumerate(exp.children):
                    if sym in new_terminal_names:
                        exp.children[i] = Token(sym.type, new_terminal_names[sym])

        for name, tree in term_defs:
            if name.startswith('_'):
                options = RuleOptions(filter_out=True)
            else:
                options = RuleOptions(keep_all_tokens=True, create_token=name)

            name = new_terminal_names[name]
            inner_name = name + '_inner'
            rule_defs.append((name, _choice_of_rules([inner_name]), None))
            rule_defs.append((inner_name, tree, options))

        return [], rule_defs

    def compile(self, lexer=False, start=None):
        if not lexer:
            token_defs, rule_defs = self._prepare_scanless_grammar(start)
        else:
            token_defs = list(self.token_defs)
            rule_defs = self.rule_defs

        # =================
        #  Compile Tokens
        # =================

        # Convert token-trees to strings/regexps
        transformer = PrepareLiterals() * TokenTreeToPattern()
        tokens = [TokenDef(name, transformer.transform(token_tree))
                  for name, token_tree in token_defs]

        # =================
        #  Compile Rules
        # =================
        ebnf_to_bnf = EBNF_to_BNF()
        simplify_rule = SimplifyRule_Visitor()

        transformer = PrepareLiterals()
        if not lexer:
            transformer *= SplitLiterals()
        transformer *= ExtractAnonTokens(tokens)   # Adds to tokens

        rules = {}
        for name, rule_tree, options in rule_defs:
            assert name not in rules, name
            ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
            tree = transformer.transform(rule_tree)
            rules[name] = ebnf_to_bnf.transform(tree), options

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for tree, _o in rules.values():
            simplify_rule.visit(tree)

        rule_tree_to_text = RuleTreeToText()
        rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()}

        return tokens, rules, self.ignore
class RuleOptions:
    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
        self.keep_all_tokens = keep_all_tokens
        self.expand1 = expand1
        self.create_token = create_token    # used for scanless postprocessing
        self.priority = priority

        self.filter_out = filter_out        # remove this rule from the tree
                                            # used for "token"-rules in scanless

    @classmethod
    def from_rule(cls, name, *x):
        if len(x) > 1:
            priority, expansions = x
            priority = int(priority)
        else:
            expansions ,= x
            priority = None

        keep_all_tokens = name.startswith('!')
        name = name.lstrip('!')
        expand1 = name.startswith('?')
        name = name.lstrip('?')

        return name, expansions, cls(keep_all_tokens, expand1, priority=priority)


_imported_grammars = {}
def import_grammar(grammar_path):
    if grammar_path not in _imported_grammars:
        for import_path in IMPORT_PATHS:
            with open(os.path.join(import_path, grammar_path)) as f:
                text = f.read()
            grammar = load_grammar(text, grammar_path)
            _imported_grammars[grammar_path] = grammar

    return _imported_grammars[grammar_path]
def resolve_token_references(token_defs):
    # TODO Cycles detection
    # TODO Solve with transitive closure (maybe)

    token_dict = dict(token_defs)
    assert len(token_dict) == len(token_defs), "Same name defined twice?"

    while True:
        changed = False

        for name, token_tree in token_defs:
            for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
                for i, item in enumerate(exp.children):
                    if isinstance(item, Token):
                        if item.type == 'RULE':
                            raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
                        if item.type == 'TOKEN':
                            exp.children[i] = token_dict[item]
                            changed = True

        if not changed:
            break
class GrammarLoader:
    def __init__(self):
        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]

        rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()]
        d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
        parser_conf = ParserConf(rules, callback, 'start')

        self.parser = LALR(lexer_conf, parser_conf)

        self.canonize_tree = CanonizeTree()

    def load_grammar(self, grammar_text, name='<?>'):
        "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
        try:
            tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
        except UnexpectedToken as e:
            if e.expected == ['_COLON']:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif e.expected == ['RULE']:
                raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column))
            elif 'STRING' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            elif e.expected == ['_OR']:
                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
            raise

        # Extract grammar items
        token_defs = [c.children for c in tree.children if c.data=='token']
        rule_defs = [c.children for c in tree.children if c.data=='rule']
        statements = [c.children for c in tree.children if c.data=='statement']
        assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)

        token_defs = [(name.value, t) for name, t in token_defs]

        # Execute statements
        ignore = []
        for (stmt,) in statements:
            if stmt.data == 'ignore':
                expansions ,= stmt.children
                ignore.append(expansions)
            elif stmt.data == 'import':
                dotted_path = stmt.children[0].children
                name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
                grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
                g = import_grammar(grammar_path)
                token_tree = dict(g.token_defs)[dotted_path[-1]]
                token_defs.append([name.value, token_tree])
            else:
                assert False, stmt

        # Verify correctness 1
        for name, _ in token_defs:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

        # Handle ignore tokens
        ignore_defs = [('__IGNORE_%d'%i, t) for i, t in enumerate(ignore)]
        ignore_names = [name for name,_ in ignore_defs]
        token_defs += ignore_defs

        # Verify correctness 2
        token_names = set()
        for name, _ in token_defs:
            if name in token_names:
                raise GrammarError("Token '%s' defined more than once" % name)
            token_names.add(name)

        # Resolve token references
        resolve_token_references(token_defs)

        rules = [RuleOptions.from_rule(*x) for x in rule_defs]

        rule_names = set()
        for name, _x, _o in rules:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
            if name in rule_names:
                raise GrammarError("Rule '%s' defined more than once" % name)
            rule_names.add(name)

        for name, expansions, _o in rules:
            used_symbols = {t for x in expansions.find_data('expansion')
                              for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
            for sym in used_symbols:
                if is_terminal(sym):
                    if sym not in token_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                else:
                    if sym not in rule_names:
                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

        # TODO don't include unused tokens, they can only cause trouble!

        return Grammar(rules, token_defs, ignore_names)


load_grammar = GrammarLoader().load_grammar
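
For orientation, here is a rough usage sketch of the module's entry point. It is illustrative only: the import path (lark.load_grammar), the toy grammar, and the '<example>' name are assumptions, not part of the file above, and the exact return shapes depend on the surrounding Lark version. In normal use you would go through the top-level Lark class rather than calling load_grammar directly.

# Hypothetical usage sketch -- assumes this file is importable as lark.load_grammar
# and that Grammar.compile() behaves as in the source above.
from lark.load_grammar import load_grammar

example_grammar = '''
start: WORD "," WORD
WORD: /[a-z]+/
%ignore " "
'''

g = load_grammar(example_grammar, '<example>')   # parse and verify, returns a Grammar object
tokens, rules, ignore = g.compile(lexer=True)    # compile token defs and BNF rules for a standard lexer
print([t.name for t in tokens])                  # WORD plus generated anonymous/ignore tokens
print(sorted(rules))                             # rule names mapped to (expansions, options)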