This repo contains the code used to mirror other repos, as well as the code being mirrored.
 
 

540 lines · 18 KiB

import os.path
from itertools import chain
import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
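
# Readable names for punctuation and whitespace literals; ExtractAnonTokens uses
# this table to give anonymous string tokens a friendly name (e.g. "," -> __COMMA).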
_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
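
# Tokens and rules of the meta-grammar (the language in which grammars themselves
# are written). GrammarLoader below bootstraps a LALR parser from these tables.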
# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*',
    '_TO': '->',
    '_IGNORE': r'%ignore',
    '_IMPORT': r'%import',
}

RULES = {
    'start': ['_list'],
    '_list': ['_item', '_list _item'],
    '_item': ['rule', 'token', 'statement', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'name',
              'tokenvalue',
              'range'],

    '?name': ['RULE', 'TOKEN'],

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOT _DOT STRING'],

    'token': ['TOKEN _COLON expansions _NL'],
    'statement': ['ignore', 'import'],
    'ignore': ['_IGNORE expansions _NL'],
    'import': ['_IMPORT import_args _NL',
               '_IMPORT import_args _TO TOKEN'],
    'import_args': ['_import_args'],
    '_import_args': ['name', '_import_args _DOT name'],
    'tokenvalue': ['REGEXP', 'STRING'],
}
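
# Rewrites EBNF operators into plain BNF: `x?` becomes an alternation with an
# empty expansion, while `x+` and `x*` get a generated left-recursive helper rule
# (named __anon_plus_N / __anon_star_N). The helper rules are collected in
# self.new_rules and merged into the rule set by Grammar.compile().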
class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op

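
# Normalizes rule trees after EBNF expansion: _flatten merges nested nodes of the
# same kind, expansion() distributes inline alternation -- `a : b (c|d) e` becomes
# `a : b c e | b d e` -- and alias() pushes a rule alias down onto each alternative.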
class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i==j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v

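
# Converts a simplified rule tree into the plain (list-of-symbol-names, alias)
# pairs that the parse-tree builder expects.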
class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

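
# Post-processes the meta-grammar parse tree: `[x]` (maybe) is rewritten as `x?`,
# and token modifiers are collected into a flat list.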
class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

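
# Replaces anonymous strings and regexps that appear inside rules with named
# TokenDefs, reusing an existing token definition whenever the pattern already
# has a name. String tokens get a '__'-prefixed name (e.g. __COMMA, __ANONSTR_0);
# anonymous regexps get ANONRE_N.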
class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens):
        self.tokens = tokens
        self.token_set = {td.name for td in self.tokens}
        self.str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
        self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
        self.i = 0

    def tokenvalue(self, token):
        value = token.value[1:-1]
        if token.type == 'STRING':
            try:
                # If already defined, use the user-defined token name
                token_name = self.str_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and ('__' + value.upper()) not in self.token_set:
                        token_name = value.upper()  # This can create name duplications for unidentical tokens
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name

        elif token.type == 'REGEXP':
            if value in self.re_reverse:  # Kind of a weird placement
                token_name = self.re_reverse[value]
            else:
                token_name = 'ANONRE_%d' % self.i
                self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)

            if token.type == 'STRING':
                pattern = PatternStr(value)
                assert value not in self.str_reverse
                self.str_reverse[value] = token_name
            else:
                pattern = PatternRE(value)
                assert value not in self.re_reverse
                self.re_reverse[value] = token_name

            self.tokens.append(TokenDef(token_name, pattern))

        return Token('TOKEN', token_name, -1)

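
# Compiles a token-definition tree into a single pattern: a lone literal becomes
# PatternStr, everything else is combined into one PatternRE via concatenation,
# alternation, ranges and repetition operators.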
class TokenTreeToPattern(Transformer):
    def tokenvalue(self, tv):
        tv ,= tv
        value = tv.value[1:-1]

        if r'\u' in value:
            # XXX for now, you can't mix unicode escaping and unicode characters at the same token
            value = unicode_escape(value)[0]

        if tv.type == 'REGEXP':
            return PatternRE(value)
        elif tv.type == 'STRING':
            return PatternStr(value)

        assert False

    def expansion(self, items):
        if len(items) == 1:
            return items[0]
        return PatternRE(''.join(i.to_regexp() for i in items))

    def expansions(self, exps):
        if len(exps) == 1:
            return exps[0]
        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))

    def range(self, items):
        assert all(i.type == 'STRING' for i in items)
        items = [i[1:-1] for i in items]
        start, end = items
        assert len(start) == len(end) == 1, (start, end)
        return PatternRE('[%s-%s]' % (start, end))

    def expr(self, args):
        inner, op = args
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op))

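
# Container for the raw definitions of a grammar. compile() turns token trees into
# patterns, extracts anonymous tokens, expands EBNF rules to BNF and simplifies
# them, returning (tokens, rules, extra) for the parser frontends.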
class Grammar:
    def __init__(self, rule_defs, token_defs, extra):
        self.token_defs = token_defs
        self.rule_defs = rule_defs
        self.extra = extra

    def compile(self, lexer=False):
        assert lexer

        tokendefs = list(self.token_defs)

        # =================
        #  Compile Tokens
        # =================
        token_tree_to_pattern = TokenTreeToPattern()

        # Convert tokens to strings/regexps
        tokens = []
        for name, token_tree in tokendefs:
            pattern = token_tree_to_pattern.transform(token_tree)
            tokens.append(TokenDef(name, pattern))

        # Resolve regexp assignments of the form /..${X}../
        # XXX This is deprecated, since you can express most regexps with EBNF
        # XXX Also, since this happens after import, it can be a source of bugs
        token_dict = {td.name: td.pattern.to_regexp() for td in tokens}
        while True:
            changed = False
            for t in tokens:
                if isinstance(t.pattern, PatternRE):
                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.pattern.value)
                    if sp:
                        value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                        for x in sp)
                        if value != t.pattern.value:
                            t.pattern.value = value
                            changed = True
            if not changed:
                break

        # =================
        #  Compile Rules
        # =================
        extract_anon = ExtractAnonTokens(tokens)
        ebnf_to_bnf = EBNF_to_BNF()
        simplify_rule = SimplifyRule_Visitor()
        rule_tree_to_text = RuleTreeToText()
        rules = {}

        for name, rule_tree in self.rule_defs:
            assert name not in rules
            tree = extract_anon.transform(rule_tree)  # Adds to tokens
            rules[name] = ebnf_to_bnf.transform(tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            simplify_rule.visit(r)

        rules = {origin: rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        return tokens, rules, self.extra

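
# Wraps a rule definition, stripping the '!' prefix (keep all tokens in the tree)
# and the '?' prefix (expand the rule when it has a single child) from the name and
# recording them as the keep_all_tokens / expand1 flags.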
class GrammarRule:
    def __init__(self, name, expansions):
        self.keep_all_tokens = name.startswith('!')
        name = name.lstrip('!')
        self.expand1 = name.startswith('?')
        name = name.lstrip('?')

        self.name = name
        self.expansions = expansions

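
# Loads a grammar file (e.g. 'common.g') from the IMPORT_PATHS directories,
# caching the parsed Grammar in _imported_grammars so repeated %import statements
# don't re-read and re-parse the same file.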
_imported_grammars = {}

def import_grammar(grammar_path):
    if grammar_path not in _imported_grammars:
        for import_path in IMPORT_PATHS:
            with open(os.path.join(import_path, grammar_path)) as f:
                text = f.read()
            grammar = load_grammar(text)
            _imported_grammars[grammar_path] = grammar

    return _imported_grammars[grammar_path]

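
# Repeatedly inlines TOKEN references that appear inside other token definitions
# until no more substitutions happen. Referencing a rule from inside a token is a
# GrammarError.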
def resolve_token_references(token_defs):
    token_dict = dict(token_defs)
    assert len(token_dict) == len(token_defs), "Same name defined twice?"

    while True:
        changed = False
        for name, token_tree in token_defs:
            for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
                for i, item in enumerate(exp.children):
                    if isinstance(item, Token):
                        if item.type == 'RULE':
                            raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
                        if item.type == 'TOKEN':
                            exp.children[i] = token_dict[item]
                            changed = True
        if not changed:
            break

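
# Parses grammar definitions. __init__ bootstraps a LALR parser for the meta-grammar
# from the TOKENS/RULES tables above; load_grammar() then turns a grammar string
# into a Grammar object.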
class GrammarLoader:
    def __init__(self):
        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
        parser_conf = ParserConf(rules, callback, 'start')
        self.parser = LALR(lexer_conf, parser_conf)

        self.simplify_tree = SimplifyTree()
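
    # Pipeline: parse the grammar text with the meta-parser, split the tree into
    # token / rule / statement definitions, execute %ignore and %import statements,
    # check for reserved and duplicate names, resolve token references, and return
    # a Grammar object.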
    def load_grammar(self, grammar_text):
        try:
            tree = self.simplify_tree.transform(self.parser.parse(grammar_text + '\n'))
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
        except UnexpectedToken as e:
            if '_COLON' in e.expected:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'tokenvalue' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            elif e.expected == ['_OR']:
                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
            raise

        # Extract grammar items
        token_defs = [c.children for c in tree.children if c.data == 'token']
        rule_defs = [c.children for c in tree.children if c.data == 'rule']
        statements = [c.children for c in tree.children if c.data == 'statement']
        assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)

        token_defs = [(name.value, t) for name, t in token_defs]

        # Execute statements
        ignore = []
        for (stmt,) in statements:
            if stmt.data == 'ignore':
                expansions ,= stmt.children
                ignore.append(expansions)
            elif stmt.data == 'import':
                dotted_path = stmt.children[0].children
                name = stmt.children[1] if len(stmt.children) > 1 else dotted_path[-1]
                grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
                g = import_grammar(grammar_path)
                token_tree = dict(g.token_defs)[dotted_path[-1]]
                token_defs.append([name.value, token_tree])
            else:
                assert False, stmt

        # Verify correctness 1
        for name, _ in token_defs:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

        # Handle ignore tokens
        ignore_names = []
        for i, t in enumerate(ignore):
            if t.data == 'expansions' and len(t.children) == 1:
                x ,= t.children
                if x.data == 'expansion' and len(x.children) == 1:
                    item ,= x.children
                    if isinstance(item, Token) and item.type == 'TOKEN':
                        # XXX is this really a wise solution? -- Erez
                        ignore_names.append(item.value)
                        continue

            name = '__IGNORE_%d' % i
            token_defs.append((name, t))
            ignore_names.append(name)

        # Resolve token references
        resolve_token_references(token_defs)

        # Verify correctness 2
        token_names = set()
        for name, _ in token_defs:
            if name in token_names:
                raise GrammarError("Token '%s' defined more than once" % name)
            token_names.add(name)

        rules = [GrammarRule(name, x) for name, x in rule_defs]

        rule_names = set()
        for r in rules:
            if r.name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % r.name)
            if r.name in rule_names:
                raise GrammarError("Rule '%s' defined more than once" % r.name)
            rule_names.add(r.name)

        for r in rules:
            used_symbols = {t for x in r.expansions.find_data('expansion')
                            for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
            for sym in used_symbols:
                if is_terminal(sym):
                    if sym not in token_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, r.name))
                else:
                    if sym not in rule_names:
                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, r.name))

        # TODO don't include unused tokens, they can only cause trouble!

        return Grammar(rule_defs, token_defs, {'ignore': ignore_names})

load_grammar = GrammarLoader().load_grammar
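
# Usage sketch (illustrative only; the grammar text below is a made-up example,
# and this module is assumed to be importable as part of the lark package):
#
#   grammar = load_grammar('''
#       start: "a" NUMBER
#       NUMBER: /[0-9]+/
#       %ignore " "
#   ''')
#   tokens, rules, extra = grammar.compile(lexer=True)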