import os.path
from itertools import chain
import re
from ast import literal_eval
from copy import deepcopy

from .lexer import Token, UnexpectedInput
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
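
# _TOKEN_NAMES maps punctuation to readable token names; ExtractAnonTokens (below)
# uses it when promoting inline string literals to anonymous token definitions,
# prefixing the chosen name with '__'.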

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*][?]?|[?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'"(\\"|\\\\|[^"])*?"',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/',
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*',
    '_TO': '->',
    '_IGNORE': r'%ignore',
    '_IMPORT': r'%import',
}

RULES = {
    'start': ['_list'],
    '_list': ['_item', '_list _item'],
    '_item': ['rule', 'token', 'statement', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],
    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'name',
              'literal',
              'range'],

    '?name': ['RULE', 'TOKEN'],

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOT _DOT STRING'],

    'token': ['TOKEN _COLON expansions _NL'],
    'statement': ['ignore', 'import'],
    'ignore': ['_IGNORE expansions _NL'],
    'import': ['_IMPORT import_args _NL',
               '_IMPORT import_args _TO TOKEN'],
    'import_args': ['_import_args'],
    '_import_args': ['name', '_import_args _DOT name'],
    'literal': ['REGEXP', 'STRING'],
}
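
# EBNF_to_BNF rewrites the EBNF operators '?', '+' and '*' into plain BNF by
# generating anonymous recursive helper rules (named like '__anon_plus_0').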

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0
        self.rule_options = None

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
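
# SimplifyRule_Visitor cleans rule trees in place: it flattens nested nodes of the
# same type and distributes inline alternatives, as documented in expansion() below.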

class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )
        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]
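
# ExtractAnonTokens promotes anonymous string/regexp patterns found inside rules to
# named token definitions, reusing an existing token when one already matches the
# same pattern.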

class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens):
        self.tokens = tokens
        self.token_set = {td.name for td in self.tokens}
        self.str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
        self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
        self.i = 0

    def pattern(self, p):
        value = p.value
        if isinstance(p, PatternStr):
            try:
                # If already defined, use the user-defined token name
                token_name = self.str_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set:
                        token_name = '%s%d' % (value.upper(), self.i)
                        try:
                            # Make sure we don't have unicode in our token names
                            token_name.encode('ascii')
                        except UnicodeEncodeError:
                            token_name = 'ANONSTR_%d' % self.i
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
                token_name = '__' + token_name

        elif isinstance(p, PatternRE):
            if value in self.re_reverse:  # Kind of a weird placement
                token_name = self.re_reverse[value]
            else:
                token_name = 'ANONRE_%d' % self.i
                self.i += 1
        else:
            assert False, p

        if token_name not in self.token_set:
            self.token_set.add(token_name)

            if isinstance(p, PatternStr):
                assert value not in self.str_reverse
                self.str_reverse[value] = token_name
            else:
                assert value not in self.re_reverse
                self.re_reverse[value] = token_name

            self.tokens.append(TokenDef(token_name, p))

        return Token('TOKEN', token_name, -1)
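
# The following helpers turn grammar literals into Pattern objects and collapse a
# token's parse tree into a single pattern (PrepareLiterals, SplitLiterals,
# TokenTreeToPattern).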

def _literal_to_pattern(literal):
    v = literal.value
    assert v[0] == v[-1] and v[0] in '"/'
    x = v[1:-1].replace("'", r"\'")
    s = literal_eval("u'''%s'''" % x)
    return {'STRING': PatternStr,
            'REGEXP': PatternRE}[literal.type](s)

class PrepareLiterals(InlineTransformer):
    def literal(self, literal):
        return T('pattern', [_literal_to_pattern(literal)])

    def range(self, start, end):
        assert start.type == end.type == 'STRING'
        start = start.value[1:-1]
        end = end.value[1:-1]
        assert len(start) == len(end) == 1
        regexp = '[%s-%s]' % (start, end)
        return T('pattern', [PatternRE(regexp)])

class SplitLiterals(InlineTransformer):
    def pattern(self, p):
        if isinstance(p, PatternStr) and len(p.value) > 1:
            return T('expansion', [T('pattern', [PatternStr(ch)]) for ch in p.value])
        return T('pattern', [p])

class TokenTreeToPattern(Transformer):
    def pattern(self, ps):
        p, = ps
        return p

    def expansion(self, items):
        if len(items) == 1:
            return items[0]
        return PatternRE(''.join(i.to_regexp() for i in items))

    def expansions(self, exps):
        if len(exps) == 1:
            return exps[0]
        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))

    def expr(self, args):
        inner, op = args
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op))
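
# interleave() yields `item` after every terminal or literal in `l`; Grammar.compile
# uses it in scanless mode to weave the optional __ignore rule between the symbols
# of each expansion.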

def interleave(l, item):
    for e in l:
        yield e
        if isinstance(e, T):
            if e.data == 'literal':
                yield item
        elif is_terminal(e):
            yield item
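
# Grammar.compile() works in two modes: scanless (lexer=False), where token
# definitions are folded into the rules themselves, and standard mode, where tokens
# are compiled separately into strings/regexps for the lexer.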

class Grammar:
    def __init__(self, rule_defs, token_defs, extra):
        self.token_defs = token_defs
        self.rule_defs = rule_defs
        self.extra = extra

    def compile(self, lexer=False, start=None):
        if not lexer:
            rule_defs = deepcopy(self.rule_defs)

            # XXX VERY HACKY!! There must be a better way..
            ignore_tokens = [('_'+name, t) for name, t in self.token_defs if name in self.extra['ignore']]
            if ignore_tokens:
                self.token_defs = [('_'+name if name in self.extra['ignore'] else name, t) for name, t in self.token_defs]
                ignore_names = [t[0] for t in ignore_tokens]
                expr = Token('RULE', '__ignore')
                for r, tree, _o in rule_defs:
                    for exp in tree.find_data('expansion'):
                        exp.children = list(interleave(exp.children, expr))
                        if r == start:  # TODO use GrammarRule or similar (RuleOptions?)
                            exp.children = [expr] + exp.children

                x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
                _ignore_tree = T('expr', [T('expansions', x), Token('OP', '?')])
                rule_defs.append(('__ignore', _ignore_tree, None))
            # End of "ignore" section

            for name, tree in self.token_defs:
                rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))

            token_defs = []

            tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)}
            new_rule_defs = []
            for name, tree, options in rule_defs:
                if name in tokens_to_convert:
                    if name.startswith('_'):
                        options = RuleOptions.new_from(options, filter_out=True)
                    else:
                        options = RuleOptions.new_from(options, create_token=name)
                    name = tokens_to_convert.get(name, name)

                for exp in chain(tree.find_data('expansion'), tree.find_data('expr')):
                    for i, sym in enumerate(exp.children):
                        if sym in tokens_to_convert:
                            exp.children[i] = Token(sym.type, tokens_to_convert[sym])

                new_rule_defs.append((name, tree, options))

            rule_defs = new_rule_defs
        else:
            token_defs = list(self.token_defs)
            rule_defs = self.rule_defs

        # =================
        #  Compile Tokens
        # =================
        token_tree_to_pattern = TokenTreeToPattern()

        # Convert tokens to strings/regexps
        tokens = []
        for name, token_tree in token_defs:
            token_tree = PrepareLiterals().transform(token_tree)
            pattern = token_tree_to_pattern.transform(token_tree)
            tokens.append(TokenDef(name, pattern))

        # Resolve regexp assignments of the form /..${X}../
        # XXX This is deprecated, since you can express most regexps with EBNF
        # XXX Also, since this happens after import, it can be a source of bugs
        token_dict = {td.name: td.pattern.to_regexp() for td in tokens}
        while True:
            changed = False
            for t in tokens:
                if isinstance(t.pattern, PatternRE):
                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.pattern.value)
                    if sp:
                        value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                        for x in sp)
                        if value != t.pattern.value:
                            t.pattern.value = value
                            changed = True
            if not changed:
                break

        # =================
        #  Compile Rules
        # =================
        extract_anon = ExtractAnonTokens(tokens)
        ebnf_to_bnf = EBNF_to_BNF()
        simplify_rule = SimplifyRule_Visitor()
        rule_tree_to_text = RuleTreeToText()
        rules = {}

        for name, rule_tree, options in rule_defs:
            assert name not in rules, name
            rule_tree = PrepareLiterals().transform(rule_tree)
            if not lexer:
                rule_tree = SplitLiterals().transform(rule_tree)
            tree = extract_anon.transform(rule_tree)  # Adds to tokens
            ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
            rules[name] = ebnf_to_bnf.transform(tree), options

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for tree, _o in rules.values():
            simplify_rule.visit(tree)

        rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()}

        return tokens, rules, self.extra
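
# RuleOptions carries per-rule flags taken from the grammar ('!' -> keep_all_tokens,
# '?' -> expand1) plus internal flags used by the scanless pipeline above.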

class RuleOptions:
    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False):
        self.keep_all_tokens = keep_all_tokens
        self.expand1 = expand1
        self.create_token = create_token  # used for scanless postprocessing
        self.filter_out = filter_out      # remove this rule from the tree
                                          # used for "token"-rules in scanless

    @classmethod
    def new_from(cls, options, **kw):
        return cls(
            keep_all_tokens=options and options.keep_all_tokens,
            expand1=options and options.expand1,
            **kw)

def _extract_options_for_rule(name, expansions):
    keep_all_tokens = name.startswith('!')
    name = name.lstrip('!')
    expand1 = name.startswith('?')
    name = name.lstrip('?')

    return name, expansions, RuleOptions(keep_all_tokens, expand1)

_imported_grammars = {}

def import_grammar(grammar_path):
    if grammar_path not in _imported_grammars:
        for import_path in IMPORT_PATHS:
            with open(os.path.join(import_path, grammar_path)) as f:
                text = f.read()
            grammar = load_grammar(text, grammar_path)
            _imported_grammars[grammar_path] = grammar

    return _imported_grammars[grammar_path]
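
# resolve_token_references() inlines TOKEN references inside token definitions until
# every token tree is self-contained; rule references inside tokens are rejected.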

def resolve_token_references(token_defs):
    token_dict = dict(token_defs)
    assert len(token_dict) == len(token_defs), "Same name defined twice?"

    while True:
        changed = False

        for name, token_tree in token_defs:
            for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
                for i, item in enumerate(exp.children):
                    if isinstance(item, Token):
                        if item.type == 'RULE':
                            raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
                        if item.type == 'TOKEN':
                            exp.children[i] = token_dict[item]
                            changed = True

        if not changed:
            break
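
# GrammarLoader bootstraps a LALR parser from the hard-coded TOKENS/RULES
# meta-grammar above and uses it to turn user grammar text into token and rule
# definitions.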

class GrammarLoader:
    def __init__(self):
        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]

        rules = [_extract_options_for_rule(name, x) for name, x in RULES.items()]
        d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
        parser_conf = ParserConf(rules, callback, 'start')
        self.parser = LALR(lexer_conf, parser_conf)

        self.simplify_tree = SimplifyTree()

    def load_grammar(self, grammar_text, name='<?>'):
        try:
            tree = self.simplify_tree.transform(self.parser.parse(grammar_text+'\n'))
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
        except UnexpectedToken as e:
            if '_COLON' in e.expected:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'literal' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            elif e.expected == ['_OR']:
                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
            raise

        # Extract grammar items
        token_defs = [c.children for c in tree.children if c.data == 'token']
        rule_defs = [c.children for c in tree.children if c.data == 'rule']
        statements = [c.children for c in tree.children if c.data == 'statement']
        assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)

        token_defs = [(name.value, t) for name, t in token_defs]

        # Execute statements
        ignore = []
        for (stmt,) in statements:
            if stmt.data == 'ignore':
                expansions, = stmt.children
                ignore.append(expansions)
            elif stmt.data == 'import':
                dotted_path = stmt.children[0].children
                name = stmt.children[1] if len(stmt.children) > 1 else dotted_path[-1]
                grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
                g = import_grammar(grammar_path)
                token_tree = dict(g.token_defs)[dotted_path[-1]]
                token_defs.append([name.value, token_tree])
            else:
                assert False, stmt

        # Verify correctness 1
        for name, _ in token_defs:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

        # Handle ignore tokens
        ignore_names = []
        for i, t in enumerate(ignore):
            if t.data == 'expansions' and len(t.children) == 1:
                x, = t.children
                if x.data == 'expansion' and len(x.children) == 1:
                    item, = x.children
                    if isinstance(item, Token) and item.type == 'TOKEN':
                        # XXX is this really a wise solution? -- Erez
                        ignore_names.append(item.value)
                        continue

            name = '__IGNORE_%d' % i
            token_defs.append((name, t))
            ignore_names.append(name)

        # Resolve token references
        resolve_token_references(token_defs)

        # Verify correctness 2
        token_names = set()
        for name, _ in token_defs:
            if name in token_names:
                raise GrammarError("Token '%s' defined more than once" % name)
            token_names.add(name)

        rules = [_extract_options_for_rule(name, x) for name, x in rule_defs]

        rule_names = set()
        for name, _x, _o in rules:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
            if name in rule_names:
                raise GrammarError("Rule '%s' defined more than once" % name)
            rule_names.add(name)

        for name, expansions, _o in rules:
            used_symbols = {t for x in expansions.find_data('expansion')
                              for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
            for sym in used_symbols:
                if is_terminal(sym):
                    if sym not in token_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                else:
                    if sym not in rule_names:
                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

        # TODO don't include unused tokens, they can only cause trouble!

        return Grammar(rules, token_defs, {'ignore': ignore_names})

load_grammar = GrammarLoader().load_grammar
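
# Example usage (an illustrative sketch only, not part of the original module; it
# assumes this file lives in its usual package so the relative imports resolve,
# e.g. as lark.load_grammar):
#
#     g = load_grammar('start: "a" "b"\n', '<example>')
#     tokens, rules, extra = g.compile(lexer=True, start='start')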