import re
import codecs

from .lexer import Lexer, Token
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    ':' : '_COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '|' : 'VBAR',
    '!' : 'BANG',
    '?' : 'QMARK',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '&' : 'AMPERSAND',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : '_DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : '_LPAR',
    ')' : '_RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
}

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?]',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->'
}

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['expansion',
                   'expansions _OR expansion',
                   'expansions _NL _OR expansion'],

    'expansion': ['_expansion',
                  '_expansion _TO RULE'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],

    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
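
# Illustrative note on the tables above (not additional grammar): a definition
# line such as
#     NUMBER: /[\d.]+/
# from the test grammar at the bottom of this file is matched by the 'token'
# rule (TOKEN _COLON tokenvalue _NL), while a line like
#     add: mul | add _add_sym mul
# is matched by 'rule' (RULE _COLON expansions _NL), with the alternatives
# handled by 'expansions'.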

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
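
# Worked example for the transformation above (names follow the '__%s_%s_%d'
# pattern in _add_recurse_rule): `c+` is replaced by a reference to a new rule
# roughly equivalent to
#     __anon_plus_0 : c | __anon_plus_0 c
# and `c*` becomes an optional reference to the analogous __anon_star_<n> rule
# (the helper alternated with an empty expansion). Repeated uses of the same
# sub-expression are cached in rules_by_expr, so only one helper is generated.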

class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i==j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten
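
# Note on the class above: 'expansions' nodes are handled by _flatten directly
# (expansions = _flatten), so expansions(expansions(a, b), c) collapses into
# expansions(a, b, c); alias() distributes an alias over alternatives, turning
# alias(expansions(x, y), NAME) into expansions(alias(x, NAME), alias(y, NAME)).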

def dict_update_safe(d1, d2):
    for k, v in d2.iteritems():
        assert k not in d1
        d1[k] = v


class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return [sym.value for sym in symbols], None
    def alias(self, ((expansion, _alias), alias)):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]


def get_tokens(tree, token_set):
    tokens = []
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]
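
# get_tokens yields (name, value_token, flags) triples. flags is [] for a plain
# definition; for a modified one such as `WS.ignore.newline: /\s+/` in the test
# grammar below, it holds the dotted modifiers collected by SimplifyTree.tokenmods
# (roughly ['ignore', 'newline']).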

class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)
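
# Example of the naming scheme above: an inline "," becomes __COMMA (via
# _TOKEN_NAMES), an inline keyword string like "while" becomes __WHILE, other
# anonymous strings fall back to __ANONSTR_<n>, and anonymous regexps become
# ANONRE_<n>. A string that matches an explicitly defined token reuses that
# token's user-given name.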

class GrammarLoader:
    def __init__(self):
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        token_stream = list(self.lexer.lex(grammar_text+"\n"))
        tree = self.simplify_tree.transform( self.parser.parse(token_stream) )

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)    # Adds to tokens

        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if '\\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                value = unicode_escape(value)[0]

            if token.type == 'STRING':
                value = re.escape(value)
                str_tokens.append((name, value, flags))
            else:
                assert token.type == 'REGEXP'
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                re_tokens.append((name, value, flags))
                token_ref[name] = value

        str_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        re_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        tokens = str_tokens + re_tokens  # Order is important!
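        # The ordering above is what the comment refers to: string tokens come
        # before regexp tokens, and each group is sorted longest-first,
        # presumably so the lexer prefers literal and longer matches when
        # several patterns could match the same input.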

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)

            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules

load_grammar = GrammarLoader().load_grammar
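
# Usage sketch (return shapes inferred from the code above): calling
#
#     tokens, rules = load_grammar('start: "a"\n')
#
# should give `tokens` as a list of (name, regexp_pattern, flags) tuples,
# string-derived tokens first, and `rules` as a dict mapping each rule name to
# a list of (expansion_symbols, alias) pairs as produced by RuleTreeToText.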

def test():
    g = """
    start: add

    // Rules
    add: mul
       | add _add_sym mul

    mul: [mul _add_mul] _atom

    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"

    WS.ignore.newline: /\s+/
    """

    g2 = """
    start: a
    a: "a" (b*|(c d)+) "b"?
    b: "b"
    c: "c"
    d: "+" | "-"
    """

    # print load_grammar(g)
    print GrammarLoader().load_grammar(g2)


if __name__ == '__main__':
    test()