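# Loads a grammar definition written in the EBNF-like syntax handled below and
# turns it into a (tokens, rules) pair: a token list for the lexer and plain
# BNF rule trees for the parser.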
import re
import codecs

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser

from tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')
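
# Readable names for punctuation characters. anontoken() uses them to name
# tokens that are defined inline as string literals (e.g. "(" becomes __LPAR).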
_TOKEN_NAMES = {
    ':' : 'COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '|' : 'VBAR',
    '!' : 'BANG',
    '?' : 'QMARK',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '&' : 'AMPERSAND',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
}

# Grammar Parser
TOKENS = {
    'LPAR': '\(',
    'RPAR': '\)',
    'LBRA': '\[',
    'RBRA': '\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': '\|',
    'DOT': '\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    'TO': '->'
}
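
# The meta-grammar itself, as (rule_name, expansion) pairs. Each rule is bound
# to a SaveDefinitions callback by name and arity in generate_aliases() below.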
RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),

    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),

    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),

    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),

    ('_expansion', []),
    ('_expansion', ['_expansion', 'expr']),

    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),

    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),

    ('anontoken', ['tokenvalue']),

    ('maybe', ['LBRA', 'expansions', 'RBRA']),

    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),

    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),

    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]
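
# Parse-time callbacks for the meta-grammar: collects the rule and token
# definitions found in the grammar text into self.rules and self.tokens.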
class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.token_set = set()
        self.tokens = []
        self.i = 0

    def atom__3(self, _1, value, _2):
        return value
    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr
    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion
    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])
    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)
        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        self.token_set.add(name)

        if len(x) == 4:
            self.tokens.append((name, x[2], []))
        else:
            self.tokens.append((name, x[3], x[1].children))

    def tokenvalue(self, tokenvalue):
        return tokenvalue
    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                token_name = _TOKEN_NAMES[value]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token    # only STRING and REGEXP tokens can be anonymous

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)
    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])
    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass
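
# Rewrites the EBNF repetition operators (?, +, *) into plain BNF by adding
# anonymous recursive helper rules (see the worked examples in expr() below).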
class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
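
# Post-processing for the BNF trees: flattens nested 'expansions'/'expansion'
# nodes and distributes parenthesized alternations, so every rule ends up as a
# flat list of alternative expansions.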
class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.iteritems():
        assert k not in d1
        d1[k] = v
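
# Pair each meta-grammar rule with the SaveDefinitions method that handles it,
# preferring the arity-specific form (e.g. expansion__3) over the generic one.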
def generate_aliases():
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__


def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f
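
# Builds the parser for the meta-grammar once; load_grammar() then turns any
# grammar text into a (tokens, rules) pair.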
class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)
        p = Parser(self.ga, c)
        p.parse(list(self.lexer.lex(grammar_text + "\n")))

        # Tokens
        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, token, flags in sd.tokens:
            value = token.value[1:-1]
            if '\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters in the same token
                value = unicode_escape(value)[0]

            if token.type == 'STRING':
                value = re.escape(value)
                str_tokens.append((name, value, flags))
            else:
                assert token.type == 'REGEXP'
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)
                re_tokens.append((name, value, flags))

            token_ref[name] = value

        str_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        re_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        tokens = str_tokens + re_tokens   # Order is important!

        # Rules
        ebnf_to_bnf = EBNF_to_BNF()
        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        return tokens, rules


load_grammar = GrammarLoader().load_grammar
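
# A rough usage sketch (the names below are illustrative; only load_grammar()
# itself is defined in this module):
#
#   tokens, rules = load_grammar(my_grammar_text)
#   # `tokens` is a list of (name, regexp, flags) tuples, longest patterns
#   # first; `rules` maps rule names to flattened 'expansions' trees.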

def test():
    g = """
    start: add
    # Rules
    add: mul
       | add _add_sym mul
    mul: _atom
       | mul _add_mul _atom
    neg: "-" _atom
    _atom: neg
         | number
         | "(" add ")"
    # Tokens
    number: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"
    WS.ignore: /\s+/
    """

    g2 = """
    start: a
    a: "a" (b*|(c d)+) "b"?
    b: "b"
    c: "c"
    d: "+" | "-"
    """

    load_grammar(g)