This repo contains code to mirror other repos, as well as the code being mirrored.

import re

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser
from tree import Tree as T, Transformer, InlineTransformer, Visitor
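
# Names for anonymous string tokens: a literal such as "(" appearing in a
# grammar is registered under a readable name ('__LPAR') instead of an
# autogenerated ANONSTR_n counter. See SaveDefinitions.anontoken below.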
_TOKEN_NAMES = {
    ':' : 'COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '|' : 'VBAR',
    '!' : 'BANG',
    '?' : 'QMARK',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '&' : 'AMPERSAND',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
}

# Grammar Parser
TOKENS = {
    'LPAR': r'\(',
    'RPAR': r'\)',
    'LBRA': r'\[',
    'RBRA': r'\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': r'\|',
    'DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'#[^\n]*\n',
    'TO': '->',
}
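
# The meta-grammar: hand-written BNF productions for the grammar-description
# language itself, as (rule name, expansion) pairs. GrammarAnalyzer consumes
# these directly, which is how the library bootstraps its own grammar parser.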
RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),
    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),
    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),
    ('_expansion', ['expr']),
    ('_expansion', ['_expansion', 'expr']),
    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),
    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),
    ('anontoken', ['tokenvalue']),
    ('maybe', ['LBRA', 'expansions', 'RBRA']),
    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),
    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]
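
# Parse-time callbacks for the meta-grammar. Method names are matched by
# generate_aliases() below: 'rule__N' handles the N-symbol expansion of
# 'rule', and a plain 'rule' method is the catch-all for that rule name.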
class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.tokens = {}
        self.i = 0

    def atom__3(self, _1, value, _2):
        return value

    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr

    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion

    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])

    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)
        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.tokens:
            raise ValueError("Token '%s' defined more than once" % name)
        if len(x) == 4:
            self.tokens[name] = x[2][1], []
        else:
            self.tokens[name] = x[3][1], x[1].children

    def tokenvalue(self, tokenvalue):
        value = tokenvalue.value[1:-1]
        if tokenvalue.type == 'STRING':
            value = re.escape(value)
        return tokenvalue, value

    def anontoken(self, (token, value)):
        if token.type == 'STRING':
            try:
                token_name = _TOKEN_NAMES[token.value[1:-1]]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token  # unexpected token type
        if token_name not in self.tokens:
            self.tokens[token_name] = value, []
        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])

    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass
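
# EBNF_to_BNF lowers the EBNF operators into plain BNF: '?' becomes an
# alternation with an empty expansion, while '+' and '*' expand into the
# left-recursive helper rules built by _add_recurse_rule (collected in
# self.new_rules and merged back into the rule set by load_grammar).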
class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
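
# SimplifyRule_Visitor normalizes each rule tree after EBNF lowering: nested
# nodes of the same type are flattened, parenthesized alternations inside an
# expansion are distributed (see the comment in expansion() below), and an
# alias attached to an alternation is pushed down onto each alternative.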
class SimplifyRule_Visitor(Visitor):
    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #  -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )
        while True:
            self._flatten(tree)
            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.iteritems():
        assert k not in d1
        d1[k] = v

def generate_aliases():
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__

def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f
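
# GrammarLoader ties the pieces together: the meta-grammar above is analyzed
# once, and load_grammar() then lexes and parses the user's grammar text,
# lowers EBNF to BNF, simplifies every rule, and returns (tokens, rules).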
class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)

        p = Parser(self.ga, c)
        p.parse(list(self.lexer.lex(grammar_text + "\n")))

        ebnf_to_bnf = EBNF_to_BNF()
        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        return sd.tokens, rules


load_grammar = GrammarLoader().load_grammar

def test():
    g = """
start: add
# Rules
add: mul
   | add _add_sym mul
mul: _atom
   | mul _add_mul _atom
neg: "-" _atom
_atom: neg
     | number
     | "(" add ")"
# Tokens
number: /[\d.]+/
_add_sym: "+" | "-"
_add_mul: "*" | "/"
WS.ignore: /\s+/
"""
    g2 = """
start: a
a: "a" (b*|(c d)+) "b"?
b: "b"
c: "c"
d: "+" | "-"
"""
    load_grammar(g)
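
# Minimal usage sketch: given a grammar string like 'g' in test() above,
#
#     tokens, rules = load_grammar(grammar_text)
#
# 'tokens' maps each token name to a (pattern, modifier-list) pair and
# 'rules' maps each rule name to its simplified 'expansions' tree.
if __name__ == '__main__':
    test()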