This repo contains code to mirror other repos, as well as the code being mirrored.

import re

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser

from tree import Tree as T, Transformer, InlineTransformer, Visitor

_TOKEN_NAMES = {
    ':' : 'COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '|' : 'VBAR',
    '!' : 'BANG',
    '?' : 'QMARK',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '&' : 'AMPERSAND',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
}
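
# _TOKEN_NAMES gives anonymous string tokens a readable name: a grammar that
# uses "(" inline gets a generated token called __LPAR instead of a numbered
# ANONSTR_n token (see SaveDefinitions.anontoken below).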

# Grammar Parser
TOKENS = {
    'LPAR': r'\(',
    'RPAR': r'\)',
    'LBRA': r'\[',
    'RBRA': r'\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': r'\|',
    'DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'#[^\n]*\n',
    'TO': '->',
}
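
# A sketch of what these definitions do: with WS and COMMENT ignored (see
# GrammarLoader.__init__ below), a grammar line such as
#   add: mul | add PLUS mul
# should lex roughly to
#   RULE(add) COLON RULE(mul) OR RULE(add) TOKEN(PLUS) RULE(mul) NL
# (the exact matching order depends on how the Lexer prioritizes patterns).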

RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),

    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),

    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),

    ('_expansion', ['expr']),
    ('_expansion', ['_expansion', 'expr']),

    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),

    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),

    ('anontoken', ['tokenvalue']),

    ('maybe', ['LBRA', 'expansions', 'RBRA']),

    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),

    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),

    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]
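
# RULES is the meta-grammar itself, written as plain BNF pairs of
# (rule name, expansion). For example, the grammar line `NUMBER: /[0-9]+/`
# derives as
#   token -> TOKEN COLON tokenvalue NL,  with  tokenvalue -> REGEXP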

class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.tokens = {}
        self.i = 0

    def atom__3(self, _1, value, _2):
        return value
    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr
    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion
    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])
    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        # [x] is shorthand for x?
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)
        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.tokens:
            raise ValueError("Token '%s' defined more than once" % name)
        if len(x) == 4:     # TOKEN COLON tokenvalue NL
            self.tokens[name] = x[2][1], []
        else:               # TOKEN tokenmods COLON tokenvalue NL
            self.tokens[name] = x[3][1], x[1].children

    def tokenvalue(self, tokenvalue):
        value = tokenvalue.value[1:-1]  # strip the surrounding quotes or slashes
        if tokenvalue.type == 'STRING':
            value = re.escape(value)
        return tokenvalue, value

    def anontoken(self, tokenvalue):
        token, value = tokenvalue
        if token.type == 'STRING':
            try:
                token_name = _TOKEN_NAMES[token.value[1:-1]]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.tokens:
            self.tokens[token_name] = value, []

        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])
    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass
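
# After a successful parse, a SaveDefinitions instance holds the user's
# grammar. For example, loading `NUMBER: /[0-9]+/` leaves
#   sd.tokens == {'NUMBER': ('[0-9]+', [])}    # (pattern, token mods)
# while sd.rules maps each rule name to its (still EBNF) expansions tree.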

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        return t

    def expr(self, rule, op):
        if op.value == '?':
            # a : b c? d
            #   -->
            # a : b c d | b d
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
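
# Worked example: for `a: b c* d`, expr() registers a new rule
#   __anon_star_0: __anon_star_0 c | c
# in new_rules and replaces `c*` in place with expansions(__anon_star_0, <empty>),
# i.e. effectively `a: b __anon_star_0? d`. SimplifyRule_Visitor (below) then
# unpacks the inline alternatives into `a: b __anon_star_0 d | b d`.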

class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten
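
# _flatten example: expansions(expansions(a, b), c) becomes expansions(a, b, c),
# repeating until no child carries the same data label as its parent. This
# assumes Tree.expand_kids_by_index splices the children at the given indices
# into the parent, in place.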

def dict_update_safe(d1, d2):
    # Like d1.update(d2), but refuses to overwrite an existing key.
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v

def generate_aliases():
    # Pair each meta-grammar rule with its SaveDefinitions callback: prefer a
    # method named after the rule and its arity (e.g. expansion__3), falling
    # back to the plain rule name (e.g. token).
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__


def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f
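
# inline_args bridges calling conventions: the parser presumably invokes each
# callback with one list of matched children, while the SaveDefinitions
# methods above take the children as separate positional arguments. Note that
# f arrives as a bound method (see _generate_parser_callbacks below), so the
# generated Callback's own `self` is simply ignored.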

class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)

        p = Parser(self.ga, c)
        p.parse(list(self.lexer.lex(grammar_text + "\n")))

        # Convert the collected EBNF trees to plain BNF, then simplify them
        ebnf_to_bnf = EBNF_to_BNF()
        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        return sd.tokens, rules


load_grammar = GrammarLoader().load_grammar
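
# Minimal usage sketch (the exact tree shapes depend on the Tree class):
#   tokens, rules = load_grammar('start: NUMBER\nNUMBER: /[0-9]+/\n')
#   # tokens -> {'NUMBER': ('[0-9]+', [])}
#   # rules  -> {'start': <expansions tree>}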

def test():
    g = r"""
    start: add

    # Rules
    add: mul
       | add _add_sym mul

    mul: _atom
       | mul _add_mul _atom

    neg: "-" _atom

    _atom: neg
         | number
         | "(" add ")"

    # Tokens
    number: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"
    WS.ignore: /\s+/
    """

    g2 = """
    start: a
    a: "a" (b*|(c d)+) "b"?
    b: "b"
    c: "c"
    d: "+" | "-"
    """

    load_grammar(g)
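    # g2 (which exercises the EBNF operators and grouping) is otherwise
    # unused; presumably it is meant to go through the loader as well.
    load_grammar(g2)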