This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->',
}

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],
    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],
    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],
    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
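
# Illustrative note (an addition, not part of the original tables): TOKENS and
# RULES above describe a small grammar-definition language -- lowercase names
# define rules, uppercase names define tokens, token values are "strings" or
# /regexps/, [...] marks an optional group, +*? are repetition operators, and
# '->' attaches an alias to an alternative. A grammar text in that format might
# look roughly like:
#
#     start : list
#     list : item | list COMMA item -> append_item
#     item : NUMBER
#     NUMBER : /[0-9]+/
#     COMMA : ","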

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op

class SimplifyRule_Visitor(Visitor):
    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )
        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

def get_tokens(tree, token_set):
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]

class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and ('__' + value.upper()) not in self.token_set:
                        token_name = value.upper()  # Note: this can create duplicate names for non-identical tokens
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name

        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            value = token.value
            self.i += 1
        else:
            assert False, token

        if value in self.token_reverse:  # Kind of a weird placement
            token_name = self.token_reverse[value]

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))
            assert value not in self.token_reverse, value
            self.token_reverse[value] = token_name

        return Token('TOKEN', token_name, -1)

class GrammarLoader:
    def __init__(self):
        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
        parser_conf = ParserConf(rules, callback, 'start')
        self.parser = LALR(lexer_conf, parser_conf)

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        try:
            tree = self.simplify_tree.transform(self.parser.parse(grammar_text + '\n'))
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
        except UnexpectedToken as e:
            if '_COLON' in e.expected:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'tokenvalue' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            raise

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)  # Adds to tokens

        token_ref = {}
        tokendefs = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters in the same token
                value = unicode_escape(value)[0]

            if token.type == 'REGEXP':
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                token_ref[name] = value
                tokendef = TokenDef__Regexp(name, value)
            else:
                assert token.type == 'STRING'
                tokendef = TokenDef__Str(name, value)

            tokendefs.append((tokendef, flags))

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)
            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokendefs, rules


load_grammar = GrammarLoader().load_grammar
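
# A hedged usage sketch (added for illustration; the grammar text and the shapes
# of the results are assumptions read off the code above, not part of the
# original module): callers import load_grammar and receive the token
# definitions plus the BNF rule expansions, roughly like this:
#
#     from .load_grammar import load_grammar
#
#     tokendefs, rules = load_grammar(
#         'start : list\n'
#         'list : item | list COMMA item\n'
#         'item : NUMBER\n'
#         'NUMBER : /[0-9]+/\n'
#         'COMMA : ","\n'
#     )
#     # tokendefs should be a list of (TokenDef__Regexp | TokenDef__Str, flags)
#     # pairs, one per defined token (NUMBER, COMMA here); rules should map each
#     # rule name to a list of (expansion, alias) pairs, e.g.
#     # rules['item'] == [(['NUMBER'], None)].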