

import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
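
# For illustration: these names let an anonymous "+" literal inside a grammar be given
# the readable token name '__PLUS' instead of a generic numbered one (see
# ExtractAnonTokens below).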

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->'
}
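
# For illustration: 'RULE' matches lowercase names with optional '!', '_' or '?'
# prefixes (e.g. '?expr', '!stmt'), 'TOKEN' matches uppercase names (e.g. '_NL',
# 'NUMBER'), and 'STRING' / 'REGEXP' match quoted literals and /.../ patterns.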

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],

    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
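
# For illustration, a small grammar in the format described by RULES above
# (lowercase names are rules, uppercase names are tokens, '->' assigns an alias,
# '//' starts a comment):
#
#     NUMBER: /[0-9]+/
#     ADD_OP: "+"
#     sum: sum ADD_OP product -> add
#        | product
#     product: NUMBER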

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
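
# Roughly: EBNF_to_BNF turns 'c?' into an 'expansions' node holding [c, <empty expansion>],
# so 'a : b c? d' ends up equivalent to 'a : b c d | b d' once SimplifyRule_Visitor
# (below) unpacks the nested node.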

class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten
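
# For illustration: SimplifyRule_Visitor turns expansion(b, expansions(c, d), e) into
# expansions(expansion(b, c, e), expansion(b, d, e)) in place, and an alias attached to
# such a group, e.g. 'a : (b | c) -> x', is distributed over every resulting option.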

def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v


class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value
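
# For illustration: after RuleTreeToText, each rule body is a plain list of
# (symbol_names, alias) pairs, e.g. [(['b', 'c'], None), (['d'], 'add')].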

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]


def get_tokens(tree, token_set):
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]
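
# For illustration: a definition like 'NUMBER: /[0-9]+/' makes get_tokens yield
# ('NUMBER', <REGEXP token>, []); a definition with token mods ('TOKEN.mod: ...')
# yields the mods subtree as the third item instead of the empty list.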

class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and ('__' + value.upper()) not in self.token_set:
                        token_name = value.upper()  # This can create name duplications for non-identical tokens
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name

        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            value = token.value
            self.i += 1
        else:
            assert False, token

        if value in self.token_reverse:  # Kind of a weird placement
            token_name = self.token_reverse[value]

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))
            assert value not in self.token_reverse, value
            self.token_reverse[value] = token_name

        return Token('TOKEN', token_name, -1)
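
# For illustration: an anonymous "+" inside a rule becomes the token '__PLUS', "if"
# becomes '__IF', and string literals without an obvious name fall back to numbered
# '__ANONSTR_<n>' names (anonymous regexps get 'ANONRE_<n>').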

class GrammarLoader:
    def __init__(self):
        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
        parser_conf = ParserConf(rules, callback, 'start')
        self.parser = LALR(lexer_conf, parser_conf)

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        try:
            tree = self.simplify_tree.transform(self.parser.parse(grammar_text + '\n'))
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
        except UnexpectedToken as e:
            if '_COLON' in e.expected:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'tokenvalue' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            elif e.expected == ['_OR']:
                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
            raise

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)  # Adds to tokens

        token_ref = {}
        tokendefs = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters in the same token
                value = unicode_escape(value)[0]

            if token.type == 'REGEXP':
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                token_ref[name] = value
                tokendef = TokenDef__Regexp(name, value)
            else:
                assert token.type == 'STRING'
                tokendef = TokenDef__Str(name, value)

            tokendefs.append((tokendef, flags))

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value

            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)

            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('!').lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokendefs, rules


load_grammar = GrammarLoader().load_grammar
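
# Typical usage (sketch):
#
#     tokendefs, rules = load_grammar('''
#         NUMBER: /[0-9]+/
#         sum: sum "+" NUMBER -> add
#            | NUMBER
#     ''')
#
# where 'tokendefs' is a list of (TokenDef, flags) pairs and 'rules' maps each rule
# name to its list of (expansion, alias) pairs.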