import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
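
# _TOKEN_NAMES maps literal punctuation strings to readable token names; it is used by
# ExtractAnonTokens below to name anonymous string tokens, e.g. "(" -> '__LPAR'.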

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->'
}

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],

    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
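
# Illustrative sketch (added for clarity, not from the original source): a small grammar
# written in the meta-syntax that TOKENS/RULES above describe. All rule and token names
# below are made up.
#
#   start : pair
#   pair : NAME "=" NUMBER
#   NAME : /[a-z]+/
#   NUMBER : /[0-9]+/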

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
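
# Sketch of the effect (added for clarity; the rule names follow the '__anon_<op>_<n>'
# pattern used above, and the grammar fragment is made up): for an item `c*`, expr()
# registers
#   __anon_star_0 : c | __anon_star_0 c
# in self.new_rules and returns an 'expansions' tree meaning "__anon_star_0 or nothing",
# so `a : b c* d` effectively becomes `a : b __anon_star_0? d`.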

class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value
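
# Sketch of the mapping (added for clarity; the symbols are made up): a simplified rule
# tree such as
#   expansions(expansion(A, b), alias(expansion(c), Token('RULE', 'foo')))
# transforms into
#   [(['A', 'b'], None), (['c'], 'foo')]
# i.e. a list of (expansion symbols, optional alias) pairs, which load_grammar() consumes below.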

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

def get_tokens(tree, token_set):
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]
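
# Sketch of the yielded tuples (added for clarity; 'FOO' and the modifier name are
# arbitrary examples):
#   FOO : "foo"          ->  ('FOO', <STRING token>, [])
#   FOO.somemod : "foo"  ->  ('FOO', <STRING token>, <tokenmods result>)
# i.e. (token name, value token, modifier flags), consumed by load_grammar() below.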

class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and ('__' + value.upper()) not in self.token_set:
                        token_name = value.upper()  # This can create name duplications for unidentical tokens
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name

        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            value = token.value
            self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))
            assert value not in self.token_reverse
            self.token_reverse[value] = token_name

        return Token('TOKEN', token_name, -1)
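
# Sketch of the naming behaviour (added for clarity; the literals are arbitrary examples):
#   "("       -> '__LPAR'      (via _TOKEN_NAMES)
#   "while"   -> '__WHILE'     (alphanumeric string, upper-cased)
#   "+="      -> '__ANONSTR_0' (numbered fallback)
#   /[0-9]+/  -> 'ANONRE_0'    (anonymous regexps are always numbered)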

class GrammarLoader:
    def __init__(self):
        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
        parser_conf = ParserConf(rules, callback, 'start')
        self.parser = LALR(lexer_conf, parser_conf)

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        try:
            tree = self.simplify_tree.transform(self.parser.parse(grammar_text + '\n'))
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
        except UnexpectedToken as e:
            if '_COLON' in e.expected:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'tokenvalue' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            raise

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)     # Adds to tokens

        token_ref = {}
        tokendefs = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                value = unicode_escape(value)[0]

            if token.type == 'REGEXP':
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                token_ref[name] = value
                tokendef = TokenDef__Regexp(name, value)
            else:
                assert token.type == 'STRING'
                tokendef = TokenDef__Str(name, value)

            tokendefs.append((tokendef, flags))

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value

            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)

            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokendefs, rules

load_grammar = GrammarLoader().load_grammar
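
# Illustrative usage sketch (added for clarity, not part of the original module). The grammar
# below is made up, and running this requires the module to be importable as part of its
# package (e.g. `python -m <package>.load_grammar`); treat it as a demonstration only.
if __name__ == '__main__':
    _demo_grammar = (
        'start : add\n'
        'add : NUMBER "+" NUMBER\n'
        'NUMBER : /[0-9]+/\n'
    )
    _tokendefs, _rules = load_grammar(_demo_grammar)
    print(_tokendefs)
    print(_rules)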