import re
import codecs

from .lexer import Lexer, Token
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor
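# This module parses the grammar-definition language: it extracts token
# definitions, converts EBNF rules to BNF, and returns the (tokens, rules)
# tables that the rest of the library builds its parser from.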
unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->'
}

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],
    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],
    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
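# A worked example of the meta-grammar above (the sample line is illustrative,
# not taken from this file): a definition such as
#
#     expr: expr "+" term -> add
#
# is lexed into RULE, _COLON, RULE, STRING, RULE, _TO, RULE, _NL and parsed by
# RULES into a 'rule' node holding 'expansions' / 'alias' / 'expansion' subtrees.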

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
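# Illustrative example of the rewrite above (names assume the default 'anon'
# prefix and a fresh counter): transforming `list: item+` produces a helper rule
#
#     __anon_plus_0: item | __anon_plus_0 item
#
# and `item+` inside `list` is replaced by a reference to __anon_plus_0.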

class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )
        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten
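# Note on _flatten (illustrative): it collapses nested nodes of the same type,
# e.g. expansions(a, expansions(b, c)) becomes expansions(a, b, c), so every
# rule ends up with a single flat list of alternatives.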

def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

def get_tokens(tree, token_set):
    tokens = []
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]

class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)
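# Illustrative examples of the naming scheme above (counter values assume a
# fresh ExtractAnonTokens instance):
#   "("      -> __LPAR       (looked up in _TOKEN_NAMES)
#   "while"  -> __WHILE      (alphanumeric string, upper-cased)
#   "+="     -> __ANONSTR_0  (numbered fallback)
#   /[0-9]+/ -> ANONRE_0     (anonymous regexp)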

class GrammarLoader:
    def __init__(self):
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        token_stream = list(self.lexer.lex(grammar_text + "\n"))
        tree = self.simplify_tree.transform(self.parser.parse(token_stream))

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)  # Adds to tokens

        tokens2 = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                value = unicode_escape(value)[0]
            tokens2.append((name, token.type, value, flags))

        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, type_, value, flags in tokens2:
            if type_ == 'STRING':
                str_tokens.append((name, value, flags))
            else:
                assert type_ == 'REGEXP'
                # Expand ${TOKEN} references to the patterns of previously defined tokens
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                re_tokens.append((name, value, flags))
                token_ref[name] = value

        # If a string token is fully matched by some regexp token, attach it to that
        # regexp as an 'unless' flag instead of keeping it as a standalone string token.
        embedded_strs = set()
        for re_name, re_value, re_flags in re_tokens:
            unless = {}
            for str_name, str_value, _sf in str_tokens:
                m = re.match(re_value, str_value)
                if m and m.group(0) == str_value:
                    assert not _sf, "You just broke Lark! Please email me with your grammar"
                    embedded_strs.add(str_name)
                    unless[str_value] = str_name
            if unless:
                re_flags.append(('unless', unless))

        str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]

        str_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        re_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        tokens = str_tokens + re_tokens  # Order is important!

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value

            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)

            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules

load_grammar = GrammarLoader().load_grammar
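# Minimal usage sketch (illustrative only; the sample grammar below is not part
# of this file and is assumed to be well-formed):
#
#     tokens, rules = load_grammar('start: "a" NAME\nNAME: /[a-z]+/\n')
#
# `tokens` is a list of (name, pattern, flags) triples, with string tokens ordered
# before regexps, and `rules` maps each rule name to a list of
# (expansion_symbols, alias) pairs.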

def test():
    g = r"""
    start: add

    // Rules
    add: mul
       | add _add_sym mul

    mul: [mul _add_mul] _atom

    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"

    WS.ignore.newline: /\s+/
    """

    g2 = """
    start: a
    a: "a" (b*|(c d)+) "b"?
    b: "b"
    c: "c"
    d: "+" | "-"
    """
    # print load_grammar(g)
    print(GrammarLoader().load_grammar(g))


if __name__ == '__main__':
    test()