This repo contains code to mirror other repos, as well as the code being mirrored.

import re
import codecs

from .lexer import Lexer, Token
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
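
# These names give anonymous string tokens a readable identity: ExtractAnonTokens
# (defined below) turns the literal "+" into the token __PLUS, ";" into __SEMICOLON, etc.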

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->',
}

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],

    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
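
# The TOKENS/RULES tables above form the meta-grammar used to parse grammar
# definitions themselves. For example, a definition like
#
#   add: mul
#      | add _add_sym mul
#
# parses (roughly) into expansions(expansion(mul), expansion(add, _add_sym, mul)),
# which the transformers below simplify and convert to plain BNF.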


class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op


class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten


def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v


class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]


def get_tokens(tree, token_set):
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]
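
# Each explicit token definition yields (name, value-token, flags); the flags come
# from dotted modifiers, e.g. `WS.ignore.newline: /\s+/` carries 'ignore' and 'newline'.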


class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)
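
# Anonymous tokens get deterministic names: a string literal reuses the user-defined
# token of the same value if one exists, otherwise it becomes e.g. __PLUS ("+"),
# __FOO ("foo") or __ANONSTR_0; anonymous regexps become ANONRE_0, ANONRE_1, ...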


class GrammarLoader:
    def __init__(self):
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        token_stream = list(self.lexer.lex(grammar_text + "\n"))
        tree = self.simplify_tree.transform(self.parser.parse(token_stream))

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)  # Adds to tokens

        tokens2 = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                value = unicode_escape(value)[0]
            tokens2.append((name, token.type, value, flags))

        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, type_, value, flags in tokens2:
            if type_ == 'STRING':
                str_tokens.append((name, value, flags))
            else:
                assert type_ == 'REGEXP'
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                re_tokens.append((name, value, flags))
                token_ref[name] = value

        embedded_strs = set()
        for re_name, re_value, re_flags in re_tokens:
            unless = {}
            for str_name, str_value, _sf in str_tokens:
                m = re.match(re_value, str_value)
                if m and m.group(0) == str_value:
                    assert not _sf, "You just broke Lark! Please email me with your grammar"
                    embedded_strs.add(str_name)
                    unless[str_value] = str_name
            if unless:
                re_flags.append(('unless', unless))

        str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]

        str_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        re_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        tokens = str_tokens + re_tokens  # Order is important!
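        # Note: the lexer appears to try patterns in list order, so longer literals are
        # placed before shorter ones (e.g. "==" before "=") and literal strings before
        # the regexps that remain.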

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)

            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules


load_grammar = GrammarLoader().load_grammar


def test():
    g = r"""
    start: add

    // Rules
    add: mul
       | add _add_sym mul

    mul: [mul _add_mul] _atom

    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"

    WS.ignore.newline: /\s+/
    """

    g2 = """
    start: a
    a: "a" (b*|(c d)+) "b"?
    b: "b"
    c: "c"
    d: "+" | "-"
    """
    # print load_grammar(g)
    print(GrammarLoader().load_grammar(g))


if __name__ == '__main__':
    test()