This repo contains the code used to mirror other repos, as well as the code being mirrored.

import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->'
}

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],

    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
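
# The meta-grammar above describes grammar files such as the one in test() below, e.g.:
#
#   add: mul
#      | add _add_sym mul
#   _add_sym: "+" | "-"
#   NUMBER: /[\d.]+/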


class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op


class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten


def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v


class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]


def get_tokens(tree, token_set):
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]


class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0
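
    # Naming scheme for anonymous tokens: "(" becomes __LPAR (via _TOKEN_NAMES),
    # an alphanumeric string such as "while" becomes __WHILE, any other string
    # falls back to __ANONSTR_<n>, and an anonymous regexp becomes ANONRE_<n>.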

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            value = token.value
            self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))
            assert value not in self.token_reverse
            self.token_reverse[value] = token_name

        return Token('TOKEN', token_name, -1)


class GrammarLoader:
    def __init__(self):
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        try:
            token_stream = list(self.lexer.lex(grammar_text + "\n"))
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))

        try:
            tree = self.simplify_tree.transform( self.parser.parse(token_stream) )
        except UnexpectedToken as e:
            if '_COLON' in e.expected:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'tokenvalue' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            raise

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)  # Adds to tokens

        tokens2 = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                value = unicode_escape(value)[0]
            tokens2.append((name, token.type, value, flags))

        token_ref = {}
        re_tokens = []
        str_tokens = []
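        # Resolve '${NAME}' references inside regexp tokens below: each reference is
        # replaced with the pattern of the already-defined token NAME (illustrative
        # example, not from this file: FLOAT: /${INT}\.${INT}/ expands INT's pattern).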
        for name, type_, value, flags in tokens2:
            if type_ == 'STRING':
                str_tokens.append((name, value, flags))
            else:
                assert type_ == 'REGEXP'
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                re_tokens.append((name, value, flags))
                token_ref[name] = value

        embedded_strs = set()
        for re_name, re_value, re_flags in re_tokens:
            unless = {}
            for str_name, str_value, _sf in str_tokens:
                m = re.match(re_value, str_value)
                if m and m.group(0) == str_value:
                    assert not _sf, "You just broke Lark! Please email me with your grammar"
                    embedded_strs.add(str_name)
                    unless[str_value] = str_name
            if unless:
                re_flags.append(('unless', unless))

        str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]

        str_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        re_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        tokens = str_tokens + re_tokens  # Order is important!
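        # The combined list keeps literal strings before regexps, longest pattern first
        # within each group; the lexer relies on this ordering when matching.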

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)

            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules


load_grammar = GrammarLoader().load_grammar
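# load_grammar() returns a pair (tokens, rules):
#   tokens: a list of (name, pattern, flags) triples
#   rules:  a dict mapping each rule name to a list of (expansion, alias) pairs,
#           where each expansion is a list of symbol names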


def test():
    g = r"""
    start: add

    // Rules
    add: mul
       | add _add_sym mul

    mul: [mul _add_mul] _atom

    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"

    WS.ignore.newline: /\s+/
    """

    g2 = """
    start: a
    a: "a" (b*|(c d)+) "b"?

    b: "b"
    c: "c"
    d: "+" | "-"
    """
    # print load_grammar(g)
    print(GrammarLoader().load_grammar(g))


if __name__ == '__main__':
    test()
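
For reference, a minimal usage sketch (not part of the file above). It assumes the module is importable as lark.load_grammar, the package layout implied by the relative imports; the grammar text here is only illustrative:

    from lark.load_grammar import load_grammar  # assumed import path

    # Returns (tokens, rules); see the comments above load_grammar for their shapes.
    tokens, rules = load_grammar('start: "a" | NUM\nNUM: /[0-9]+/\n')

    for name, pattern, flags in tokens:
        print(name, pattern, flags)
    for origin, expansions in rules.items():
        print(origin, expansions)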