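# Grammar loader for Lark: parses a grammar definition written in an
# EBNF-like notation and produces the token definitions and BNF rules needed
# to build a parser. The TOKENS/RULES tables below describe the grammar of
# the grammar language itself; GrammarLoader.load_grammar() uses them to
# turn a grammar string into (tokens, rules).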

import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}

# Grammar Parser
TOKENS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*?](?![a-z])',
    '_COLON': ':',
    '_OR': r'\|',
    '_DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(?!/).*?[^\\]/",
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    '_TO': '->',
}

RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP'],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    'anontoken': ['tokenvalue'],

    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
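

# Converts EBNF operators (?, +, *) inside rule expansions into plain BNF,
# introducing helper recursion rules (collected in new_rules) as needed.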
class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op
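

# Simplifies rule trees in place: flattens nested nodes of the same type,
# expands in-rule alternations (a : b (c|d) e  becomes  a : b c e | b d e),
# and copies aliases down onto each resulting expansion.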
class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )
        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i == j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten


def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v


class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]
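

# Yields a (name, value, mods) triple for every token definition in the
# parsed grammar tree, rejecting duplicate and reserved (double-underscore)
# names.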
def get_tokens(tree, token_set):
    tokens = []
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]
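

# Replaces anonymous tokens (strings and regexps written inline in rules)
# with named token definitions, reusing an existing user-defined token when
# one with the same value already exists.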
class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name

        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            value = token.value
            self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))
            assert value not in self.token_reverse
            self.token_reverse[value] = token_name

        return Token('TOKEN', token_name, -1)
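

# Builds a LALR parser for the meta-grammar above and drives the loading
# pipeline: lex and parse the grammar text, collect token definitions,
# convert EBNF rules to BNF, simplify them, and verify that every symbol
# used is actually defined.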
class GrammarLoader:
    def __init__(self):
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        try:
            token_stream = list(self.lexer.lex(grammar_text+"\n"))
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))

        try:
            tree = self.simplify_tree.transform( self.parser.parse(token_stream) )
        except UnexpectedToken as e:
            if '_COLON' in e.expected:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif 'tokenvalue' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            raise

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree) # Adds to tokens

        tokens2 = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                value = unicode_escape(value)[0]
            tokens2.append((name, token.type, value, flags))

        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, type_, value, flags in tokens2:
            if type_ == 'STRING':
                str_tokens.append((name, value, flags))
            else:
                assert type_ == 'REGEXP'
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)

                re_tokens.append((name, value, flags))
                token_ref[name] = value

        embedded_strs = set()
        for re_name, re_value, re_flags in re_tokens:
            unless = {}
            for str_name, str_value, _sf in str_tokens:
                m = re.match(re_value, str_value)
                if m and m.group(0) == str_value:
                    assert not _sf, "You just broke Lark! Please email me with your grammar"
                    embedded_strs.add(str_name)
                    unless[str_value] = str_name
            if unless:
                re_flags.append(('unless', unless))

        str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]

        str_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        re_tokens.sort(key=lambda x: len(x[1]), reverse=True)
        tokens = str_tokens + re_tokens # Order is important!

        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)

            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules


load_grammar = GrammarLoader().load_grammar


def test():
    g = r"""
    start: add

    // Rules
    add: mul
       | add _add_sym mul

    mul: [mul _add_mul] _atom

    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"

    WS.ignore.newline: /\s+/
    """

    g2 = """
    start: a
    a: "a" (b*|(c d)+) "b"?
    b: "b"
    c: "c"
    d: "+" | "-"
    """
    # print load_grammar(g)
    print(GrammarLoader().load_grammar(g))
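

# A minimal usage sketch: example() and the grammar string below are
# illustrative only, and rely solely on the module-level load_grammar
# defined above.
def example():
    calc = """
    start: sum
    sum: NUMBER
       | sum _ADD NUMBER

    NUMBER: /[0-9]+/
    _ADD: "+"
    WS.ignore: / +/
    """
    tokens, rules = load_grammar(calc)
    # tokens is a list of (name, pattern, flags) tuples, string tokens first
    # and then regexps, each sorted by descending length; rules maps each
    # rule name to its list of (expansion, alias) pairs.
    print(tokens)
    print(rules)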


if __name__ == '__main__':
    test()