This repository contains the code used to mirror other repositories, as well as the code being mirrored.

705 lines
24 KiB

  1. "Parses and creates Grammar objects"
  2. import os.path
  3. from itertools import chain
  4. import re
  5. from ast import literal_eval
  6. from copy import deepcopy
  7. from .lexer import Token, UnexpectedInput
  8. from .parse_tree_builder import ParseTreeBuilder
  9. from .parser_frontends import LALR
  10. from .parsers.lalr_parser import UnexpectedToken
  11. from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
  12. from .grammar import RuleOptions, Rule
  13. from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST
  14. __path__ = os.path.dirname(__file__)
  15. IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
  16. _RE_FLAGS = 'imslux'
  17. _TOKEN_NAMES = {
  18. '.' : 'DOT',
  19. ',' : 'COMMA',
  20. ':' : 'COLON',
  21. ';' : 'SEMICOLON',
  22. '+' : 'PLUS',
  23. '-' : 'MINUS',
  24. '*' : 'STAR',
  25. '/' : 'SLASH',
  26. '\\' : 'BACKSLASH',
  27. '|' : 'VBAR',
  28. '?' : 'QMARK',
  29. '!' : 'BANG',
  30. '@' : 'AT',
  31. '#' : 'HASH',
  32. '$' : 'DOLLAR',
  33. '%' : 'PERCENT',
  34. '^' : 'CIRCUMFLEX',
  35. '&' : 'AMPERSAND',
  36. '_' : 'UNDERSCORE',
  37. '<' : 'LESSTHAN',
  38. '>' : 'MORETHAN',
  39. '=' : 'EQUAL',
  40. '"' : 'DBLQUOTE',
  41. '\'' : 'QUOTE',
  42. '`' : 'BACKQUOTE',
  43. '~' : 'TILDE',
  44. '(' : 'LPAR',
  45. ')' : 'RPAR',
  46. '{' : 'LBRACE',
  47. '}' : 'RBRACE',
  48. '[' : 'LSQB',
  49. ']' : 'RSQB',
  50. '\n' : 'NEWLINE',
  51. '\r\n' : 'CRLF',
  52. '\t' : 'TAB',
  53. ' ' : 'SPACE',
  54. }
  55. # Grammar Parser
  56. TOKENS = {
  57. '_LPAR': r'\(',
  58. '_RPAR': r'\)',
  59. '_LBRA': r'\[',
  60. '_RBRA': r'\]',
  61. 'OP': '[+*][?]?|[?](?![a-z])',
  62. '_COLON': ':',
  63. '_OR': r'\|',
  64. '_DOT': r'\.',
  65. 'TILDE': '~',
  66. 'RULE': '!?[_?]?[a-z][_a-z0-9]*',
  67. 'TOKEN': '_?[A-Z][_A-Z0-9]*',
  68. 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
  69. 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
  70. '_NL': r'(\r?\n)+\s*',
  71. 'WS': r'[ \t]+',
  72. 'COMMENT': r'//[^\n]*',
  73. '_TO': '->',
  74. '_IGNORE': r'%ignore',
  75. '_IMPORT': r'%import',
  76. 'NUMBER': r'\d+',
  77. }
  78. RULES = {
  79. 'start': ['_list'],
  80. '_list': ['_item', '_list _item'],
  81. '_item': ['rule', 'token', 'statement', '_NL'],
  82. 'rule': ['RULE _COLON expansions _NL',
  83. 'RULE _DOT NUMBER _COLON expansions _NL'],
  84. 'expansions': ['alias',
  85. 'expansions _OR alias',
  86. 'expansions _NL _OR alias'],
  87. '?alias': ['expansion _TO RULE', 'expansion'],
  88. 'expansion': ['_expansion'],
  89. '_expansion': ['', '_expansion expr'],
  90. '?expr': ['atom',
  91. 'atom OP',
  92. 'atom TILDE NUMBER',
  93. 'atom TILDE NUMBER _DOT _DOT NUMBER',
  94. ],
  95. '?atom': ['_LPAR expansions _RPAR',
  96. 'maybe',
  97. 'name',
  98. 'literal',
  99. 'range'],
  100. '?name': ['RULE', 'TOKEN'],
  101. 'maybe': ['_LBRA expansions _RBRA'],
  102. 'range': ['STRING _DOT _DOT STRING'],
  103. 'token': ['TOKEN _COLON expansions _NL',
  104. 'TOKEN _DOT NUMBER _COLON expansions _NL'],
  105. 'statement': ['ignore', 'import'],
  106. 'ignore': ['_IGNORE expansions _NL'],
  107. 'import': ['_IMPORT import_args _NL',
  108. '_IMPORT import_args _TO TOKEN'],
  109. 'import_args': ['_import_args'],
  110. '_import_args': ['name', '_import_args _DOT name'],
  111. 'literal': ['REGEXP', 'STRING'],
  112. }
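# Rewrites the EBNF operators (?, +, * and ~n..m repetition) into plain BNF,
# generating helper rules where needed; new rules accumulate in self.new_rules.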
class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = []
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0
        self.rule_options = None

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
        self.new_rules.append((new_name, tree, self.rule_options))
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op, *args):
        if op.value == '?':
            return ST('expansions', [rule, ST('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return ST('expansions', [new_name, ST('expansion', [])])
        elif op.value == '~':
            if len(args) == 1:
                mn = mx = int(args[0])
            else:
                mn, mx = map(int, args)
                if mx < mn:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
        assert False, op
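# Normalizes rule trees in place: flattens nested nodes of the same kind and
# distributes in-rule alternatives, e.g. "a : b (c|d) e" becomes "a : b c e | b d e".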
class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, Tree) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )
        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, Tree) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(ST('expansion', [option if i==j else other
                                                                 for j, other in enumerate(tree.children)]))
                                     for option in set(child.children)]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(ST('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    def expansions(self, tree):
        self._flatten(tree)
        tree.children = list(set(tree.children))
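# Converts a simplified rule tree into a list of (symbol-name list, alias) pairs.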
class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return [sym.value for sym in symbols], None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value
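# Canonizes the freshly parsed grammar tree, e.g. rewriting the "maybe" form [x]
# into the equivalent x? expression.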
class CanonizeTree(InlineTransformer):
    def maybe(self, expr):
        return ST('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]


class ExtractAnonTokens(InlineTransformer):
    "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them"

    def __init__(self, tokens):
        self.tokens = tokens
        self.token_set = {td.name for td in self.tokens}
        self.token_reverse = {td.pattern: td for td in tokens}
        self.i = 0

    def pattern(self, p):
        value = p.value
        if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

        if isinstance(p, PatternStr):
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[p].name
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set:
                        token_name = '%s%d' % (value.upper(), self.i)
                        try:
                            # Make sure we don't have unicode in our token names
                            token_name.encode('ascii')
                        except UnicodeEncodeError:
                            token_name = 'ANONSTR_%d' % self.i
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
                token_name = '__' + token_name

        elif isinstance(p, PatternRE):
            if p in self.token_reverse:   # Kind of a weird placement
                token_name = self.token_reverse[p].name
            else:
                token_name = 'ANONRE_%d' % self.i
                self.i += 1
        else:
            assert False, p

        if token_name not in self.token_set:
            assert p not in self.token_reverse
            self.token_set.add(token_name)
            tokendef = TokenDef(token_name, p)
            self.token_reverse[p] = tokendef
            self.tokens.append(tokendef)

        return Token('TOKEN', token_name, -1)
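# String helpers for literal handling: _fix_escaping normalizes backslash escapes
# so the literal's contents can be safely evaluated with literal_eval.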
def _rfind(s, choices):
    return max(s.rfind(c) for c in choices)


def _fix_escaping(s):
    w = ''
    i = iter(s)
    for n in i:
        w += n
        if n == '\\':
            n2 = next(i)
            if n2 == '\\':
                w += '\\\\'
            elif n2 not in 'unftr':
                w += '\\'
            w += n2
    w = w.replace('\\"', '"').replace("'", "\\'")

    to_eval = "u'''%s'''" % w
    try:
        s = literal_eval(to_eval)
    except SyntaxError as e:
        raise ValueError(s, e)

    return s


def _literal_to_pattern(literal):
    v = literal.value
    flag_start = _rfind(v, '/"')+1
    assert flag_start > 0
    flags = v[flag_start:]
    assert all(f in _RE_FLAGS for f in flags), flags

    v = v[:flag_start]
    assert v[0] == v[-1] and v[0] in '"/'
    x = v[1:-1]

    s = _fix_escaping(x)

    if v[0] == '"':
        s = s.replace('\\\\', '\\')

    return { 'STRING': PatternStr,
             'REGEXP': PatternRE }[literal.type](s, flags)
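# Turns literal and range nodes into 'pattern' trees holding PatternStr/PatternRE objects.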
class PrepareLiterals(InlineTransformer):
    def literal(self, literal):
        return ST('pattern', [_literal_to_pattern(literal)])

    def range(self, start, end):
        assert start.type == end.type == 'STRING'
        start = start.value[1:-1]
        end = end.value[1:-1]
        assert len(start) == len(end) == 1, (start, end, len(start), len(end))
        regexp = '[%s-%s]' % (start, end)
        return ST('pattern', [PatternRE(regexp)])


class SplitLiterals(InlineTransformer):
    def pattern(self, p):
        if isinstance(p, PatternStr) and len(p.value)>1:
            return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
        return ST('pattern', [p])
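# Collapses a token's expression tree into a single pattern, joining sub-patterns
# into one regexp and mapping repetition operators to regexp quantifiers.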
class TokenTreeToPattern(Transformer):
    def pattern(self, ps):
        p ,= ps
        return p

    def expansion(self, items):
        if len(items) == 1:
            return items[0]
        if len({i.flags for i in items}) > 1:
            raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags)

    def expansions(self, exps):
        if len(exps) == 1:
            return exps[0]
        if len({i.flags for i in exps}) > 1:
            raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)

    def expr(self, args):
        inner, op = args[:2]
        if op == '~':
            if len(args) == 3:
                op = "{%d}" % int(args[2])
            else:
                mn, mx = map(int, args[2:])
                if mx < mn:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx))
                op = "{%d,%d}" % (mn, mx)
        else:
            assert len(args) == 2
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)

    def alias(self, t):
        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")


def _interleave(l, item):
    for e in l:
        yield e
        if isinstance(e, Tree):
            if e.data in ('literal', 'range'):
                yield item
        elif is_terminal(e):
            yield item


def _choice_of_rules(rules):
    return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
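# Holds the parsed token and rule definitions and compiles them into TokenDef and
# Rule objects; in scanless mode (lexer=False), tokens are first rewritten as rules.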
class Grammar:
    def __init__(self, rule_defs, token_defs, ignore):
        self.token_defs = token_defs
        self.rule_defs = rule_defs
        self.ignore = ignore

    def _prepare_scanless_grammar(self, start):
        # XXX Pretty hacky! There should be a better way to write this method..

        rule_defs = deepcopy(self.rule_defs)
        term_defs = self.token_defs

        # Implement the "%ignore" feature without a lexer..
        terms_to_ignore = {name:'__'+name for name in self.ignore}
        if terms_to_ignore:
            assert set(terms_to_ignore) <= {name for name, _t in term_defs}
            term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
            expr = Token('RULE', '__ignore')
            for r, tree, _o in rule_defs:
                for exp in tree.find_data('expansion'):
                    exp.children = list(_interleave(exp.children, expr))
                    if r == start:
                        exp.children = [expr] + exp.children
                for exp in tree.find_data('expr'):
                    exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr)))

            _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
            rule_defs.append(('__ignore', _ignore_tree, None))

        # Convert all tokens to rules
        new_terminal_names = {name: '__token_'+name for name, _t in term_defs}

        for name, tree, options in rule_defs:
            for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
                for i, sym in enumerate(exp.children):
                    if sym in new_terminal_names:
                        exp.children[i] = Token(sym.type, new_terminal_names[sym])

        for name, (tree, priority) in term_defs:   # TODO transfer priority to rule?
            if any(tree.find_data('alias')):
                raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")

            if name.startswith('_'):
                options = RuleOptions(filter_out=True, priority=-priority)
            else:
                options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority)

            name = new_terminal_names[name]
            inner_name = name + '_inner'
            rule_defs.append((name, _choice_of_rules([inner_name]), None))
            rule_defs.append((inner_name, tree, options))

        return [], rule_defs

    def compile(self, lexer=False, start=None):
        if not lexer:
            token_defs, rule_defs = self._prepare_scanless_grammar(start)
        else:
            token_defs = list(self.token_defs)
            rule_defs = self.rule_defs

        # =================
        #  Compile Tokens
        # =================

        # Convert token-trees to strings/regexps
        transformer = PrepareLiterals() * TokenTreeToPattern()
        tokens = [TokenDef(name, transformer.transform(token_tree), priority)
                  for name, (token_tree, priority) in token_defs]

        # =================
        #  Compile Rules
        # =================

        # 1. Pre-process terminals
        transformer = PrepareLiterals()
        if not lexer:
            transformer *= SplitLiterals()
        transformer *= ExtractAnonTokens(tokens)   # Adds to tokens

        # 2. Convert EBNF to BNF (and apply step 1)
        ebnf_to_bnf = EBNF_to_BNF()
        rules = []
        for name, rule_tree, options in rule_defs:
            ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
            tree = transformer.transform(rule_tree)
            rules.append((name, ebnf_to_bnf.transform(tree), options))
        rules += ebnf_to_bnf.new_rules

        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

        # 3. Compile tree to Rule objects
        rule_tree_to_text = RuleTreeToText()
        simplify_rule = SimplifyRule_Visitor()

        compiled_rules = []
        for name, tree, options in rules:
            simplify_rule.visit(tree)
            expansions = rule_tree_to_text.transform(tree)

            for expansion, alias in expansions:
                if alias and name.startswith('_'):
                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

                rule = Rule(name, expansion, alias, options)
                compiled_rules.append(rule)

        return tokens, compiled_rules, self.ignore
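# Cache of grammars loaded through %import, keyed by grammar path; each grammar is
# read from IMPORT_PATHS on first use and reused afterwards.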
_imported_grammars = {}
def import_grammar(grammar_path):
    if grammar_path not in _imported_grammars:
        for import_path in IMPORT_PATHS:
            with open(os.path.join(import_path, grammar_path)) as f:
                text = f.read()
            grammar = load_grammar(text, grammar_path)
            _imported_grammars[grammar_path] = grammar

    return _imported_grammars[grammar_path]
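# Repeatedly inlines token-to-token references inside token definitions until
# nothing changes; cycle detection is still a TODO.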
def resolve_token_references(token_defs):
    # TODO Cycles detection
    # TODO Solve with transitive closure (maybe)

    token_dict = {k:t for k, (t,_p) in token_defs}
    assert len(token_dict) == len(token_defs), "Same name defined twice?"

    while True:
        changed = False
        for name, (token_tree, _p) in token_defs:
            for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
                for i, item in enumerate(exp.children):
                    if isinstance(item, Token):
                        if item.type == 'RULE':
                            raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
                        if item.type == 'TOKEN':
                            exp.children[i] = token_dict[item]
                            changed = True
        if not changed:
            break
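# Splits a raw rule definition into (name, expansions, RuleOptions), interpreting
# the optional priority and the '!' (keep all tokens) and '?' (expand1) name prefixes.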
def options_from_rule(name, *x):
    if len(x) > 1:
        priority, expansions = x
        priority = int(priority)
    else:
        expansions ,= x
        priority = None

    keep_all_tokens = name.startswith('!')
    name = name.lstrip('!')
    expand1 = name.startswith('?')
    name = name.lstrip('?')

    return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)
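# Parses grammar text with a LALR parser built from the TOKENS/RULES meta-grammar
# above, then validates the definitions and assembles a Grammar object.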
class GrammarLoader:
    def __init__(self):
        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]

        rules = [options_from_rule(name, x) for name, x in RULES.items()]
        rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
        callback = ParseTreeBuilder(rules, ST).create_callback()
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, 'start')
        self.parser = LALR(lexer_conf, parser_conf)

        self.canonize_tree = CanonizeTree()

    def load_grammar(self, grammar_text, name='<?>'):
        "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."

        try:
            tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
        except UnexpectedToken as e:
            if e.expected == ['_COLON']:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif e.expected == ['RULE']:
                raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column))
            elif 'STRING' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            elif e.expected == ['_OR']:
                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
            raise

        # Extract grammar items
        token_defs = [c.children for c in tree.children if c.data=='token']
        rule_defs = [c.children for c in tree.children if c.data=='rule']
        statements = [c.children for c in tree.children if c.data=='statement']
        assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)

        token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
        token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]

        # Execute statements
        ignore = []
        for (stmt,) in statements:
            if stmt.data == 'ignore':
                t ,= stmt.children
                ignore.append(t)
            elif stmt.data == 'import':
                dotted_path = stmt.children[0].children
                name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
                grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
                g = import_grammar(grammar_path)
                token_options = dict(g.token_defs)[dotted_path[-1]]
                assert isinstance(token_options, tuple) and len(token_options)==2
                token_defs.append([name.value, token_options])
            else:
                assert False, stmt

        # Verify correctness 1
        for name, _ in token_defs:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

        # Handle ignore tokens
        # XXX A slightly hacky solution. Recognition of %ignore TOKEN as separate comes from the lexer's
        #     inability to handle duplicate tokens (two names, one value)
        ignore_names = []
        for t in ignore:
            if t.data=='expansions' and len(t.children) == 1:
                t2 ,= t.children
                if t2.data=='expansion' and len(t2.children) == 1:
                    item ,= t2.children
                    if isinstance(item, Token) and item.type == 'TOKEN':
                        ignore_names.append(item.value)
                        continue

            name = '__IGNORE_%d'% len(ignore_names)
            ignore_names.append(name)
            token_defs.append((name, (t, 0)))

        # Verify correctness 2
        token_names = set()
        for name, _ in token_defs:
            if name in token_names:
                raise GrammarError("Token '%s' defined more than once" % name)
            token_names.add(name)

        if set(ignore_names) - token_names:
            raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))

        # Resolve token references
        resolve_token_references(token_defs)

        rules = [options_from_rule(*x) for x in rule_defs]

        rule_names = set()
        for name, _x, _o in rules:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
            if name in rule_names:
                raise GrammarError("Rule '%s' defined more than once" % name)
            rule_names.add(name)

        for name, expansions, _o in rules:
            used_symbols = {t for x in expansions.find_data('expansion')
                              for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
            for sym in used_symbols:
                if is_terminal(sym):
                    if sym not in token_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                else:
                    if sym not in rule_names:
                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

        # TODO don't include unused tokens, they can only cause trouble!

        return Grammar(rules, token_defs, ignore_names)


load_grammar = GrammarLoader().load_grammar
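For orientation, here is a minimal usage sketch of this module's entry point. It is an untested illustration and assumes the file above is importable as lark.load_grammar from a Lark package of the same vintage; the grammar text, the '<example>' label, and the variable names are made up for the example.

# Untested sketch; assumes this module is importable as lark.load_grammar.
from lark.load_grammar import load_grammar

grammar = load_grammar('start: "a" "b"\n', '<example>')             # parse + verify, returns a Grammar
tokens, rules, ignore = grammar.compile(lexer=True, start='start')  # TokenDefs, Rules, ignored token names
for rule in rules:
    print(rule)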