This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You can select up to 25 topics. Topics must start with a letter or number, may include dashes ('-'), and must be 35 characters or fewer.

1002 lines
37 KiB

  1. "Parses and creates Grammar objects"
  2. import os.path
  3. import sys
  4. from copy import copy, deepcopy
  5. from io import open
  6. from .utils import bfs, eval_escaping, Py36, logger, classify_bool
  7. from .lexer import Token, TerminalDef, PatternStr, PatternRE
  8. from .parse_tree_builder import ParseTreeBuilder
  9. from .parser_frontends import LALR_TraditionalLexer
  10. from .common import LexerConf, ParserConf
  11. from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
  12. from .utils import classify, suppress, dedup_list, Str
  13. from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
  14. from .tree import Tree, SlottedTree as ST
  15. from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive
# Decorator shorthand: transformer callbacks receive children as positional arguments.
inline_args = v_args(inline=True)

__path__ = os.path.dirname(__file__)
# Package-relative directories searched for %import-ed grammars.
IMPORT_PATHS = ['grammars']

# File extension of lark grammar files.
EXT = '.lark'

# Letters accepted as trailing flags on STRING/REGEXP literals (passed to `re`).
_RE_FLAGS = 'imslux'

# Placeholder marking an empty slot inside an expansion (see EBNF_to_BNF.maybe).
_EMPTY = Symbol('__empty__')
# Friendly names for punctuation characters, used when auto-naming anonymous
# string terminals (see PrepareAnonTerminals.pattern).
_TERMINAL_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
# Grammar Parser
# Terminals of the bootstrap .lark meta-grammar: name -> regexp pattern.
TERMINALS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    '_LBRACE': r'\{',
    '_RBRACE': r'\}',
    'OP': '[+*]|[?](?![a-z])',  # '?' is only an operator when not starting a rule name
    '_COLON': ':',
    '_COMMA': ',',
    '_OR': r'\|',
    '_DOT': r'\.(?!\.)',
    '_DOTDOT': r'\.\.',
    'TILDE': '~',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TERMINAL': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'\s*//[^\n]*',
    '_TO': '->',
    '_IGNORE': r'%ignore',
    '_DECLARE': r'%declare',
    '_IMPORT': r'%import',
    'NUMBER': r'[+-]?\d+',
}
# Rules of the bootstrap .lark meta-grammar: rule name -> list of expansions,
# each expansion being a space-separated string of symbol names.
RULES = {
    'start': ['_list'],
    '_list':  ['_item', '_list _item'],
    '_item':  ['rule', 'term', 'statement', '_NL'],

    'rule': ['RULE template_params _COLON expansions _NL',
             'RULE template_params _DOT NUMBER _COLON expansions _NL'],
    'template_params': ['_LBRACE _template_params _RBRACE',
                        ''],
    '_template_params': ['RULE',
                         '_template_params _COMMA RULE'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias':     ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP',
              'atom TILDE NUMBER',
              'atom TILDE NUMBER _DOTDOT NUMBER',
              ],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'value'],
    'value': ['terminal',
              'nonterminal',
              'literal',
              'range',
              'template_usage'],

    'terminal': ['TERMINAL'],
    'nonterminal': ['RULE'],

    '?name': ['RULE', 'TERMINAL'],

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOTDOT STRING'],

    'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
    '_template_args': ['value',
                       '_template_args _COMMA value'],

    'term': ['TERMINAL _COLON expansions _NL',
             'TERMINAL _DOT NUMBER _COLON expansions _NL'],
    'statement': ['ignore', 'import', 'declare'],
    'ignore': ['_IGNORE expansions _NL'],
    'declare': ['_DECLARE _declare_args _NL'],
    'import': ['_IMPORT _import_path _NL',
               '_IMPORT _import_path _LPAR name_list _RPAR _NL',
               '_IMPORT _import_path _TO name _NL'],

    '_import_path': ['import_lib', 'import_rel'],
    'import_lib': ['_import_args'],
    'import_rel': ['_DOT _import_args'],
    '_import_args': ['name', '_import_args _DOT name'],

    'name_list': ['_name_list'],
    '_name_list': ['name', '_name_list _COMMA name'],

    '_declare_args': ['name', '_declare_args name'],
    'literal': ['REGEXP', 'STRING'],
}
  142. @inline_args
  143. class EBNF_to_BNF(Transformer_InPlace):
  144. def __init__(self):
  145. self.new_rules = []
  146. self.rules_by_expr = {}
  147. self.prefix = 'anon'
  148. self.i = 0
  149. self.rule_options = None
  150. def _add_recurse_rule(self, type_, expr):
  151. if expr in self.rules_by_expr:
  152. return self.rules_by_expr[expr]
  153. new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
  154. self.i += 1
  155. t = NonTerminal(new_name)
  156. tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
  157. self.new_rules.append((new_name, tree, self.rule_options))
  158. self.rules_by_expr[expr] = t
  159. return t
  160. def expr(self, rule, op, *args):
  161. if op.value == '?':
  162. empty = ST('expansion', [])
  163. return ST('expansions', [rule, empty])
  164. elif op.value == '+':
  165. # a : b c+ d
  166. # -->
  167. # a : b _c d
  168. # _c : _c c | c;
  169. return self._add_recurse_rule('plus', rule)
  170. elif op.value == '*':
  171. # a : b c* d
  172. # -->
  173. # a : b _c? d
  174. # _c : _c c | c;
  175. new_name = self._add_recurse_rule('star', rule)
  176. return ST('expansions', [new_name, ST('expansion', [])])
  177. elif op.value == '~':
  178. if len(args) == 1:
  179. mn = mx = int(args[0])
  180. else:
  181. mn, mx = map(int, args)
  182. if mx < mn or mn < 0:
  183. raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
  184. return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
  185. assert False, op
  186. def maybe(self, rule):
  187. keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens
  188. def will_not_get_removed(sym):
  189. if isinstance(sym, NonTerminal):
  190. return not sym.name.startswith('_')
  191. if isinstance(sym, Terminal):
  192. return keep_all_tokens or not sym.filter_out
  193. assert False
  194. if any(rule.scan_values(will_not_get_removed)):
  195. empty = _EMPTY
  196. else:
  197. empty = ST('expansion', [])
  198. return ST('expansions', [rule, empty])
  199. class SimplifyRule_Visitor(Visitor):
  200. @staticmethod
  201. def _flatten(tree):
  202. while True:
  203. to_expand = [i for i, child in enumerate(tree.children)
  204. if isinstance(child, Tree) and child.data == tree.data]
  205. if not to_expand:
  206. break
  207. tree.expand_kids_by_index(*to_expand)
  208. def expansion(self, tree):
  209. # rules_list unpacking
  210. # a : b (c|d) e
  211. # -->
  212. # a : b c e | b d e
  213. #
  214. # In AST terms:
  215. # expansion(b, expansions(c, d), e)
  216. # -->
  217. # expansions( expansion(b, c, e), expansion(b, d, e) )
  218. self._flatten(tree)
  219. for i, child in enumerate(tree.children):
  220. if isinstance(child, Tree) and child.data == 'expansions':
  221. tree.data = 'expansions'
  222. tree.children = [self.visit(ST('expansion', [option if i==j else other
  223. for j, other in enumerate(tree.children)]))
  224. for option in dedup_list(child.children)]
  225. self._flatten(tree)
  226. break
  227. def alias(self, tree):
  228. rule, alias_name = tree.children
  229. if rule.data == 'expansions':
  230. aliases = []
  231. for child in tree.children[0].children:
  232. aliases.append(ST('alias', [child, alias_name]))
  233. tree.data = 'expansions'
  234. tree.children = aliases
  235. def expansions(self, tree):
  236. self._flatten(tree)
  237. # Ensure all children are unique
  238. if len(set(tree.children)) != len(tree.children):
  239. tree.children = dedup_list(tree.children) # dedup is expensive, so try to minimize its use
  240. class RuleTreeToText(Transformer):
  241. def expansions(self, x):
  242. return x
  243. def expansion(self, symbols):
  244. return symbols, None
  245. def alias(self, x):
  246. (expansion, _alias), alias = x
  247. assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed
  248. return expansion, alias.value
  249. @inline_args
  250. class CanonizeTree(Transformer_InPlace):
  251. def tokenmods(self, *args):
  252. if len(args) == 1:
  253. return list(args)
  254. tokenmods, value = args
  255. return tokenmods + [value]
class PrepareAnonTerminals(Transformer_InPlace):
    "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"

    def __init__(self, terminals):
        # `terminals` is mutated: new anonymous TerminalDefs are appended to it.
        self.terminals = terminals
        self.term_set = {td.name for td in self.terminals}
        self.term_reverse = {td.pattern: td for td in terminals}
        self.i = 0
        self.rule_options = None  # set per-rule by Grammar.compile

    @inline_args
    def pattern(self, p):
        """Replace a pattern node with a Terminal symbol, creating a named
        terminal definition for it on first sight."""
        value = p.value
        if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags:
            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

        term_name = None

        if isinstance(p, PatternStr):
            try:
                # If already defined, use the user-defined terminal name
                term_name = self.term_reverse[p].name
            except KeyError:
                # Try to assign an indicative anon-terminal name
                try:
                    term_name = _TERMINAL_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
                        with suppress(UnicodeEncodeError):
                            value.upper().encode('ascii')  # Make sure we don't have unicode in our terminal names
                            term_name = value.upper()

                # The chosen name may clash with an existing terminal; fall back.
                if term_name in self.term_set:
                    term_name = None

        elif isinstance(p, PatternRE):
            if p in self.term_reverse:  # Kind of a weird placement.name
                term_name = self.term_reverse[p].name
        else:
            assert False, p

        if term_name is None:
            term_name = '__ANON_%d' % self.i
            self.i += 1

        if term_name not in self.term_set:
            assert p not in self.term_reverse
            self.term_set.add(term_name)
            termdef = TerminalDef(term_name, p)
            self.term_reverse[p] = termdef
            self.terminals.append(termdef)

        # Plain string terminals are filtered out of the tree, unless the
        # enclosing rule asked to keep all tokens.
        filter_out = False if self.rule_options and self.rule_options.keep_all_tokens else isinstance(p, PatternStr)
        return Terminal(term_name, filter_out=filter_out)
  301. class _ReplaceSymbols(Transformer_InPlace):
  302. " Helper for ApplyTemplates "
  303. def __init__(self):
  304. self.names = {}
  305. def value(self, c):
  306. if len(c) == 1 and isinstance(c[0], Token) and c[0].value in self.names:
  307. return self.names[c[0].value]
  308. return self.__default__('value', c, None)
  309. def template_usage(self, c):
  310. if c[0] in self.names:
  311. return self.__default__('template_usage', [self.names[c[0]].name] + c[1:], None)
  312. return self.__default__('template_usage', c, None)
  313. class ApplyTemplates(Transformer_InPlace):
  314. " Apply the templates, creating new rules that represent the used templates "
  315. def __init__(self, rule_defs):
  316. self.rule_defs = rule_defs
  317. self.replacer = _ReplaceSymbols()
  318. self.created_templates = set()
  319. def template_usage(self, c):
  320. name = c[0]
  321. args = c[1:]
  322. result_name = "%s{%s}" % (name, ",".join(a.name for a in args))
  323. if result_name not in self.created_templates:
  324. self.created_templates.add(result_name)
  325. (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name)
  326. assert len(params) == len(args), args
  327. result_tree = deepcopy(tree)
  328. self.replacer.names = dict(zip(params, args))
  329. self.replacer.transform(result_tree)
  330. self.rule_defs.append((result_name, [], result_tree, deepcopy(options)))
  331. return NonTerminal(result_name)
  332. def _rfind(s, choices):
  333. return max(s.rfind(c) for c in choices)
  334. def _literal_to_pattern(literal):
  335. v = literal.value
  336. flag_start = _rfind(v, '/"')+1
  337. assert flag_start > 0
  338. flags = v[flag_start:]
  339. assert all(f in _RE_FLAGS for f in flags), flags
  340. if literal.type == 'STRING' and '\n' in v:
  341. raise GrammarError('You cannot put newlines in string literals')
  342. if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
  343. raise GrammarError('You can only use newlines in regular expressions '
  344. 'with the `x` (verbose) flag')
  345. v = v[:flag_start]
  346. assert v[0] == v[-1] and v[0] in '"/'
  347. x = v[1:-1]
  348. s = eval_escaping(x)
  349. if literal.type == 'STRING':
  350. s = s.replace('\\\\', '\\')
  351. return PatternStr(s, flags)
  352. elif literal.type == 'REGEXP':
  353. return PatternRE(s, flags)
  354. else:
  355. assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'
  356. @inline_args
  357. class PrepareLiterals(Transformer_InPlace):
  358. def literal(self, literal):
  359. return ST('pattern', [_literal_to_pattern(literal)])
  360. def range(self, start, end):
  361. assert start.type == end.type == 'STRING'
  362. start = start.value[1:-1]
  363. end = end.value[1:-1]
  364. assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1, (start, end, len(eval_escaping(start)), len(eval_escaping(end)))
  365. regexp = '[%s-%s]' % (start, end)
  366. return ST('pattern', [PatternRE(regexp)])
  367. def _make_joined_pattern(regexp, flags_set):
  368. # In Python 3.6, a new syntax for flags was introduced, that allows us to restrict the scope
  369. # of flags to a specific regexp group. We are already using it in `lexer.Pattern._get_flags`
  370. # However, for prior Python versions, we still need to use global flags, so we have to make sure
  371. # that there are no flag collisions when we merge several terminals.
  372. flags = ()
  373. if not Py36:
  374. if len(flags_set) > 1:
  375. raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!")
  376. elif len(flags_set) == 1:
  377. flags ,= flags_set
  378. return PatternRE(regexp, flags)
  379. class TerminalTreeToPattern(Transformer):
  380. def pattern(self, ps):
  381. p ,= ps
  382. return p
  383. def expansion(self, items):
  384. assert items
  385. if len(items) == 1:
  386. return items[0]
  387. pattern = ''.join(i.to_regexp() for i in items)
  388. return _make_joined_pattern(pattern, {i.flags for i in items})
  389. def expansions(self, exps):
  390. if len(exps) == 1:
  391. return exps[0]
  392. pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps))
  393. return _make_joined_pattern(pattern, {i.flags for i in exps})
  394. def expr(self, args):
  395. inner, op = args[:2]
  396. if op == '~':
  397. if len(args) == 3:
  398. op = "{%d}" % int(args[2])
  399. else:
  400. mn, mx = map(int, args[2:])
  401. if mx < mn:
  402. raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx))
  403. op = "{%d,%d}" % (mn, mx)
  404. else:
  405. assert len(args) == 2
  406. return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
  407. def maybe(self, expr):
  408. return self.expr(expr + ['?'])
  409. def alias(self, t):
  410. raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
  411. def value(self, v):
  412. return v[0]
  413. class PrepareSymbols(Transformer_InPlace):
  414. def value(self, v):
  415. v ,= v
  416. if isinstance(v, Tree):
  417. return v
  418. elif v.type == 'RULE':
  419. return NonTerminal(Str(v.value))
  420. elif v.type == 'TERMINAL':
  421. return Terminal(Str(v.value), filter_out=v.startswith('_'))
  422. assert False
  423. def _choice_of_rules(rules):
  424. return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
  425. def nr_deepcopy_tree(t):
  426. "Deepcopy tree `t` without recursion"
  427. return Transformer_NonRecursive(False).transform(t)
class Grammar:
    """A parsed grammar: rule/terminal definitions plus the %ignore list.

    `compile(start)` lowers it into TerminalDefs and Rule objects usable by a
    parser frontend.
    """

    def __init__(self, rule_defs, term_defs, ignore):
        self.term_defs = term_defs
        self.rule_defs = rule_defs
        self.ignore = ignore

    def compile(self, start):
        """Compile into (terminals, compiled_rules, ignore_names).

        `start` is an iterable of start-rule names; rules unreachable from
        them are dropped.
        """
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
        term_defs = deepcopy(list(self.term_defs))
        rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs]

        # ===================
        #  Compile Terminals
        # ===================

        # Convert terminal-trees to strings/regexps
        for name, (term_tree, priority) in term_defs:
            if term_tree is None:  # Terminal added through %declare
                continue
            expansions = list(term_tree.find_data('expansion'))
            if len(expansions) == 1 and not expansions[0].children:
                raise GrammarError("Terminals cannot be empty (%s)" % name)

        transformer = PrepareLiterals() * TerminalTreeToPattern()
        terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
                     for name, (term_tree, priority) in term_defs if term_tree]

        # =================
        #  Compile Rules
        # =================

        # 1. Pre-process terminals
        anon_tokens_transf = PrepareAnonTerminals(terminals)
        transformer = PrepareLiterals() * PrepareSymbols() * anon_tokens_transf  # Adds to terminals

        # 2. Inline Templates
        transformer *= ApplyTemplates(rule_defs)

        # 3. Convert EBNF to BNF (and apply step 1 & 2)
        ebnf_to_bnf = EBNF_to_BNF()
        rules = []
        i = 0
        while i < len(rule_defs):  # We have to do it like this because rule_defs might grow due to templates
            name, params, rule_tree, options = rule_defs[i]
            i += 1
            if len(params) != 0:  # Dont transform templates
                continue
            rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
            ebnf_to_bnf.rule_options = rule_options
            ebnf_to_bnf.prefix = name
            anon_tokens_transf.rule_options = rule_options
            tree = transformer.transform(rule_tree)
            res = ebnf_to_bnf.transform(tree)
            rules.append((name, res, options))
        rules += ebnf_to_bnf.new_rules

        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

        # 4. Compile tree to Rule objects
        rule_tree_to_text = RuleTreeToText()

        simplify_rule = SimplifyRule_Visitor()
        compiled_rules = []
        for rule_content in rules:
            name, tree, options = rule_content
            simplify_rule.visit(tree)
            expansions = rule_tree_to_text.transform(tree)

            for i, (expansion, alias) in enumerate(expansions):
                if alias and name.startswith('_'):
                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

                # Record positions of the _EMPTY placeholder (from [x]) so the
                # tree builder can re-insert None there, then drop them.
                empty_indices = [x==_EMPTY for x in expansion]
                if any(empty_indices):
                    exp_options = copy(options) or RuleOptions()
                    exp_options.empty_indices = empty_indices
                    expansion = [x for x in expansion if x!=_EMPTY]
                else:
                    exp_options = options

                assert all(isinstance(x, Symbol) for x in expansion), expansion
                rule = Rule(NonTerminal(name), expansion, i, alias, exp_options)
                compiled_rules.append(rule)

        # Remove duplicates of empty rules, throw error for non-empty duplicates
        if len(set(compiled_rules)) != len(compiled_rules):
            duplicates = classify(compiled_rules, lambda x: x)
            for dups in duplicates.values():
                if len(dups) > 1:
                    if dups[0].expansion:
                        raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)"
                                           % ''.join('\n * %s' % i for i in dups))

                    # Empty rule; assert all other attributes are equal
                    assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)

            # Remove duplicates
            compiled_rules = list(set(compiled_rules))

        # Filter out unused rules, iterating to a fixpoint (removing a rule
        # may make others unreachable).
        while True:
            c = len(compiled_rules)
            used_rules = {s for r in compiled_rules
                          for s in r.expansion
                          if isinstance(s, NonTerminal)
                          and s != r.origin}
            used_rules |= {NonTerminal(s) for s in start}
            compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules)
            for r in unused:
                logger.debug("Unused rule: %s", r)
            if len(compiled_rules) == c:
                break

        # Filter out unused terminals
        used_terms = {t.name for r in compiled_rules
                      for t in r.expansion
                      if isinstance(t, Terminal)}
        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore)
        if unused:
            logger.debug("Unused terminals: %s", [t.name for t in unused])

        return terminals, compiled_rules, self.ignore
  531. def stdlib_loader(base_paths, grammar_path):
  532. import pkgutil
  533. for path in IMPORT_PATHS:
  534. text = pkgutil.get_data('lark', path + '/' + grammar_path)
  535. if text is None:
  536. continue
  537. return '<stdlib:' + grammar_path + '>', text.decode()
  538. raise FileNotFoundError()
  539. _imported_grammars = {}
  540. def import_grammar(grammar_path, re_, base_paths=(), import_sources=()):
  541. if grammar_path not in _imported_grammars:
  542. import_paths = import_sources + base_paths + [stdlib_loader]
  543. for source in import_paths:
  544. if isinstance(source, str):
  545. with suppress(IOError):
  546. joined_path = os.path.join(source, grammar_path)
  547. with open(joined_path, encoding='utf8') as f:
  548. text = f.read()
  549. grammar = load_grammar(text, joined_path, re_, import_sources)
  550. _imported_grammars[grammar_path] = grammar
  551. break
  552. else:
  553. with suppress(IOError):
  554. joined_path, text = source(base_paths, grammar_path)
  555. grammar = load_grammar(text, joined_path, re_, import_sources)
  556. _imported_grammars[grammar_path] = grammar
  557. break
  558. else:
  559. open(grammar_path, encoding='utf8')
  560. assert False
  561. return _imported_grammars[grammar_path]
def import_from_grammar_into_namespace(grammar, namespace, aliases):
    """Returns all rules and terminals of grammar, prepended
    with a 'namespace' prefix, except for those which are aliased.
    """
    imported_terms = dict(grammar.term_defs)
    # Trees are deep-copied so renaming below cannot affect the source grammar.
    imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs}

    term_defs = []
    rule_defs = []

    def rule_dependencies(symbol):
        # BFS successor function: symbols referenced by `symbol`'s rule body,
        # excluding its own template parameters. Terminals have no deps.
        if symbol.type != 'RULE':
            return []
        try:
            _, params, tree, _ = imported_rules[symbol]
        except KeyError:
            raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
        return _find_used_symbols(tree) - set(params)

    def get_namespace_name(name, params):
        # Precedence: template parameter > user alias > namespaced name.
        # A leading underscore must stay leading to preserve inline/filter
        # semantics, so it is hoisted in front of the namespace prefix.
        if params is not None:
            try:
                return params[name]
            except KeyError:
                pass
        try:
            return aliases[name].value
        except KeyError:
            if name[0] == '_':
                return '_%s__%s' % (namespace, name[1:])
            return '%s__%s' % (namespace, name)

    # Import the aliased symbols plus everything they (transitively) use.
    to_import = list(bfs(aliases, rule_dependencies))
    for symbol in to_import:
        if symbol.type == 'TERMINAL':
            term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]])
        else:
            assert symbol.type == 'RULE'
            _, params, tree, options = imported_rules[symbol]
            params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s' ) % (namespace, p) for p in params}
            for t in tree.iter_subtrees():
                for i, c in enumerate(t.children):
                    if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
                        t.children[i] = Token(c.type, get_namespace_name(c, params_map))
            params = [params_map[p] for p in params]  # We can not rely on ordered dictionaries
            rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options))

    return term_defs, rule_defs
def resolve_term_references(term_defs):
    """Inline terminal-to-terminal references in-place, then reject recursion.

    Runs substitution passes until a fixpoint is reached, so chains of
    references (A -> B -> C) resolve fully.
    """
    # TODO Solve with transitive closure (maybe)

    term_dict = {k:t for k, (t,_p) in term_defs}
    assert len(term_dict) == len(term_defs), "Same name defined twice?"

    while True:
        changed = False
        for name, (token_tree, _p) in term_defs:
            if token_tree is None:  # Terminal added through %declare
                continue
            for exp in token_tree.find_data('value'):
                item ,= exp.children
                if isinstance(item, Token):
                    if item.type == 'RULE':
                        raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
                    if item.type == 'TERMINAL':
                        # Splice the referenced terminal's tree into this one.
                        term_value = term_dict[item]
                        assert term_value is not None
                        exp.children[0] = term_value
                        changed = True
        if not changed:
            break

    # After substitution, a terminal whose own tree appears inside itself
    # (compared by object identity) is recursive — not allowed for terminals.
    for name, term in term_dict.items():
        if term:  # Not just declared
            for child in term.children:
                ids = [id(x) for x in child.iter_subtrees()]
                if id(term) in ids:
                    raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)
  632. def options_from_rule(name, params, *x):
  633. if len(x) > 1:
  634. priority, expansions = x
  635. priority = int(priority)
  636. else:
  637. expansions ,= x
  638. priority = None
  639. params = [t.value for t in params.children] if params is not None else [] # For the grammar parser
  640. keep_all_tokens = name.startswith('!')
  641. name = name.lstrip('!')
  642. expand1 = name.startswith('?')
  643. name = name.lstrip('?')
  644. return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority,
  645. template_source=(name if params else None))
  646. def symbols_from_strcase(expansion):
  647. return [Terminal(x, filter_out=x.startswith('_')) if x.isupper() else NonTerminal(x) for x in expansion]
  648. @inline_args
  649. class PrepareGrammar(Transformer_InPlace):
  650. def terminal(self, name):
  651. return name
  652. def nonterminal(self, name):
  653. return name
  654. def _find_used_symbols(tree):
  655. assert tree.data == 'expansions'
  656. return {t for x in tree.find_data('expansion')
  657. for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
class GrammarLoader:
    """Parses .lark grammar text into a Grammar object, using a bootstrap
    LALR parser built from the TERMINALS/RULES meta-grammar above."""

    # (message, [example inputs]) pairs used to match parse errors against
    # known mistakes, to produce friendlier error messages.
    ERRORS = [
        ('Unclosed parenthesis', ['a: (\n']),
        ('Umatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
        ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
        ('Illegal name for rules or terminals', ['Aa:\n']),
        ('Alias expects lowercase name', ['a: -> "a"\n']),
        ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']),
        ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']),
        ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']),
        ('Terminal names cannot contain dots', ['A.B\n']),
        ('%import expects a name', ['%import "a"\n']),
        ('%ignore expects a value', ['%ignore %import\n']),
    ]

    def __init__(self, re_module):
        # Build the bootstrap parser for the .lark format itself.
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

        rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
        callback = ParseTreeBuilder(rules, ST).create_callback()
        lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, ['start'])
        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

        self.canonize_tree = CanonizeTree()
        self.re_module = re_module

    # NOTE(review): `import_sources=[]` is a mutable default; it appears to be
    # only read/passed through here, but consider None + fallback.
    def load_grammar(self, grammar_text, grammar_name='<?>', import_sources=[]):
        "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."

        try:
            tree = self.canonize_tree.transform(self.parser.parse(grammar_text+'\n'))
        except UnexpectedCharacters as e:
            context = e.get_context(grammar_text)
            raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
                               (e.line, e.column, grammar_name, context))
        except UnexpectedToken as e:
            context = e.get_context(grammar_text)
            # Match the failure against known error examples for a nicer message.
            error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True)
            if error:
                raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
            elif 'STRING' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
            raise

        tree = PrepareGrammar().transform(tree)

        # Extract grammar items
        defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
        term_defs = defs.pop('term', [])
        rule_defs = defs.pop('rule', [])
        statements = defs.pop('statement', [])
        assert not defs

        # Normalize terminal defs to (name, (tree, priority)); default priority is 1.
        term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
        term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
        rule_defs = [options_from_rule(*x) for x in rule_defs]

        # Execute statements
        ignore, imports = [], {}
        for (stmt,) in statements:
            if stmt.data == 'ignore':
                t ,= stmt.children
                ignore.append(t)
            elif stmt.data == 'import':
                if len(stmt.children) > 1:
                    path_node, arg1 = stmt.children
                else:
                    path_node ,= stmt.children
                    arg1 = None
                if isinstance(arg1, Tree):  # Multi import
                    dotted_path = tuple(path_node.children)
                    names = arg1.children
                    aliases = dict(zip(names, names))  # Can't have aliased multi import, so all aliases will be the same as names
                else:  # Single import
                    dotted_path = tuple(path_node.children[:-1])
                    name = path_node.children[-1]  # Get name from dotted path
                    aliases = {name: arg1 or name}  # Aliases if exist
                if path_node.data == 'import_lib':  # Import from library
                    base_paths = []
                else:  # Relative import
                    if grammar_name == '<string>':  # Import relative to script file path if grammar is coded in script
                        try:
                            base_file = os.path.abspath(sys.modules['__main__'].__file__)
                        except AttributeError:
                            base_file = None
                    else:
                        base_file = grammar_name  # Import relative to grammar file path if external grammar file
                    if base_file:
                        base_paths = [os.path.split(base_file)[0]]
                    else:
                        base_paths = [os.path.abspath(os.path.curdir)]

                # Merge repeated %imports of the same grammar path.
                try:
                    import_base_paths, import_aliases = imports[dotted_path]
                    assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
                    import_aliases.update(aliases)
                except KeyError:
                    imports[dotted_path] = base_paths, aliases
            elif stmt.data == 'declare':
                for t in stmt.children:
                    term_defs.append([t.value, (None, None)])
            else:
                assert False, stmt

        # import grammars
        for dotted_path, (base_paths, aliases) in imports.items():
            grammar_path = os.path.join(*dotted_path) + EXT
            g = import_grammar(grammar_path, self.re_module, base_paths=base_paths, import_sources=import_sources)
            new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

            term_defs += new_td
            rule_defs += new_rd

        # Verify correctness 1
        for name, _ in term_defs:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

        # Handle ignore tokens
        # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
        #     inability to handle duplicate terminals (two names, one value)
        ignore_names = []
        for t in ignore:
            # A bare `%ignore TERMINAL` reuses the terminal by name; anything
            # more complex becomes a synthetic __IGNORE_n terminal.
            if t.data=='expansions' and len(t.children) == 1:
                t2 ,= t.children
                if t2.data=='expansion' and len(t2.children) == 1:
                    item ,= t2.children
                    if item.data == 'value':
                        item ,= item.children
                        if isinstance(item, Token) and item.type == 'TERMINAL':
                            ignore_names.append(item.value)
                            continue

            name = '__IGNORE_%d'% len(ignore_names)
            ignore_names.append(name)
            term_defs.append((name, (t, 1)))

        # Verify correctness 2
        terminal_names = set()
        for name, _ in term_defs:
            if name in terminal_names:
                raise GrammarError("Terminal '%s' defined more than once" % name)
            terminal_names.add(name)

        if set(ignore_names) > terminal_names:
            raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

        resolve_term_references(term_defs)

        rules = rule_defs

        # Map rule name -> number of template parameters; also catches
        # reserved/duplicate rule names.
        rule_names = {}
        for name, params, _x, _o in rules:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
            if name in rule_names:
                raise GrammarError("Rule '%s' defined more than once" % name)
            rule_names[name] = len(params)

        # Validate template parameters, template usages, and symbol references.
        for name, params, expansions, _o in rules:
            for i, p in enumerate(params):
                if p in rule_names:
                    raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
                if p in params[:i]:
                    raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))
            for temp in expansions.find_data('template_usage'):
                sym = temp.children[0]
                args = temp.children[1:]
                if sym not in params:
                    if sym not in rule_names:
                        raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name))
                    if len(args) != rule_names[sym]:
                        raise GrammarError("Wrong number of template arguments used for %s "
                                           "(expected %s, got %s) (in rule %s)"%(sym, rule_names[sym], len(args), name))
            for sym in _find_used_symbols(expansions):
                if sym.type == 'TERMINAL':
                    if sym not in terminal_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                else:
                    if sym not in rule_names and sym not in params:
                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

        return Grammar(rules, term_defs, ignore_names)
  821. def load_grammar(grammar, source, re_, import_sources):
  822. return GrammarLoader(re_).load_grammar(grammar, source, import_sources)