  1. "Parses and creates Grammar objects"
  2. import os.path
  3. import sys
  4. from ast import literal_eval
  5. from copy import copy, deepcopy
  6. from io import open
  7. from .utils import bfs
  8. from .lexer import Token, TerminalDef, PatternStr, PatternRE
  9. from .parse_tree_builder import ParseTreeBuilder
  10. from .parser_frontends import LALR_TraditionalLexer
  11. from .common import LexerConf, ParserConf
  12. from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
  13. from .utils import classify, suppress, dedup_list, Str
  14. from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
  15. from .tree import Tree, SlottedTree as ST
  16. from .visitors import Transformer, Visitor, v_args, Transformer_InPlace
  17. inline_args = v_args(inline=True)
  18. __path__ = os.path.dirname(__file__)
  19. IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
  20. EXT = '.lark'
  21. _RE_FLAGS = 'imslux'
_EMPTY = Symbol('__empty__')

_TERMINAL_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
# Grammar Parser
TERMINALS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    'OP': '[+*][?]?|[?](?![a-z])',
    '_COLON': ':',
    '_COMMA': ',',
    '_OR': r'\|',
    '_DOT': r'\.',
    'TILDE': '~',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TERMINAL': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*',
    '_TO': '->',
    '_IGNORE': r'%ignore',
    '_DECLARE': r'%declare',
    '_IMPORT': r'%import',
    'NUMBER': r'[+-]?\d+',
}
RULES = {
    'start': ['_list'],
    '_list': ['_item', '_list _item'],
    '_item': ['rule', 'term', 'statement', '_NL'],

    'rule': ['RULE _COLON expansions _NL',
             'RULE _DOT NUMBER _COLON expansions _NL'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias': ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP',
              'atom TILDE NUMBER',
              'atom TILDE NUMBER _DOT _DOT NUMBER',
              ],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'value'],

    'value': ['terminal',
              'nonterminal',
              'literal',
              'range'],

    'terminal': ['TERMINAL'],
    'nonterminal': ['RULE'],

    '?name': ['RULE', 'TERMINAL'],

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOT _DOT STRING'],

    'term': ['TERMINAL _COLON expansions _NL',
             'TERMINAL _DOT NUMBER _COLON expansions _NL'],
    'statement': ['ignore', 'import', 'declare'],
    'ignore': ['_IGNORE expansions _NL'],
    'declare': ['_DECLARE _declare_args _NL'],
    'import': ['_IMPORT _import_path _NL',
               '_IMPORT _import_path _LPAR name_list _RPAR _NL',
               '_IMPORT _import_path _TO name _NL'],

    '_import_path': ['import_lib', 'import_rel'],
    'import_lib': ['_import_args'],
    'import_rel': ['_DOT _import_args'],
    '_import_args': ['name', '_import_args _DOT name'],

    'name_list': ['_name_list'],
    '_name_list': ['name', '_name_list _COMMA name'],

    '_declare_args': ['name', '_declare_args name'],
    'literal': ['REGEXP', 'STRING'],
}
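# For illustration, a tiny .lark grammar that the meta-grammar above accepts
# (hypothetical example; `start` and `WORD` are arbitrary names):
#
#     start: WORD ("," WORD)*     // a rule
#     WORD: /[a-z]+/              // a terminal
#     %ignore /\s+/               // a statement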
@inline_args
class EBNF_to_BNF(Transformer_InPlace):
    def __init__(self):
        self.new_rules = []
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0
        self.rule_options = None

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = NonTerminal(new_name)
        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
        self.new_rules.append((new_name, tree, self.rule_options))
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op, *args):
        if op.value == '?':
            empty = ST('expansion', [])
            return ST('expansions', [rule, empty])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return ST('expansions', [new_name, ST('expansion', [])])
        elif op.value == '~':
            if len(args) == 1:
                mn = mx = int(args[0])
            else:
                mn, mx = map(int, args)
                if mx < mn or mn < 0:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
        assert False, op

    def maybe(self, rule):
        keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens

        def will_not_get_removed(sym):
            if isinstance(sym, NonTerminal):
                return not sym.name.startswith('_')
            if isinstance(sym, Terminal):
                return keep_all_tokens or not sym.filter_out
            assert False

        if any(rule.scan_values(will_not_get_removed)):
            empty = _EMPTY
        else:
            empty = ST('expansion', [])

        return ST('expansions', [rule, empty])
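# For illustration, expr() rewrites a repeat-range such as `x ~ 2..3` into
# explicit alternatives (hypothetical sketch of the resulting tree):
#
#     expansions(expansion(x, x), expansion(x, x, x))
#
# while `x+` and `x*` go through _add_recurse_rule, which introduces a new
# left-recursive helper rule named __anon_plus_0, __anon_star_1, etc.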
class SimplifyRule_Visitor(Visitor):
    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, Tree) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        self._flatten(tree)

        for i, child in enumerate(tree.children):
            if isinstance(child, Tree) and child.data == 'expansions':
                tree.data = 'expansions'
                tree.children = [self.visit(ST('expansion', [option if i == j else other
                                                             for j, other in enumerate(tree.children)]))
                                 for option in dedup_list(child.children)]
                self._flatten(tree)
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(ST('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    def expansions(self, tree):
        self._flatten(tree)
        tree.children = dedup_list(tree.children)
class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x

    def expansion(self, symbols):
        return symbols, None

    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)  # Double alias not allowed
        return expansion, alias.value


@inline_args
class CanonizeTree(Transformer_InPlace):
    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]
class PrepareAnonTerminals(Transformer_InPlace):
    "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"

    def __init__(self, terminals):
        self.terminals = terminals
        self.term_set = {td.name for td in self.terminals}
        self.term_reverse = {td.pattern: td for td in terminals}
        self.i = 0

    @inline_args
    def pattern(self, p):
        value = p.value
        if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags:
            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

        term_name = None

        if isinstance(p, PatternStr):
            try:
                # If already defined, use the user-defined terminal name
                term_name = self.term_reverse[p].name
            except KeyError:
                # Try to assign an indicative anon-terminal name
                try:
                    term_name = _TERMINAL_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
                        with suppress(UnicodeEncodeError):
                            value.upper().encode('ascii')  # Make sure we don't have unicode in our terminal names
                            term_name = value.upper()

                if term_name in self.term_set:
                    term_name = None

        elif isinstance(p, PatternRE):
            if p in self.term_reverse:  # Kind of a weird placement
                term_name = self.term_reverse[p].name
        else:
            assert False, p

        if term_name is None:
            term_name = '__ANON_%d' % self.i
            self.i += 1

        if term_name not in self.term_set:
            assert p not in self.term_reverse
            self.term_set.add(term_name)
            termdef = TerminalDef(term_name, p)
            self.term_reverse[p] = termdef
            self.terminals.append(termdef)

        return Terminal(term_name, filter_out=isinstance(p, PatternStr))
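# For illustration (hypothetical inline strings): `"("` inside a rule gets
# the name LPAR via _TERMINAL_NAMES, `"return"` would get the indicative
# name RETURN, and a pattern with no usable name falls back to __ANON_0,
# __ANON_1, ...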
def _rfind(s, choices):
    return max(s.rfind(c) for c in choices)


def _fix_escaping(s):
    w = ''
    i = iter(s)
    for n in i:
        w += n
        if n == '\\':
            try:
                n2 = next(i)
            except StopIteration:
                raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)
            if n2 == '\\':
                w += '\\\\'
            elif n2 not in 'uxnftr':
                w += '\\'
            w += n2
    w = w.replace('\\"', '"').replace("'", "\\'")

    to_eval = "u'''%s'''" % w
    try:
        s = literal_eval(to_eval)
    except SyntaxError as e:
        raise ValueError(s, e)

    return s
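# For illustration (hypothetical literals): escape sequences Python knows
# (those starting a character in 'uxnftr', e.g. `\n`) are evaluated by
# literal_eval, while an unknown escape such as `\;` has its backslash
# doubled first so it survives evaluation -- _fix_escaping(r'a\;b') returns
# the four characters a, backslash, semicolon, b.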
def _literal_to_pattern(literal):
    v = literal.value
    flag_start = _rfind(v, '/"')+1
    assert flag_start > 0
    flags = v[flag_start:]
    assert all(f in _RE_FLAGS for f in flags), flags

    v = v[:flag_start]
    assert v[0] == v[-1] and v[0] in '"/'
    x = v[1:-1]

    s = _fix_escaping(x)

    if literal.type == 'STRING':
        s = s.replace('\\\\', '\\')

    return { 'STRING': PatternStr,
             'REGEXP': PatternRE }[literal.type](s, flags)
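# For illustration (hypothetical tokens): a STRING literal `"ab"i` maps to
# PatternStr('ab', 'i') and a REGEXP literal `/a+/m` maps to
# PatternRE('a+', 'm'); everything after the closing quote or slash is
# treated as flags and validated against _RE_FLAGS.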
@inline_args
class PrepareLiterals(Transformer_InPlace):
    def literal(self, literal):
        return ST('pattern', [_literal_to_pattern(literal)])

    def range(self, start, end):
        assert start.type == end.type == 'STRING'
        start = start.value[1:-1]
        end = end.value[1:-1]
        assert len(_fix_escaping(start)) == len(_fix_escaping(end)) == 1, (start, end, len(_fix_escaping(start)), len(_fix_escaping(end)))
        regexp = '[%s-%s]' % (start, end)
        return ST('pattern', [PatternRE(regexp)])
class TerminalTreeToPattern(Transformer):
    def pattern(self, ps):
        p ,= ps
        return p

    def expansion(self, items):
        assert items
        if len(items) == 1:
            return items[0]
        if len({i.flags for i in items}) > 1:
            raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())

    def expansions(self, exps):
        if len(exps) == 1:
            return exps[0]
        if len({i.flags for i in exps}) > 1:
            raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)

    def expr(self, args):
        inner, op = args[:2]
        if op == '~':
            if len(args) == 3:
                op = "{%d}" % int(args[2])
            else:
                mn, mx = map(int, args[2:])
                if mx < mn:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx))
                op = "{%d,%d}" % (mn, mx)
        else:
            assert len(args) == 2
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)

    def maybe(self, expr):
        return self.expr(expr + ['?'])

    def alias(self, t):
        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")

    def value(self, v):
        return v[0]
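# For illustration (hypothetical terminal): `AB: "a" "b"+` compiles through
# this transformer into a single regexp, roughly `a(?:b)+` -- sequences are
# concatenated, alternatives are joined with `|`, and ?/+/*/~ become regexp
# quantifiers.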
class PrepareSymbols(Transformer_InPlace):
    def value(self, v):
        v ,= v
        if isinstance(v, Tree):
            return v
        elif v.type == 'RULE':
            return NonTerminal(Str(v.value))
        elif v.type == 'TERMINAL':
            return Terminal(Str(v.value), filter_out=v.startswith('_'))
        assert False


def _choice_of_rules(rules):
    return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
class Grammar:
    def __init__(self, rule_defs, term_defs, ignore):
        self.term_defs = term_defs
        self.rule_defs = rule_defs
        self.ignore = ignore

    def compile(self, start):
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
        term_defs = deepcopy(list(self.term_defs))
        rule_defs = deepcopy(self.rule_defs)

        # ===================
        #  Compile Terminals
        # ===================

        # Convert terminal-trees to strings/regexps
        for name, (term_tree, priority) in term_defs:
            if term_tree is None:  # Terminal added through %declare
                continue
            expansions = list(term_tree.find_data('expansion'))
            if len(expansions) == 1 and not expansions[0].children:
                raise GrammarError("Terminals cannot be empty (%s)" % name)

        transformer = PrepareLiterals() * TerminalTreeToPattern()
        terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
                     for name, (term_tree, priority) in term_defs if term_tree]

        # =================
        #  Compile Rules
        # =================

        # 1. Pre-process terminals
        transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals)  # Adds to terminals

        # 2. Convert EBNF to BNF (and apply step 1)
        ebnf_to_bnf = EBNF_to_BNF()
        rules = []
        for name, rule_tree, options in rule_defs:
            ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
            tree = transformer.transform(rule_tree)
            res = ebnf_to_bnf.transform(tree)
            rules.append((name, res, options))
        rules += ebnf_to_bnf.new_rules

        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

        # 3. Compile tree to Rule objects
        rule_tree_to_text = RuleTreeToText()

        simplify_rule = SimplifyRule_Visitor()
        compiled_rules = []
        for rule_content in rules:
            name, tree, options = rule_content
            simplify_rule.visit(tree)
            expansions = rule_tree_to_text.transform(tree)

            for i, (expansion, alias) in enumerate(expansions):
                if alias and name.startswith('_'):
                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

                empty_indices = [x == _EMPTY for x in expansion]
                if any(empty_indices):
                    exp_options = copy(options) if options else RuleOptions()
                    exp_options.empty_indices = empty_indices
                    expansion = [x for x in expansion if x != _EMPTY]
                else:
                    exp_options = options

                assert all(isinstance(x, Symbol) for x in expansion), expansion
                rule = Rule(NonTerminal(name), expansion, i, alias, exp_options)
                compiled_rules.append(rule)

        # Remove duplicates of empty rules, throw error for non-empty duplicates
        if len(set(compiled_rules)) != len(compiled_rules):
            duplicates = classify(compiled_rules, lambda x: x)
            for dups in duplicates.values():
                if len(dups) > 1:
                    if dups[0].expansion:
                        raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups))

                    # Empty rule; assert all other attributes are equal
                    assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)

            # Remove duplicates
            compiled_rules = list(set(compiled_rules))

        # Filter out unused rules
        while True:
            c = len(compiled_rules)
            used_rules = {s for r in compiled_rules
                          for s in r.expansion
                          if isinstance(s, NonTerminal)
                          and s != r.origin}
            used_rules |= {NonTerminal(s) for s in start}
            compiled_rules = [r for r in compiled_rules if r.origin in used_rules]
            if len(compiled_rules) == c:
                break

        # Filter out unused terminals
        used_terms = {t.name for r in compiled_rules
                      for t in r.expansion
                      if isinstance(t, Terminal)}
        terminals = [t for t in terminals if t.name in used_terms or t.name in self.ignore]

        return terminals, compiled_rules, self.ignore
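# For illustration (hypothetical call): grammar.compile(['start']) returns a
# (terminals, rules, ignore) triple -- the TerminalDef list, the flattened
# BNF Rule objects, and the names of ignored terminals -- where the argument
# is the list of entry-point rule names.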
_imported_grammars = {}


def import_grammar(grammar_path, base_paths=[]):
    if grammar_path not in _imported_grammars:
        import_paths = base_paths + IMPORT_PATHS
        for import_path in import_paths:
            with suppress(IOError):
                joined_path = os.path.join(import_path, grammar_path)
                with open(joined_path, encoding='utf8') as f:
                    text = f.read()
                grammar = load_grammar(text, joined_path)
                _imported_grammars[grammar_path] = grammar
                break
        else:
            # Not found in any import path: open the bare path to raise an
            # IOError that names the missing grammar file
            open(grammar_path, encoding='utf8')
            assert False

    return _imported_grammars[grammar_path]
def import_from_grammar_into_namespace(grammar, namespace, aliases):
    """Returns all rules and terminals of grammar, prepended
    with a 'namespace' prefix, except for those which are aliased.
    """
    imported_terms = dict(grammar.term_defs)
    imported_rules = {n: (n, deepcopy(t), o) for n, t, o in grammar.rule_defs}

    term_defs = []
    rule_defs = []

    def rule_dependencies(symbol):
        if symbol.type != 'RULE':
            return []
        try:
            _, tree, _ = imported_rules[symbol]
        except KeyError:
            raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
        return _find_used_symbols(tree)

    def get_namespace_name(name):
        try:
            return aliases[name].value
        except KeyError:
            if name[0] == '_':
                return '_%s__%s' % (namespace, name[1:])
            return '%s__%s' % (namespace, name)

    to_import = list(bfs(aliases, rule_dependencies))
    for symbol in to_import:
        if symbol.type == 'TERMINAL':
            term_defs.append([get_namespace_name(symbol), imported_terms[symbol]])
        else:
            assert symbol.type == 'RULE'
            rule = imported_rules[symbol]
            for t in rule[1].iter_subtrees():
                for i, c in enumerate(t.children):
                    if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
                        t.children[i] = Token(c.type, get_namespace_name(c))
            rule_defs.append((get_namespace_name(symbol), rule[1], rule[2]))

    return term_defs, rule_defs
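# For illustration (hypothetical import): `%import .common.NUMBER` pulls in
# NUMBER plus everything it depends on, renaming a dependency rule `x` to
# `common__x` and a filtered-out `_x` to `_common__x`, so the leading
# underscore keeps its meaning after namespacing; only the aliased name
# (NUMBER itself) escapes the prefix.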
def resolve_term_references(term_defs):
    # TODO Solve with transitive closure (maybe)

    term_dict = {k: t for k, (t, _p) in term_defs}
    assert len(term_dict) == len(term_defs), "Same name defined twice?"

    while True:
        changed = False
        for name, (token_tree, _p) in term_defs:
            if token_tree is None:  # Terminal added through %declare
                continue
            for exp in token_tree.find_data('value'):
                item ,= exp.children
                if isinstance(item, Token):
                    if item.type == 'RULE':
                        raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
                    if item.type == 'TERMINAL':
                        term_value = term_dict[item]
                        assert term_value is not None
                        exp.children[0] = term_value
                        changed = True
        if not changed:
            break

    for name, term in term_dict.items():
        if term:  # Not just declared
            for child in term.children:
                ids = [id(x) for x in child.iter_subtrees()]
                if id(term) in ids:
                    raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)
def options_from_rule(name, *x):
    if len(x) > 1:
        priority, expansions = x
        priority = int(priority)
    else:
        expansions ,= x
        priority = None

    keep_all_tokens = name.startswith('!')
    name = name.lstrip('!')
    expand1 = name.startswith('?')
    name = name.lstrip('?')

    return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)
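# For illustration (hypothetical rule headers): `?expr: ...` yields
# ('expr', tree, options) with options.expand1 set, a `!` prefix sets
# keep_all_tokens instead, and the priority form `expr.2: ...` arrives as an
# extra leading argument that becomes options.priority.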
def symbols_from_strcase(expansion):
    return [Terminal(x, filter_out=x.startswith('_')) if x.isupper() else NonTerminal(x) for x in expansion]
@inline_args
class PrepareGrammar(Transformer_InPlace):
    def terminal(self, name):
        return name

    def nonterminal(self, name):
        return name


def _find_used_symbols(tree):
    assert tree.data == 'expansions'
    return {t for x in tree.find_data('expansion')
            for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
class GrammarLoader:
    def __init__(self):
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

        rules = [options_from_rule(name, x) for name, x in RULES.items()]
        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, xs, o in rules for i, x in enumerate(xs)]
        callback = ParseTreeBuilder(rules, ST).create_callback()
        lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, ['start'])
        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

        self.canonize_tree = CanonizeTree()

    def load_grammar(self, grammar_text, grammar_name='<?>'):
        "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
        try:
            tree = self.canonize_tree.transform(self.parser.parse(grammar_text + '\n'))
        except UnexpectedCharacters as e:
            context = e.get_context(grammar_text)
            raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
                               (e.line, e.column, grammar_name, context))
        except UnexpectedToken as e:
            context = e.get_context(grammar_text)
            error = e.match_examples(self.parser.parse, {
                'Unclosed parenthesis': ['a: (\n'],
                'Unmatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
                'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
                'Alias expects lowercase name': ['a: -> "a"\n'],
                'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
                'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
                'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'],
                '%import expects a name': ['%import "a"\n'],
                '%ignore expects a value': ['%ignore %import\n'],
            })
            if error:
                raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
            elif 'STRING' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
            raise
        tree = PrepareGrammar().transform(tree)

        # Extract grammar items
        defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
        term_defs = defs.pop('term', [])
        rule_defs = defs.pop('rule', [])
        statements = defs.pop('statement', [])
        assert not defs

        term_defs = [td if len(td) == 3 else (td[0], 1, td[1]) for td in term_defs]
        term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
        rule_defs = [options_from_rule(*x) for x in rule_defs]

        # Execute statements
        ignore, imports = [], {}
        for (stmt,) in statements:
            if stmt.data == 'ignore':
                t ,= stmt.children
                ignore.append(t)
            elif stmt.data == 'import':
                if len(stmt.children) > 1:
                    path_node, arg1 = stmt.children
                else:
                    path_node, = stmt.children
                    arg1 = None

                if isinstance(arg1, Tree):  # Multi import
                    dotted_path = tuple(path_node.children)
                    names = arg1.children
                    aliases = dict(zip(names, names))  # Can't have aliased multi import, so all aliases will be the same as names
                else:  # Single import
                    dotted_path = tuple(path_node.children[:-1])
                    name = path_node.children[-1]  # Get name from dotted path
                    aliases = {name: arg1 or name}  # Aliases if exist

                if path_node.data == 'import_lib':  # Import from library
                    base_paths = []
                else:  # Relative import
                    if grammar_name == '<string>':  # Import relative to script file path if grammar is coded in script
                        try:
                            base_file = os.path.abspath(sys.modules['__main__'].__file__)
                        except AttributeError:
                            base_file = None
                    else:
                        base_file = grammar_name  # Import relative to grammar file path if external grammar file
                    if base_file:
                        base_paths = [os.path.split(base_file)[0]]
                    else:
                        base_paths = [os.path.abspath(os.path.curdir)]

                try:
                    import_base_paths, import_aliases = imports[dotted_path]
                    assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
                    import_aliases.update(aliases)
                except KeyError:
                    imports[dotted_path] = base_paths, aliases

            elif stmt.data == 'declare':
                for t in stmt.children:
                    term_defs.append([t.value, (None, None)])
            else:
                assert False, stmt

        # Import grammars
        for dotted_path, (base_paths, aliases) in imports.items():
            grammar_path = os.path.join(*dotted_path) + EXT
            g = import_grammar(grammar_path, base_paths=base_paths)
            new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

            term_defs += new_td
            rule_defs += new_rd

        # Verify correctness 1
        for name, _ in term_defs:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

        # Handle ignore tokens
        # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
        #     inability to handle duplicate terminals (two names, one value)
        ignore_names = []
        for t in ignore:
            if t.data == 'expansions' and len(t.children) == 1:
                t2 ,= t.children
                if t2.data == 'expansion' and len(t2.children) == 1:
                    item ,= t2.children
                    if item.data == 'value':
                        item ,= item.children
                        if isinstance(item, Token) and item.type == 'TERMINAL':
                            ignore_names.append(item.value)
                            continue

            name = '__IGNORE_%d' % len(ignore_names)
            ignore_names.append(name)
            term_defs.append((name, (t, 1)))

        # Verify correctness 2
        terminal_names = set()
        for name, _ in term_defs:
            if name in terminal_names:
                raise GrammarError("Terminal '%s' defined more than once" % name)
            terminal_names.add(name)

        if set(ignore_names) > terminal_names:
            raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

        resolve_term_references(term_defs)

        rules = rule_defs

        rule_names = set()
        for name, _x, _o in rules:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
            if name in rule_names:
                raise GrammarError("Rule '%s' defined more than once" % name)
            rule_names.add(name)

        for name, expansions, _o in rules:
            for sym in _find_used_symbols(expansions):
                if sym.type == 'TERMINAL':
                    if sym not in terminal_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                else:
                    if sym not in rule_names:
                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

        return Grammar(rules, term_defs, ignore_names)


load_grammar = GrammarLoader().load_grammar
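# A minimal usage sketch (hypothetical; this module is normally driven
# through the Lark frontend rather than called directly):
#
#     grammar = load_grammar('start: "a"+\n', '<string>')
#     terminals, rules, ignore = grammar.compile(['start'])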