This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

191 lines
5.4 KiB

  1. "Converts between Lark and Nearley grammars. Work in progress!"
  2. import os.path
  3. import sys
  4. import codecs
  5. from lark import Lark, InlineTransformer
# Lark grammar for the subset of Nearley syntax this module reads: rule
# definitions, "@" directives, inline JS blocks delimited by {% ... %},
# macro definitions (parsed under the alias `macro`), and "#" comments,
# which are ignored together with whitespace.
nearley_grammar = r"""
start: (ruledef|directive)+
directive: "@" NAME (STRING|NAME)
| "@" JS -> js_code
ruledef: NAME "->" expansions
| NAME REGEXP "->" expansions -> macro
expansions: expansion ("|" expansion)*
expansion: expr+ js
?expr: item [":" /[+*?]/]
?item: rule|string|regexp|null
| "(" expansions ")"
rule: NAME
string: STRING
regexp: REGEXP
null: "null"
JS: /{%.*?%}/s
js: JS?
NAME: /[a-zA-Z_$]\w*/
COMMENT: /#[^\n]*/
REGEXP: /\[.*?\]/
%import common.ESCAPED_STRING -> STRING
%import common.WS
%ignore WS
%ignore COMMENT
"""
# Module-level parser for Nearley source text, created when the module is loaded.
nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')
  32. def _get_rulename(name):
  33. name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
  34. return 'n_' + name.replace('$', '__DOLLAR__').lower()
  35. class NearleyToLark(InlineTransformer):
  36. def __init__(self):
  37. self._count = 0
  38. self.extra_rules = {}
  39. self.extra_rules_rev = {}
  40. self.alias_js_code = {}
  41. def _new_function(self, code):
  42. name = 'alias_%d' % self._count
  43. self._count += 1
  44. self.alias_js_code[name] = code
  45. return name
  46. def _extra_rule(self, rule):
  47. if rule in self.extra_rules_rev:
  48. return self.extra_rules_rev[rule]
  49. name = 'xrule_%d' % len(self.extra_rules)
  50. assert name not in self.extra_rules
  51. self.extra_rules[name] = rule
  52. self.extra_rules_rev[rule] = name
  53. return name
  54. def rule(self, name):
  55. return _get_rulename(name)
  56. def ruledef(self, name, exps):
  57. return '!%s: %s' % (_get_rulename(name), exps)
  58. def expr(self, item, op):
  59. rule = '(%s)%s' % (item, op)
  60. return self._extra_rule(rule)
  61. def regexp(self, r):
  62. return '/%s/' % r
  63. def null(self):
  64. return ''
  65. def string(self, s):
  66. return self._extra_rule(s)
  67. def expansion(self, *x):
  68. x, js = x[:-1], x[-1]
  69. if js.children:
  70. js_code ,= js.children
  71. js_code = js_code[2:-2]
  72. alias = '-> ' + self._new_function(js_code)
  73. else:
  74. alias = ''
  75. return ' '.join(x) + alias
  76. def expansions(self, *x):
  77. return '%s' % ('\n |'.join(x))
  78. def start(self, *rules):
  79. return '\n'.join(filter(None, rules))
  80. def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
  81. rule_defs = []
  82. tree = nearley_grammar_parser.parse(g)
  83. for statement in tree.children:
  84. if statement.data == 'directive':
  85. directive, arg = statement.children
  86. if directive in ('builtin', 'include'):
  87. folder = builtin_path if directive == 'builtin' else folder_path
  88. path = os.path.join(folder, arg[1:-1])
  89. if path not in includes:
  90. includes.add(path)
  91. with codecs.open(path, encoding='utf8') as f:
  92. text = f.read()
  93. rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
  94. else:
  95. assert False, directive
  96. elif statement.data == 'js_code':
  97. code ,= statement.children
  98. code = code[2:-2]
  99. js_code.append(code)
  100. elif statement.data == 'macro':
  101. pass # TODO Add support for macros!
  102. elif statement.data == 'ruledef':
  103. rule_defs.append( n2l.transform(statement) )
  104. else:
  105. raise Exception("Unknown statement: %s" % statement)
  106. return rule_defs
  107. def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
  108. import js2py
  109. emit_code = []
  110. def emit(x=None):
  111. if x:
  112. emit_code.append(x)
  113. emit_code.append('\n')
  114. js_code = ['function id(x) {return x[0];}']
  115. n2l = NearleyToLark()
  116. rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
  117. lark_g = '\n'.join(rule_defs)
  118. lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())
  119. emit('from lark import Lark, Transformer')
  120. emit()
  121. emit('grammar = ' + repr(lark_g))
  122. emit()
  123. for alias, code in n2l.alias_js_code.items():
  124. js_code.append('%s = (%s);' % (alias, code))
  125. emit(js2py.translate_js('\n'.join(js_code)))
  126. emit('class TransformNearley(Transformer):')
  127. for alias in n2l.alias_js_code:
  128. emit(" %s = var.get('%s').to_python()" % (alias, alias))
  129. emit(" __default__ = lambda self, n, c, m: c if c else None")
  130. emit()
  131. emit('parser = Lark(grammar, start="n_%s")' % start)
  132. emit('def parse(text):')
  133. emit(' return TransformNearley().transform(parser.parse(text))')
  134. return ''.join(emit_code)
  135. def main(fn, start, nearley_lib):
  136. with codecs.open(fn, encoding='utf8') as f:
  137. grammar = f.read()
  138. return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)))
  139. if __name__ == '__main__':
  140. if len(sys.argv) < 4:
  141. print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.")
  142. print("Usage: %s <nearley_grammar_path> <start_rule> <nearley_lib_path>" % sys.argv[0])
  143. sys.exit(1)
  144. fn, start, nearley_lib = sys.argv[1:]
  145. print(main(fn, start, nearley_lib))