This repository contains the code used to mirror other repositories, as well as the code that is being mirrored.
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

197 rader
5.0 KiB

  1. # coding=utf-8
  2. import json
  3. import sys
  4. import unittest
  5. from itertools import product
  6. from unittest import TestCase
  7. from lark import Lark
  8. from lark.reconstruct import Reconstructor
# Shared grammar fragment appended to every test grammar: imports the
# common terminals used throughout (inline whitespace, numbers, words)
# and ignores inline whitespace during lexing.
common = """
%import common (WS_INLINE, NUMBER, WORD)
%ignore WS_INLINE
"""
  13. def _remove_ws(s):
  14. return s.replace(' ', '').replace('\n', '')
  15. class TestReconstructor(TestCase):
  16. def assert_reconstruct(self, grammar, code, **options):
  17. parser = Lark(grammar, parser='lalr', maybe_placeholders=False, **options)
  18. tree = parser.parse(code)
  19. new = Reconstructor(parser).reconstruct(tree)
  20. self.assertEqual(_remove_ws(code), _remove_ws(new))
  21. def test_starred_rule(self):
  22. g = """
  23. start: item*
  24. item: NL
  25. | rule
  26. rule: WORD ":" NUMBER
  27. NL: /(\\r?\\n)+\\s*/
  28. """ + common
  29. code = """
  30. Elephants: 12
  31. """
  32. self.assert_reconstruct(g, code)
  33. def test_starred_group(self):
  34. g = """
  35. start: (rule | NL)*
  36. rule: WORD ":" NUMBER
  37. NL: /(\\r?\\n)+\\s*/
  38. """ + common
  39. code = """
  40. Elephants: 12
  41. """
  42. self.assert_reconstruct(g, code)
  43. def test_alias(self):
  44. g = """
  45. start: line*
  46. line: NL
  47. | rule
  48. | "hello" -> hi
  49. rule: WORD ":" NUMBER
  50. NL: /(\\r?\\n)+\\s*/
  51. """ + common
  52. code = """
  53. Elephants: 12
  54. hello
  55. """
  56. self.assert_reconstruct(g, code)
  57. def test_keep_tokens(self):
  58. g = """
  59. start: (NL | stmt)*
  60. stmt: var op var
  61. !op: ("+" | "-" | "*" | "/")
  62. var: WORD
  63. NL: /(\\r?\\n)+\s*/
  64. """ + common
  65. code = """
  66. a+b
  67. """
  68. self.assert_reconstruct(g, code)
  69. def test_expand_rule(self):
  70. g = """
  71. ?start: (NL | mult_stmt)*
  72. ?mult_stmt: sum_stmt ["*" sum_stmt]
  73. ?sum_stmt: var ["+" var]
  74. var: WORD
  75. NL: /(\\r?\\n)+\s*/
  76. """ + common
  77. code = ['a', 'a*b', 'a+b', 'a*b+c', 'a+b*c', 'a+b*c+d']
  78. for c in code:
  79. self.assert_reconstruct(g, c)
  80. def test_json_example(self):
  81. test_json = '''
  82. {
  83. "empty_object" : {},
  84. "empty_array" : [],
  85. "booleans" : { "YES" : true, "NO" : false },
  86. "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
  87. "strings" : [ "This", [ "And" , "That", "And a \\"b" ] ],
  88. "nothing" : null
  89. }
  90. '''
  91. json_grammar = r"""
  92. ?start: value
  93. ?value: object
  94. | array
  95. | string
  96. | SIGNED_NUMBER -> number
  97. | "true" -> true
  98. | "false" -> false
  99. | "null" -> null
  100. array : "[" [value ("," value)*] "]"
  101. object : "{" [pair ("," pair)*] "}"
  102. pair : string ":" value
  103. string : ESCAPED_STRING
  104. %import common.ESCAPED_STRING
  105. %import common.SIGNED_NUMBER
  106. %import common.WS
  107. %ignore WS
  108. """
  109. json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
  110. tree = json_parser.parse(test_json)
  111. new_json = Reconstructor(json_parser).reconstruct(tree)
  112. self.assertEqual(json.loads(new_json), json.loads(test_json))
  113. def test_keep_all_tokens(self):
  114. g = """
  115. start: "a"? _B? c? _d?
  116. _B: "b"
  117. c: "c"
  118. _d: "d"
  119. """
  120. examples = list(map(''.join, product(('', 'a'), ('', 'b'), ('', 'c'), ('', 'd'), )))
  121. for code in examples:
  122. self.assert_reconstruct(g, code, keep_all_tokens=True)
  123. @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
  124. def test_switch_grammar_unicode_terminal(self):
  125. """
  126. This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
  127. with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
  128. keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
  129. tokens (e.g., `+=`) to mis-match between the two grammars.
  130. """
  131. g1 = """
  132. start: (NL | stmt)*
  133. stmt: "keyword" var op var
  134. !op: ("+=" | "-=" | "*=" | "/=")
  135. var: WORD
  136. NL: /(\\r?\\n)+\s*/
  137. """ + common
  138. g2 = """
  139. start: (NL | stmt)*
  140. stmt: "குறிப்பு" var op var
  141. !op: ("+=" | "-=" | "*=" | "/=")
  142. var: WORD
  143. NL: /(\\r?\\n)+\s*/
  144. """ + common
  145. code = """
  146. keyword x += y
  147. """
  148. l1 = Lark(g1, parser='lalr')
  149. l2 = Lark(g2, parser='lalr')
  150. r = Reconstructor(l2)
  151. tree = l1.parse(code)
  152. code2 = r.reconstruct(tree)
  153. assert l2.parse(code2) == tree
# Allow running this test module directly: `python test_reconstructor.py`.
if __name__ == '__main__':
    unittest.main()