# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x; Python 3.x only has io.StringIO, imported below
    cStringIO = None
from io import (
    StringIO as uStringIO,
    open,
)

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput
from lark.tree import Tree
from lark.visitors import Transformer
from lark.parsers.earley import ApplyCallbacks

__path__ = os.path.dirname(__file__)

def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
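
# The tests below fall into two layers: TestParsers exercises behavior with an
# explicitly chosen parser, while the _make_full_earley_test and
# _make_parser_test factories further down generate one TestCase subclass per
# supported (lexer, parser) combination.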

class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """
        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        # l = Lark(g, parser='earley', lexer='dynamic')
        # self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )

    def test_alias(self):
        Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
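
# _make_full_earley_test builds a set of Earley-specific regression tests for a
# given lexer. The generated TestCase is renamed and published in globals() so
# that unittest's discovery picks it up (e.g. TestFullEarleyDynamic).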
def _make_full_earley_test(LEXER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser='earley', propagate_positions=True, **kwargs)

    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                     """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
                start: statement+

                statement: "r"
                         | "c" /[a-z]/+

                %ignore " "
                """

            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_earley3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
            grammar = """
                start: A A
                A: "a"+
                """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
                start: A A?
                A: "a"+
                """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
                !start: "a" empty empty "b"
                empty: empty2
                empty2:
                """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
                start: a b | ab
                a: "a"
                b: "b"
                ab: "ab"
                """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            ambig_tree = parser.parse('ab')
            # print(ambig_tree.pretty())
            self.assertEqual( ambig_tree.data, '_ambig')
            self.assertEqual( len(ambig_tree.children), 2)

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity1(self):
            grammar = """
                start: cd+ "e"

                !cd: "c"
                   | "d"
                   | "cd"
                """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            ambig_tree = l.parse('cde')
            # print(ambig_tree.pretty())
            # tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree)

            assert ambig_tree.data == '_ambig', ambig_tree
            assert len(ambig_tree.children) == 2

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity2(self):
            grammar = """
                ANY: /[a-zA-Z0-9 ]+/
                a.2: "A" b+
                b.2: "B"
                c: ANY

                start: (a|c)*
                """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse('ABX')
            expected = Tree('start', [
                    Tree('a', [
                        Tree('b', [])
                    ]),
                    Tree('c', [
                        'X'
                    ])
                ])
            self.assertEqual(res, expected)

        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative

                noun: adj? NOUN
                verb: VERB
                adj: ADJ

                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"

                %import common.WS
                %ignore WS
                """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            tree = parser.parse('fruit flies like bananas')

            # tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree)

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()

            # self.assertEqual(tree, expected)
            self.assertEqual(tree.data, expected.data)
            self.assertEqual(set(tree.children), set(expected.children))

        @unittest.skipIf(LEXER!='dynamic_complete', "Only relevant for the dynamic_complete parser")
        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = _Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            print(tree.pretty())
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a' ,'t')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
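
# _make_parser_test builds the main test suite for a single (lexer, parser)
# combination; the _TO_TEST list at the bottom of the file enumerates the
# combinations that actually get instantiated.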
def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit()*2 ))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())
        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          #list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))
        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                !start: "starts"

                %import common.LCASE_LETTER
                """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')
        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """)
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])

            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"

                %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                 """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)
        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
                term: A
                    | term term

                A: "a"
                """, start='term')

            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)
        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])

        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        def test_relative_import(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.test.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 lions')
            self.assertEqual(x.children, ['12', 'lions'])

        def test_multi_import(self):
            grammar = """
            start: NUMBER WORD

            %import common (NUMBER, WORD, WS)
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 toucans')
            self.assertEqual(x.children, ['12', 'toucans'])

        def test_relative_multi_import(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.test (NUMBER, WORD, WS)
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 capybaras')
            self.assertEqual(x.children, ['12', 'capybaras'])

        def test_import_errors(self):
            grammar = """
            start: NUMBER WORD

            %import .grammars.bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

            grammar = """
            start: NUMBER WORD

            %import bad_test.NUMBER
            """
            self.assertRaises(IOError, _Lark, grammar)

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """

            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """
            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")    # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
            """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')

        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                """
            if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

        def test_line_counting(self):
            p = _Lark("start: /[^x]+/")

            text = 'hello\nworld'
            t = p.parse(text)
            tok = t.children[0]
            self.assertEqual(tok, text)
            self.assertEqual(tok.line, 1)
            self.assertEqual(tok.column, 1)
            if LEXER != 'dynamic':
                self.assertEqual(tok.end_line, 2)
                self.assertEqual(tok.end_column, 6)
        @unittest.skipIf(PARSER=='cyk', "Empty rules")
        def test_empty_end(self):
            p = _Lark("""
                start: b c d
                b: "B"
                c: | "C"
                d: | "D"
            """)
            res = p.parse('B')
            self.assertEqual(len(res.children), 3)

        @unittest.skipIf(PARSER=='cyk', "Empty rules")
        def test_maybe_placeholders(self):
            # Anonymous tokens shouldn't count
            p = _Lark("""start: "a"? "b"? "c"? """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [])

            # Anonymous tokens shouldn't count, other constructs should
            p = _Lark("""start: A? "b"? _c?
                         A: "a"
                         _c: "c" """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None])

            p = _Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True)
            self.assertEqual(p.parse("").children, [None, None, None])
            self.assertEqual(p.parse("a").children, ['a', None, None])
            self.assertEqual(p.parse("b").children, [None, 'b', None])
            self.assertEqual(p.parse("c").children, [None, None, 'c'])
            self.assertEqual(p.parse("ab").children, ['a', 'b', None])
            self.assertEqual(p.parse("ac").children, ['a', None, 'c'])
            self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
            self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])

            p = _Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True)
            self.assertEqual(p.parse("b").children, [None, 'b', None])
            self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
            self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
            self.assertEqual(p.parse("babbcabcb").children,
                [None, 'b', None,
                 'a', 'b', None,
                 None, 'b', 'c',
                 'a', 'b', 'c',
                 None, 'b', None])

            p = _Lark("""!start: "a"? "c"? "b"+ "a"? "d"? """, maybe_placeholders=True)
            self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None])
            self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd'])
            self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None])
            self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None])

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('dynamic_complete', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        # (None, 'earley'),
]
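
# Instantiating the factories registers classes such as TestEarleyStandard,
# TestLalrContextual and TestFullEarleyDynamic in this module's namespace,
# where unittest.main() will find them.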
for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic', 'dynamic_complete'):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()