# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
    StringIO as uStringIO,
    open,
)

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.common import GrammarError, ParseError, UnexpectedToken
from lark.lexer import LexError, UnexpectedInput
from lark.tree import Tree, Transformer

__path__ = os.path.dirname(__file__)

def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
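
# Orientation for readers (a sketch, not part of the suite; the toy grammar
# below is illustrative only): a Lark instance is built from a grammar string
# plus keyword options, and .parse() returns a Tree whose .data is the rule
# name and whose .children are subtrees and tokens.
#
#     parser = Lark(r'''start: "hello" NAME
#                       NAME: /\w+/
#                       %ignore " "
#                    ''')
#     tree = parser.parse('hello world')    # -> Tree('start', [Token('NAME', 'world')])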

class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """
        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        l = Lark(g, parser='earley', lexer='dynamic')
        self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )
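
    # For reference (a sketch of the semantics exercised above): a rule
    # prefixed with '?' is inlined into its parent whenever it has exactly
    # one child, and '-> name' renames the resulting node. E.g. with
    #
    #     start: a
    #     ?a: b        // 'a' disappears from the tree
    #     b: "x"
    #
    # parsing "x" yields Tree('start', [Tree('b', ['x'])]).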

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )
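
    # Transformer mechanics, for readers new to the API (a sketch; the
    # 'number' rule is hypothetical): methods on a Transformer subclass are
    # matched to rule names, and each receives the list of that rule's
    # children. Passing transformer=T() to Lark (LALR only, as above)
    # applies it during parsing instead of as a separate pass.
    #
    #     class ToInt(Transformer):
    #         def number(self, children):
    #             return int(children[0])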

def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                     """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+

            statement: "r"
                     | "c" /[a-z]/+

            %ignore " "
            """

            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        def test_earley3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
            grammar = """
            start: A A
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            res = parser.parse('ab')

            self.assertEqual( res.data, '_ambig')
            self.assertEqual( len(res.children), 2)

        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"

            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            x = l.parse('cde')
            assert x.data == '_ambig', x
            assert len(x.children) == 2

        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative

                noun: adj? NOUN
                verb: VERB
                adj: ADJ

                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"

                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            res = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()
            self.assertEqual(res, expected)

        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a' ,'t')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
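
# Both factory functions in this file use the same pattern (sketched below):
# build a TestCase subclass that closes over its configuration, rename it,
# and inject it into globals() so unittest's loader discovers one suite per
# (lexer, parser) combination. A minimal standalone version of the idea:
#
#     def _make_case(param):
#         class _Case(unittest.TestCase):
#             def test_param(self):
#                 self.assertTrue(param)
#         _Case.__name__ = 'TestCase_' + param
#         globals()[_Case.__name__] = _Case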

def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)
            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit()*2 ))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          #list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                    !start: "starts"

                    %import common.LCASE_LETTER
                    """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                               | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                               | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """

            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])
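
        # Note (a sketch of the two related mechanisms, as exercised here and
        # in the '!'-prefixed rules elsewhere in this file):
        # keep_all_tokens=True keeps anonymous tokens such as "a" in the tree
        # globally, while prefixing a single rule with '!' (e.g. !start: "a"+)
        # does the same for that rule only.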

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """)
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])

            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"

                   %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                 """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)

        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
                term: A
                    | term term

                A: "a"
                """, start='term')

            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)

        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])
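
        # For reference: the '.N' suffix (A.2, AB.3) sets a priority on a
        # terminal, and the standard lexer prefers higher-priority terminals
        # when several could match -- which is exactly what the two halves of
        # the test above flip between.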

        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """

            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """

            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")    # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
            """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')

        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                """
            if PARSER != 'cyk':    # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now")  # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        # (None, 'earley'),
]

for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic',):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()
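
# To run the suite directly, something along these lines should work
# (module path assumed; adjust to where this file lives in your checkout):
#
#     python -m unittest tests.test_parser -v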