# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x; Python 3.x only has io.StringIO, imported below
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.common import GrammarError, ParseError, UnexpectedToken
from lark.lexer import LexError, UnexpectedInput
from lark.tree import Tree
from lark.visitors import Transformer

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()


class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
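
    # The two grammars above differ in parser (explicit 'lalr' vs. the default
    # Earley, per the docstring) and in how NAME is spelled: /\w+/ is a single
    # regexp terminal, while /\w/+ applies the repeat operator to a
    # one-character regexp. The assertion checks that both roads should lead
    # to the same tree.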

    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """

        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        l = Lark(g, parser='earley', lexer='dynamic')
        self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )

    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)
        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)
        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )

    def test_embedded_transformer(self):
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )
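
    # The pairs of cases above rely on Transformer dispatching by rule name
    # (T.a, T.b, T.c handle the 'a', 'b' and 'c' rules). Passing transformer=T()
    # to Lark applies the transformer while parsing (LALR only), and each pair
    # checks that this matches a separate T().transform(tree) pass over the
    # finished parse tree.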


def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                        """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+
            statement: "r"
                     | "c" /[a-z]/+
            %ignore " "
            """
            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        def test_earley3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
            grammar = """
            start: A A
            A: "a"+
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """
            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            res = parser.parse('ab')

            self.assertEqual( res.data, '_ambig')
            self.assertEqual( len(res.children), 2)

        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"
            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            x = l.parse('cde')

            assert x.data == '_ambig', x
            assert len(x.children) == 2

        def test_fruitflies_ambig(self):
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative
                noun: adj? NOUN
                verb: VERB
                adj: ADJ
                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"
                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            res = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()
            self.assertEqual(res, expected)
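
        # With ambiguity='explicit', the Earley parser returns every derivation
        # under a synthetic '_ambig' node instead of picking one, which is why
        # the expected tree above contains both the 'comparative' and 'simple'
        # readings of "fruit flies like bananas".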

        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a', 't')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
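
# To sketch what the factory above does: _make_full_earley_test('dynamic')
# builds the _TestFullEarley class, renames it to TestFullEarleyDynamic, and
# registers it in globals() so that unittest discovery picks it up like any
# hand-written TestCase.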


def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit()*2 ))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          #list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                    !start: "starts"
                    %import common.LCASE_LETTER
                    """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                             | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                             | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))

        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])

        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """
                      )
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """
                      )
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """
                      )
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])

            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"
                   %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                   TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                    !bc: "B\nC"
                """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)
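
        # Positions in Lark are 1-indexed: 'A' starts at line 1, column 1, and
        # 'D' lands on line 2 after the embedded newline in "B\nC". The end_*
        # attributes are only asserted for non-dynamic lexers, presumably
        # because the dynamic lexer doesn't track end positions here.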

        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
                term: A
                    | term term
                A: "a"
            """, start='term')
            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)

        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])
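
        # The TERM.n syntax used above ("A.2", "AB.3") assigns a numeric
        # priority to a terminal; for the standard lexer a higher number wins
        # when several terminals could match, which is what flips the result
        # between ['a', 'b'] and ['ab'] in the two grammars.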

        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"

            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """
            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """

            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")   # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
            """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])

        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')

        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                """
            if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
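
        # Ranged repeat syntax, as exercised above: item~n means exactly n
        # repetitions and item~n..m means between n and m, for rules and
        # terminals alike. A reversed range like "A"~3..2 is expected to be
        # rejected at grammar-construction time with a GrammarError.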

        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser


# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        # (None, 'earley'),
]
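
# Each (lexer, parser) pair in _TO_TEST yields one generated TestCase class;
# for example, ('contextual', 'lalr') becomes TestLalrContextual. The
# commented-out (None, 'earley') combination (presumably scannerless Earley)
# is left out of the matrix.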

for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic',):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()