# -*- coding: utf-8 -*-
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
    StringIO as uStringIO,
    open,
)

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.common import GrammarError, ParseError, UnexpectedToken
from lark.lexer import LexError, UnexpectedInput
from lark.tree import Tree
from lark.visitors import Transformer, children_args

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
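
# Tests that don't depend on a specific lexer/parser combination live in
# TestParsers; the (lexer, parser) matrix is covered by the two factory
# functions further down.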
class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w/+ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
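
    # The rule 'a: a' below is a cycle, making the grammar infinitely
    # ambiguous; LALR rejects it while building its tables, whereas Earley
    # only fails once it is asked to parse actual input.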
    def test_infinite_recurse(self):
        g = """start: a
               a: a | "a"
            """

        self.assertRaises(GrammarError, Lark, g, parser='lalr')

        l = Lark(g, parser='earley', lexer='dynamic')
        self.assertRaises(ParseError, l.parse, 'a')

    def test_propagate_positions(self):
        g = Lark("""start: a
                    a: "a"
                 """, propagate_positions=True)

        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )
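
    # A rule prefixed with '?' is inlined into its parent whenever it has a
    # single child, unless an alias ('-> name') forces a named node to be kept.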
    def test_expand1(self):
        g = Lark("""start: a
                    ?a: b
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "b" )

        g = Lark("""start: a
                    ?a: b -> c
                    b: "x"
                 """)

        r = g.parse('x')
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: B -> c
                    B: "x"
                 """)

        r = g.parse('x')    # was missing; the assertion below otherwise re-tested the previous parse
        self.assertEqual( r.children[0].data, "c" )

        g = Lark("""start: a
                    ?a: b b -> c
                    b: "x"
                 """)

        r = g.parse('xx')
        self.assertEqual( r.children[0].data, "c" )

    def test_embedded_transformer(self):
        @children_args
        class T(Transformer):
            def a(self, children):
                return "<a>"
            def b(self, children):
                return "<b>"
            def c(self, children):
                return "<c>"

        # Test regular
        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<a>"] )

        g = Lark("""start: a
                    a : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<a>"] )

        # Test Expand1
        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("x"))
        self.assertEqual( r.children, ["<b>"] )

        g = Lark("""start: a
                    ?a : b
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("x")
        self.assertEqual( r.children, ["<b>"] )

        # Test Expand1 -> Alias
        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr')
        r = T().transform(g.parse("xx"))
        self.assertEqual( r.children, ["<c>"] )

        g = Lark("""start: a
                    ?a : b b -> c
                    b : "x"
                 """, parser='lalr', transformer=T())
        r = g.parse("xx")
        self.assertEqual( r.children, ["<c>"] )
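

# Factory: builds an Earley-specific TestCase for the given lexer and
# registers it in globals() under a unique name so unittest can discover it.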
def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                        """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+
            statement: "r"
                     | "c" /[a-z]/+
            %ignore " "
            """
            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        def test_earley3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
            grammar = """
            start: A A
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
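
        # With ambiguity='explicit', Earley returns every derivation under a
        # root '_ambig' node instead of resolving to a single tree.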
        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            res = parser.parse('ab')

            self.assertEqual( res.data, '_ambig')
            self.assertEqual( len(res.children), 2)

        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"
            !cd: "c"
               | "d"
               | "cd"
            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            x = l.parse('cde')
            assert x.data == '_ambig', x
            assert len(x.children) == 2

        def test_fruitflies_ambig(self):
            grammar = """
            start: noun verb noun        -> simple
                 | noun verb "like" noun -> comparative

            noun: adj? NOUN
            verb: VERB
            adj: ADJ

            NOUN: "flies" | "bananas" | "fruit"
            VERB: "like" | "flies"
            ADJ: "fruit"

            %import common.WS
            %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            res = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # print res.pretty()
            # print expected.pretty()
            self.assertEqual(res, expected)
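
        # With explicit ambiguity, every way of splitting "cat" into NAME
        # tokens should surface as its own derivation.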
        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a', 't')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"
        #     !cd: "c"
        #        | "d"
        #        | "cd"
        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
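

# Factory: builds a TestCase for one (lexer, parser) combination; the
# _TO_TEST list at the bottom of the file drives which combinations exist.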
def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)

            r = g.parse('aaabaab')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
            r = g.parse('aaabaaba')
            self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        @unittest.skipIf(PARSER == 'cyk', "Takes forever")
        def test_stack_for_ebnf(self):
            """Verify that stack depth isn't an issue for EBNF grammars"""
            g = _Lark(r"""start: a+
                          a : "a" """)

            g.parse("a" * (sys.getrecursionlimit()*2 ))

        def test_expand1_lists_with_one_item(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)

            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision_WS(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %import common.WS
                          %ignore WS
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        def test_token_collision2(self):
            g = _Lark("""
                      !start: "starts"
                      %import common.LCASE_LETTER
                      """)

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        def test_token_not_anon(self):
            """Tests that "a" is matched as an anonymous token, and not A.
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

            g = _Lark("""start: "a" A
                         A: "a" """)
            x = g.parse('aa')
            self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1)
            self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")
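
        # Square brackets mark an optional item: ["a"] matches either "a" or
        # the empty string.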
        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_special_chars(self):
            g = _Lark(r"""start: "\n"
                       """)
            x = g.parse('\n')

            g = _Lark(r"""start: /\n/
                       """)
            x = g.parse('\n')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_regex_quote(self):
            g = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING : /'[^']*'/
            DOUBLE_QUOTED_STRING : /"[^"]*"/
            """
            g = _Lark(g)
            self.assertEqual( g.parse('"hello"').children, ['"hello"'])
            self.assertEqual( g.parse("'hello'").children, ["'hello'"])

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
        def test_float_without_lexer(self):
            expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
            if PARSER == 'cyk':
                expected_error = ParseError

            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises( expected_error, g.parse, "+2e-9e")

        def test_keep_all_tokens(self):
            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
            tree = l.parse('aaa')
            self.assertEqual(tree.children, ['a', 'a', 'a'])
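
        # The 'i' flag makes a string or regexp terminal case-insensitive.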
        def test_token_flags(self):
            l = _Lark("""!start: "a"i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            l = _Lark("""!start: /a/i+
                      """)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

            # g = """!start: "a"i "a"
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            # g = """!start: /a/i /a/
            #     """
            # self.assertRaises(GrammarError, _Lark, g)

            g = """start: NAME "," "a"
                   NAME: /[a-z_]/i /[a-z0-9_]/i*
                """
            l = _Lark(g)
            tree = l.parse('ab,a')
            self.assertEqual(tree.children, ['ab'])
            tree = l.parse('AB,a')
            self.assertEqual(tree.children, ['AB'])

        def test_token_flags3(self):
            l = _Lark("""!start: ABC+
                         ABC: "abc"i
                      """)
            tree = l.parse('aBcAbC')
            self.assertEqual(tree.children, ['aBc', 'AbC'])

        def test_token_flags2(self):
            g = """!start: ("a"i | /a/ /b/?)+
                """
            l = _Lark(g)
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):
            g = """!start: [["A"]]
                """
            l = _Lark(g)
            tree = l.parse('A')
            self.assertEqual(tree.children, ['A'])

            tree = l.parse('')
            self.assertEqual(tree.children, [])

        def test_undefined_ignore(self):
            g = """!start: "A"
                %ignore B
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
                TERM: "a" -> alias
                """
            self.assertRaises( GrammarError, _Lark, g)

        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
                !bc: "B\nC"
                """
            l = _Lark(g)
            a, bc, d = l.parse("AB\nCD").children
            self.assertEqual(a.line, 1)
            self.assertEqual(a.column, 1)

            bc, = bc.children
            self.assertEqual(bc.line, 1)
            self.assertEqual(bc.column, 2)

            self.assertEqual(d.line, 2)
            self.assertEqual(d.column, 2)

            if LEXER != 'dynamic':
                self.assertEqual(a.end_line, 1)
                self.assertEqual(a.end_column, 2)
                self.assertEqual(bc.end_line, 2)
                self.assertEqual(bc.end_column, 2)
                self.assertEqual(d.end_line, 2)
                self.assertEqual(d.end_column, 3)

        def test_reduce_cycle(self):
            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
            """
            l = _Lark("""
            term: A
                | term term
            A: "a"
            """, start='term')
            tree = l.parse("aa")
            self.assertEqual(len(tree.children), 2)

        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
        def test_lexer_prioritization(self):
            "Tests effect of priority on result"
            grammar = """
            start: A B | AB
            A.2: "a"
            B: "b"
            AB: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertEqual(res.children, ['a', 'b'])
            self.assertNotEqual(res.children, ['ab'])

            grammar = """
            start: A B | AB
            A: "a"
            B: "b"
            AB.3: "ab"
            """
            l = _Lark(grammar)
            res = l.parse("ab")

            self.assertNotEqual(res.children, ['a', 'b'])
            self.assertEqual(res.children, ['ab'])

        def test_import(self):
            grammar = """
            start: NUMBER WORD

            %import common.NUMBER
            %import common.WORD
            %import common.WS
            %ignore WS
            """
            l = _Lark(grammar)
            x = l.parse('12 elephants')
            self.assertEqual(x.children, ['12', 'elephants'])

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"
            grammar = """
            start: a | b
            a.1: "a"
            b.2: "a"
            """

            # l = Lark(grammar, parser='earley', lexer='standard')
            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')

            grammar = """
            start: a | b
            a.2: "a"
            b.1: "a"
            """

            l = _Lark(grammar)
            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')
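
        # 'resolve__antiscore_sum' resolves ambiguity by minimizing the summed
        # rule priorities across the derivation; the expected trees below
        # exercise both outcomes.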
        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):
            "Tests effect of priority on result"
            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_: "ab"
            bb_.1: "bb"
            """
            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_: "a"
            b_: "b"
            ab_.1: "ab"
            bb_: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.2: "a"
            b_.1: "b"
            ab_.3: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')

            grammar = """
            start: ab_ b_ a_ | indirection
            indirection: a_ bb_ a_
            a_.1: "a"
            b_.1: "b"
            ab_.4: "ab"
            bb_.3: "bb"
            """
            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
            res = l.parse('abba')
            self.assertEqual(''.join(child.data for child in res.children), 'indirection')

        def test_utf8(self):
            g = u"""start: a
                    a: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))

            g = u"""start: A
                    A: "±a"
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_ignore(self):
            grammar = r"""
            COMMENT: /(!|(\/\/))[^\n]*/
            %ignore COMMENT
            %import common.WS -> _WS
            %import common.INT
            start: "INT"i _WS+ INT _WS*
            """
            parser = _Lark(grammar)

            tree = parser.parse("int 1 ! This is a comment\n")
            self.assertEqual(tree.children, ['1'])

            tree = parser.parse("int 1 ! This is a comment")    # A trailing ignore token can be tricky!
            self.assertEqual(tree.children, ['1'])

            parser = _Lark(r"""
                start : "a"*
                %ignore "b"
            """)
            tree = parser.parse("bb")
            self.assertEqual(tree.children, [])
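
        # Checks that backslash escapes survive the round trip from grammar
        # string to compiled regexp: /\\w/ must match a literal '\' plus 'w',
        # not a word character.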
        def test_regex_escaping(self):
            g = _Lark("start: /[ab]/")
            g.parse('a')
            g.parse('b')

            self.assertRaises( UnexpectedInput, g.parse, 'c')

            _Lark(r'start: /\w/').parse('a')

            g = _Lark(r'start: /\\w/')
            self.assertRaises( UnexpectedInput, g.parse, 'a')
            g.parse(r'\w')

            _Lark(r'start: /\[/').parse('[')
            _Lark(r'start: /\//').parse('/')
            _Lark(r'start: /\\/').parse('\\')
            _Lark(r'start: /\[ab]/').parse('[ab]')
            _Lark(r'start: /\\[ab]/').parse('\\a')
            _Lark(r'start: /\t/').parse('\t')
            _Lark(r'start: /\\t/').parse('\\t')
            _Lark(r'start: /\\\t/').parse('\\\t')
            _Lark(r'start: "\t"').parse('\t')
            _Lark(r'start: "\\t"').parse('\\t')
            _Lark(r'start: "\\\t"').parse('\\\t')
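
        # '~' is the ranged-repeat operator: "A"~3 means exactly three
        # repetitions, "A"~0..2 means anywhere from zero to two.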
        def test_ranged_repeat_rules(self):
            g = u"""!start: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
            self.assertRaises(ParseError, l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: "A"~0..2
                 """
            if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
                l = _Lark(g)
                self.assertEqual(l.parse(u''), Tree('start', []))
                self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
                self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')

            g = u"""!start: "A"~3..2
                 """
            self.assertRaises(GrammarError, _Lark, g)

            g = u"""!start: "A"~2..3 "B"~2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
            self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
            self.assertRaises(ParseError, l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        def test_ranged_repeat_terms(self):
            g = u"""!start: AAA
                    AAA: "A"~3
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

            g = u"""!start: AABB CC
                    AABB: "A"~0..2 "B"~2
                    CC: "C"~1..2
                 """
            l = _Lark(g)
            self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
            self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
            self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

# Note: You still have to import them in __main__ for the tests to run
_TO_TEST = [
        ('standard', 'earley'),
        ('standard', 'cyk'),
        ('dynamic', 'earley'),
        ('standard', 'lalr'),
        ('contextual', 'lalr'),
        # (None, 'earley'),
]

for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in ('dynamic',):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()