
Standalone parser now uses contextual lexer instead of traditional one (Issue #212)

It should be easy to let the user choose between them, but that is probably unnecessary, as the only benefit of the traditional lexer is a tiny performance advantage.
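Background: a traditional ("standard") lexer matches the input against every terminal in the grammar at all times, while a contextual lexer consults the parser's current LALR state and considers only the terminals that state can accept. A minimal sketch of the distinction, using toy data structures rather than lark's actual classes:

    # Toy terminals as (name, literal) pairs -- not lark's actual classes.
    TERMINALS = [('A', 'A'), ('B', 'B'), ('AB', 'AB')]

    def traditional_next_token(text, pos):
        # Always tries every terminal, preferring the longest match.
        matches = [(name, lit) for name, lit in TERMINALS
                   if text.startswith(lit, pos)]
        return max(matches, key=lambda m: len(m[1]))

    def contextual_next_token(text, pos, allowed):
        # Only tries the terminals the parser's current state accepts;
        # `allowed` would come from the LALR action table.
        matches = [(name, lit) for name, lit in TERMINALS
                   if name in allowed and text.startswith(lit, pos)]
        return max(matches, key=lambda m: len(m[1]))

    # On 'ABAB' the traditional lexer emits AB first (longest match), which
    # breaks a grammar expecting A then B; the contextual lexer, restricted
    # to {'A'} in the initial state, emits A instead.
    assert traditional_next_token('ABAB', 0) == ('AB', 'AB')
    assert contextual_next_token('ABAB', 0, {'A'}) == ('A', 'A')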
Erez Shinan · 6 years ago · commit 720372a386
2 changed files with 83 additions and 19 deletions
  1. lark/tools/standalone.py   +47 -10
  2. tests/test_tools.py        +36 -9

lark/tools/standalone.py   +47 -10

@@ -83,7 +83,7 @@ def extract_sections(lines):
 
     return {name:''.join(text) for name, text in sections.items()}
 
-class LexerAtoms:
+class TraditionalLexerAtoms:
     def __init__(self, lexer):
         self.mres = [(p.pattern,d) for p,d in lexer.mres]
         self.newline_types = lexer.newline_types
@@ -93,24 +93,60 @@ class LexerAtoms:
 
     def print_python(self):
         print('import re')
+        print('class LexerRegexps: pass')
+        print('NEWLINE_TYPES = %s' % self.newline_types)
+        print('IGNORE_TYPES = %s' % self.ignore_types)
+        self._print_python('lexer')
+
+    def _print_python(self, var_name):
         print('MRES = (')
         pprint(self.mres)
         print(')')
         print('LEXER_CALLBACK = (')
         pprint(self.callback)
         print(')')
-        print('NEWLINE_TYPES = %s' % self.newline_types)
-        print('IGNORE_TYPES = %s' % self.ignore_types)
-        print('class LexerRegexps: pass')
         print('lexer_regexps = LexerRegexps()')
         print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]')
         print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])')
         print('                          for n, mres in LEXER_CALLBACK.items()}')
-        print('lexer = _Lex(lexer_regexps)')
-        print('def lex(stream):')
-        print('    return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)')
+        print('%s = (lexer_regexps)' % var_name)
+
+
+class ContextualLexerAtoms:
+    def __init__(self, lexer):
+        self.lexer_atoms = {state: TraditionalLexerAtoms(lexer) for state, lexer in lexer.lexers.items()}
+        self.root_lexer_atoms = TraditionalLexerAtoms(lexer.root_lexer)
+
+    def print_python(self):
+        print('import re')
+        print('class LexerRegexps: pass')
+        print('NEWLINE_TYPES = %s' % self.root_lexer_atoms.newline_types)
+        print('IGNORE_TYPES = %s' % self.root_lexer_atoms.ignore_types)
+
+        print('LEXERS = {}')
+        for state, lexer_atoms in self.lexer_atoms.items():
+            lexer_atoms._print_python('LEXERS[%d]' % state)
+
+        print('class ContextualLexer:')
+        print('    def __init__(self):')
+        print('        self.lexers = LEXERS')
+        print('        self.set_parser_state(None)')
+        print('    def set_parser_state(self, state):')
+        print('        self.parser_state = state')
+        print('    def lex(self, stream):')
+        print('        newline_types = NEWLINE_TYPES')
+        print('        ignore_types = IGNORE_TYPES')
+        print('        lexers = LEXERS')
+        print('        l = _Lex(lexers[self.parser_state], self.parser_state)')
+        print('        for x in l.lex(stream, newline_types, ignore_types):')
+        print('            yield x')
+        print('            l.lexer = lexers[self.parser_state]')
+        print('            l.state = self.parser_state')
+
+        print('CON_LEXER = ContextualLexer()')
+        print('def lex(stream):')
+        print('    return CON_LEXER.lex(stream)')
 
 class GetRule:
     def __init__(self, rule_id):
         self.rule_id = rule_id
@@ -153,8 +189,9 @@ class ParserAtoms:
         print('        self.postlex = postlex')
         print('    def parse(self, stream):')
         print('        tokens = lex(stream)')
+        print('        sps = CON_LEXER.set_parser_state')
         print('        if self.postlex: tokens = self.postlex.process(tokens)')
-        print('        return self.parser.parse(tokens)')
+        print('        return self.parser.parse(tokens, sps)')
 
 class TreeBuilderAtoms:
     def __init__(self, lark):
@@ -171,9 +208,9 @@ class TreeBuilderAtoms:
         print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)')
 
 def main(fobj, start):
-    lark_inst = Lark(fobj, parser="lalr", lexer="standard", start=start)
+    lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start)
 
-    lexer_atoms = LexerAtoms(lark_inst.parser.lexer)
+    lexer_atoms = ContextualLexerAtoms(lark_inst.parser.lexer)
     parser_atoms = ParserAtoms(lark_inst.parser.parser)
     tree_builder_atoms = TreeBuilderAtoms(lark_inst)
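
A note on the generated code above: the emitted ContextualLexer.lex is a generator that yields each token and, when it resumes, swaps in the sub-lexer for whatever parser state is current. The parser drives this by calling CON_LEXER.set_parser_state (passed into parser.parse as sps) after every step. Below is a simplified, runnable sketch of that handshake; ToyContextualLexer, SUB_LEXERS, and TRANSITIONS are invented stand-ins for the generated tables and lark's LALR machinery, not lark's API:

    import re

    # Toy stand-ins: one sub-lexer per parser state, and a transition
    # table playing the role of the LALR parser.
    SUB_LEXERS = {0: re.compile('A'), 1: re.compile('B'), 2: re.compile('AB')}
    TRANSITIONS = {(0, 'A'): 1, (1, 'B'): 2, (2, 'AB'): 0}

    class ToyContextualLexer:
        def __init__(self):
            self.parser_state = 0

        def set_parser_state(self, state):
            self.parser_state = state

        def lex(self, text):
            pos = 0
            while pos < len(text):
                # Re-read parser_state each iteration: by the time the
                # generator resumes after a yield, the parser has already
                # called set_parser_state() with its new state.
                m = SUB_LEXERS[self.parser_state].match(text, pos)
                if not m:
                    raise ValueError('no terminal matches at %d in state %d'
                                     % (pos, self.parser_state))
                pos = m.end()
                yield m.group()

    def toy_parse(text, lexer):
        state = 0
        lexer.set_parser_state(state)
        for tok in lexer.lex(text):
            state = TRANSITIONS[state, tok]
            lexer.set_parser_state(state)  # inform the lexer before it resumes
        return state

    assert toy_parse('ABAB', ToyContextualLexer()) == 0  # lexed as A, B, AB

Re-reading parser_state after every yield is what lets a plain generator act as a lexer with feedback from the parser, without threading the state through every call.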



tests/test_tools.py   +36 -9

@@ -17,6 +17,18 @@ class TestStandalone(TestCase):
     def setUp(self):
         pass
 
+    def _create_standalone(self, grammar):
+        code_buf = StringIO()
+        temp = sys.stdout
+        sys.stdout = code_buf
+        standalone.main(StringIO(grammar), 'start')
+        sys.stdout = temp
+        code = code_buf.getvalue()
+
+        context = {}
+        exec(code, context)
+        return context
+
     def test_simple(self):
         grammar = """
         start: NUMBER WORD
@@ -28,21 +40,36 @@ class TestStandalone(TestCase):
 
         """
 
-        code_buf = StringIO()
-        temp = sys.stdout
-        sys.stdout = code_buf
-        standalone.main(StringIO(grammar), 'start')
-        sys.stdout = temp
-        code = code_buf.getvalue()
+        context = self._create_standalone(grammar)
 
-        context = {}
-        exec(code, context)
         _Lark = context['Lark_StandAlone']
 
         l = _Lark()
         x = l.parse('12 elephants')
         self.assertEqual(x.children, ['12', 'elephants'])
 
+    def test_contextual(self):
+        grammar = """
+        start: a b
+        a: "A" "B"
+        b: "AB"
+        """
+
+        context = self._create_standalone(grammar)
+
+        _Lark = context['Lark_StandAlone']
+        l = _Lark()
+        x = l.parse('ABAB')
+
+        class T(context['Transformer']):
+            def a(self, items):
+                return 'a'
+            def b(self, items):
+                return 'b'
+            start = list
+
+        x = T().transform(x)
+        self.assertEqual(x, ['a', 'b'])
+
 
 if __name__ == '__main__':
     unittest.main()
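
The new test_contextual is exactly the case a standard lexer cannot handle: on 'ABAB', longest-match lexing produces AB, AB, but the grammar requires the first two characters to be the separate tokens A and B. The contextual lexer gets it right because the initial parser state only accepts A. Outside the test suite, the same generated module can be produced and used from the command line; a sketch, where my_grammar.lark and my_parser.py are placeholder names:

    python -m lark.tools.standalone my_grammar.lark > my_parser.py

    # then, in your own code:
    from my_parser import Lark_StandAlone
    parser = Lark_StandAlone()
    tree = parser.parse('ABAB')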

