diff --git a/README.md b/README.md
index 542977f..794a203 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ Lark can:
- Build a parse-tree automagically, no construction code required
- Outperform all other Python libraries when using LALR(1) (Yes, including PLY)
- Run on every Python interpreter (it's pure-python)
+ - Generate a stand-alone parser (for LALR(1) grammars; see the example below)
And many more features. Read ahead and find out.
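+
+For example, generating the stand-alone JSON parser used in `examples/standalone`:
+
+```bash
+python -m lark.tools.standalone json.g > json_parser.py
+```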
@@ -66,10 +67,11 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
- Builds a parse-tree (AST) automagically, based on the structure of the grammar
- **Earley** parser
- - Can parse *ALL* context-free grammars
- - Full support for ambiguity in grammar
+ - Can parse all context-free grammars
+ - Full support for ambiguous grammars
- **LALR(1)** parser
- - Competitive with PLY
+ - Fast and light, competitive with PLY
+ - Can generate a stand-alone parser
- **EBNF** grammar
- **Unicode** fully supported
- **Python 2 & 3** compatible
@@ -86,7 +88,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/
#### Performance comparison
-Lower is better!
+Lark is the fastest and lightest (lower is better)
![Run-time Comparison](docs/comparison_runtime.png)
@@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail
#### Feature comparison
-| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG?
-|:--------|:----------|:----|:--------|:------------|:------------
-| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! |
-| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No |
-| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* |
-| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* |
-| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No |
-| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* |
+| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone
+|:--------|:----------|:----|:--------|:------------|:------------|:----------|:----------
+| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) |
+| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No |
+| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No |
+| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No |
+| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No |
+| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No |
-(\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*)
+(\* *PEGs cannot handle non-deterministic grammars. Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*)
### Projects using Lark
diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh
new file mode 100755
index 0000000..1eba3a4
--- /dev/null
+++ b/examples/standalone/create_standalone.sh
@@ -0,0 +1 @@
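+# Generates a self-contained LALR(1) JSON parser from json.g.
+# The result, json_parser.py, is used by json_parser_main.py.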
+python -m lark.tools.standalone json.g > json_parser.py
diff --git a/examples/standalone/json.g b/examples/standalone/json.g
new file mode 100644
index 0000000..243a230
--- /dev/null
+++ b/examples/standalone/json.g
@@ -0,0 +1,21 @@
+?start: value
+
+?value: object
+ | array
+ | string
+ | SIGNED_NUMBER -> number
+ | "true" -> true
+ | "false" -> false
+ | "null" -> null
+
+array : "[" [value ("," value)*] "]"
+object : "{" [pair ("," pair)*] "}"
+pair : string ":" value
+
+string : ESCAPED_STRING
+
+%import common.ESCAPED_STRING
+%import common.SIGNED_NUMBER
+%import common.WS
+
+%ignore WS
diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py
new file mode 100644
index 0000000..f249f61
--- /dev/null
+++ b/examples/standalone/json_parser.py
@@ -0,0 +1,794 @@
+# The file was automatically generated by Lark v0.5.2
+#
+#
+# Lark Stand-alone Generator Tool
+# ----------------------------------
+# Generates a stand-alone LALR(1) parser with a standard lexer
+#
+# Git: https://github.com/erezsh/lark
+# Author: Erez Shinan (erezshin@gmail.com)
+#
+#
+# >>> LICENSE
+#
+# This tool and its generated code use a separate license from Lark.
+#
+# It is licensed under GPLv2 or above.
+#
+# If you wish to purchase a commercial license for this tool and its
+# generated code, contact me via email.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# See <http://www.gnu.org/licenses/>.
+#
+#
+
+
+import types
+import functools
+from contextlib import contextmanager
+
+Str = type(u'')
+
+def inline_args(f):
+ # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType)
+ if isinstance(f, types.FunctionType):
+ @functools.wraps(f)
+ def _f_func(self, args):
+ return f(self, *args)
+ return _f_func
+ elif isinstance(f, (type, types.BuiltinFunctionType)):
+ @functools.wraps(f)
+ def _f_builtin(_self, args):
+ return f(*args)
+ return _f_builtin
+ elif isinstance(f, types.MethodType):
+ @functools.wraps(f.__func__)
+ def _f(self, args):
+ return f.__func__(self, *args)
+ return _f
+ else:
+ @functools.wraps(f.__call__.__func__)
+ def _f(self, args):
+ return f.__call__.__func__(self, *args)
+ return _f
+
+
+try:
+ from contextlib import suppress # Python 3
+except ImportError:
+ @contextmanager
+ def suppress(*excs):
+ '''Catch and dismiss the provided exception
+
+ >>> x = 'hello'
+ >>> with suppress(IndexError):
+ ... x = x[10]
+ >>> x
+ 'hello'
+ '''
+ try:
+ yield
+ except excs:
+ pass
+
+
+def is_terminal(sym):
+ return sym.isupper()
+
+class GrammarError(Exception):
+ pass
+
+class ParseError(Exception):
+ pass
+
+class UnexpectedToken(ParseError):
+ def __init__(self, token, expected, seq, index):
+ self.token = token
+ self.expected = expected
+ self.line = getattr(token, 'line', '?')
+ self.column = getattr(token, 'column', '?')
+
+ try:
+ context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
+ except AttributeError:
+ context = seq[index:index+5]
+ except TypeError:
+ context = ""
+ message = ("Unexpected token %r at line %s, column %s.\n"
+ "Expected: %s\n"
+ "Context: %s" % (token, self.line, self.column, expected, context))
+
+ super(UnexpectedToken, self).__init__(message)
+
+
+
+class Tree(object):
+ def __init__(self, data, children):
+ self.data = data
+ self.children = list(children)
+
+ def __repr__(self):
+ return 'Tree(%s, %s)' % (self.data, self.children)
+
+ def _pretty_label(self):
+ return self.data
+
+ def _pretty(self, level, indent_str):
+ if len(self.children) == 1 and not isinstance(self.children[0], Tree):
+ return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n']
+
+ l = [ indent_str*level, self._pretty_label(), '\n' ]
+ for n in self.children:
+ if isinstance(n, Tree):
+ l += n._pretty(level+1, indent_str)
+ else:
+ l += [ indent_str*(level+1), '%s' % n, '\n' ]
+
+ return l
+
+ def pretty(self, indent_str=' '):
+ return ''.join(self._pretty(0, indent_str))
+class Transformer(object):
+ def _get_func(self, name):
+ return getattr(self, name)
+
+ def transform(self, tree):
+ items = []
+ for c in tree.children:
+ try:
+ items.append(self.transform(c) if isinstance(c, Tree) else c)
+ except Discard:
+ pass
+ try:
+ f = self._get_func(tree.data)
+ except AttributeError:
+ return self.__default__(tree.data, items)
+ else:
+ return f(items)
+
+ def __default__(self, data, children):
+ return Tree(data, children)
+
+ def __mul__(self, other):
+ return TransformerChain(self, other)
+
+
+class Discard(Exception):
+ pass
+
+class TransformerChain(object):
+ def __init__(self, *transformers):
+ self.transformers = transformers
+
+ def transform(self, tree):
+ for t in self.transformers:
+ tree = t.transform(tree)
+ return tree
+
+ def __mul__(self, other):
+ return TransformerChain(*self.transformers + (other,))
+
+
+
+class InlineTransformer(Transformer):
+ def _get_func(self, name): # use super()._get_func
+ return inline_args(getattr(self, name)).__get__(self)
+
+
+class Visitor(object):
+ def visit(self, tree):
+ for child in tree.children:
+ if isinstance(child, Tree):
+ self.visit(child)
+
+ f = getattr(self, tree.data, self.__default__)
+ f(tree)
+ return tree
+
+ def __default__(self, tree):
+ pass
+
+
+class Visitor_NoRecurse(Visitor):
+ def visit(self, tree):
+ subtrees = list(tree.iter_subtrees())
+
+ for subtree in (subtrees):
+ getattr(self, subtree.data, self.__default__)(subtree)
+ return tree
+
+
+class Transformer_NoRecurse(Transformer):
+ def transform(self, tree):
+ subtrees = list(tree.iter_subtrees())
+
+ def _t(t):
+ # Assumes t is already transformed
+ try:
+ f = self._get_func(t.data)
+ except AttributeError:
+ return self.__default__(t)
+ else:
+ return f(t)
+
+ for subtree in subtrees:
+ children = []
+ for c in subtree.children:
+ try:
+ children.append(_t(c) if isinstance(c, Tree) else c)
+ except Discard:
+ pass
+ subtree.children = children
+
+ return _t(tree)
+
+ def __default__(self, t):
+ return t
+
+class Indenter:
+ def __init__(self):
+ self.paren_level = 0
+ self.indent_level = [0]
+
+ def handle_NL(self, token):
+ if self.paren_level > 0:
+ return
+
+ yield token
+
+ indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
+ indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
+
+ if indent > self.indent_level[-1]:
+ self.indent_level.append(indent)
+ yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
+ else:
+ while indent < self.indent_level[-1]:
+ self.indent_level.pop()
+ yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
+
+ assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
+
+ def process(self, stream):
+ for token in stream:
+ if token.type == self.NL_type:
+ for t in self.handle_NL(token):
+ yield t
+ else:
+ yield token
+
+ if token.type in self.OPEN_PAREN_types:
+ self.paren_level += 1
+ elif token.type in self.CLOSE_PAREN_types:
+ self.paren_level -= 1
+ assert self.paren_level >= 0
+
+ while len(self.indent_level) > 1:
+ self.indent_level.pop()
+ yield Token(self.DEDENT_type, '')
+
+ assert self.indent_level == [0], self.indent_level
+
+ # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
+ @property
+ def always_accept(self):
+ return (self.NL_type,)
+
+
+class LexError(Exception):
+ pass
+
+class UnexpectedInput(LexError):
+ def __init__(self, seq, lex_pos, line, column, allowed=None):
+ context = seq[lex_pos:lex_pos+5]
+ message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
+
+ super(UnexpectedInput, self).__init__(message)
+
+ self.line = line
+ self.column = column
+ self.context = context
+ self.allowed = allowed
+
+class Token(Str):
+ def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
+ inst = Str.__new__(cls, value)
+ inst.type = type_
+ inst.pos_in_stream = pos_in_stream
+ inst.value = value
+ inst.line = line
+ inst.column = column
+ return inst
+
+ @classmethod
+ def new_borrow_pos(cls, type_, value, borrow_t):
+ return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)
+
+ def __repr__(self):
+ return 'Token(%s, %r)' % (self.type, self.value)
+
+ def __deepcopy__(self, memo):
+ return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)
+
+ def __eq__(self, other):
+ if isinstance(other, Token) and self.type != other.type:
+ return False
+
+ return Str.__eq__(self, other)
+
+ __hash__ = Str.__hash__
+
+
+class LineCounter:
+ def __init__(self):
+ self.newline_char = '\n'
+ self.char_pos = 0
+ self.line = 1
+ self.column = 0
+ self.line_start_pos = 0
+
+ def feed(self, token, test_newline=True):
+ """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
+ """
+ if test_newline:
+ newlines = token.count(self.newline_char)
+ if newlines:
+ self.line += newlines
+ self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+ self.char_pos += len(token)
+ self.column = self.char_pos - self.line_start_pos
+
+class _Lex:
+ "Built to serve both Lexer and ContextualLexer"
+ def __init__(self, lexer):
+ self.lexer = lexer
+
+ def lex(self, stream, newline_types, ignore_types):
+ newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+ line_ctr = LineCounter()
+
+ while True:
+ lexer = self.lexer
+ for mre, type_from_index in lexer.mres:
+ m = mre.match(stream, line_ctr.char_pos)
+ if m:
+ value = m.group(0)
+ type_ = type_from_index[m.lastindex]
+ if type_ not in ignore_types:
+ t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+ if t.type in lexer.callback:
+ t = lexer.callback[t.type](t)
+ lexer = yield t
+
+ line_ctr.feed(value, type_ in newline_types)
+ break
+ else:
+ if line_ctr.char_pos < len(stream):
+ raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+ break
+
+class UnlessCallback:
+ def __init__(self, mres):
+ self.mres = mres
+
+ def __call__(self, t):
+ for mre, type_from_index in self.mres:
+ m = mre.match(t.value)
+ if m:
+ value = m.group(0)
+ t.type = type_from_index[m.lastindex]
+ break
+ return t
+
+
+
+class NodeBuilder:
+ def __init__(self, tree_class, name):
+ self.tree_class = tree_class
+ self.name = name
+
+ def __call__(self, children):
+ return self.tree_class(self.name, children)
+
+class Expand1:
+ def __init__(self, node_builder):
+ self.node_builder = node_builder
+
+ def __call__(self, children):
+ if len(children) == 1:
+ return children[0]
+ else:
+ return self.node_builder(children)
+
+class Factory:
+ def __init__(self, cls, *args):
+ self.cls = cls
+ self.args = args
+
+ def __call__(self, node_builder):
+ return self.cls(node_builder, *self.args)
+
+
+class TokenWrapper:
+ "Used for fixing the results of scanless parsing"
+
+ def __init__(self, node_builder, token_name):
+ self.node_builder = node_builder
+ self.token_name = token_name
+
+ def __call__(self, children):
+ return self.node_builder( [Token(self.token_name, ''.join(children))] )
+
+def identity(node_builder):
+ return node_builder
+
+
+class ChildFilter:
+ def __init__(self, node_builder, to_include):
+ self.node_builder = node_builder
+ self.to_include = to_include
+
+ def __call__(self, children):
+ filtered = []
+ for i, to_expand in self.to_include:
+ if to_expand:
+ filtered += children[i].children
+ else:
+ filtered.append(children[i])
+
+ return self.node_builder(filtered)
+
+def create_rule_handler(expansion, keep_all_tokens, filter_out):
+ # if not keep_all_tokens:
+ to_include = [(i, not is_terminal(sym) and sym.startswith('_'))
+ for i, sym in enumerate(expansion)
+ if keep_all_tokens
+ or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out)
+ ]
+
+ if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
+ return Factory(ChildFilter, to_include)
+
+ # else, if no filtering required..
+ return identity
+
+class PropagatePositions:
+ def __init__(self, node_builder):
+ self.node_builder = node_builder
+
+ def __call__(self, children):
+ res = self.node_builder(children)
+
+ if children:
+ for a in children:
+ with suppress(AttributeError):
+ res.line = a.line
+ res.column = a.column
+ break
+
+ for a in reversed(children):
+ with suppress(AttributeError):
+ res.end_line = a.end_line
+ res.end_col = a.end_col
+ break
+
+ return res
+
+
+class Callback(object):
+ pass
+
+class ParseTreeBuilder:
+ def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False):
+ self.tree_class = tree_class
+ self.propagate_positions = propagate_positions
+ self.always_keep_all_tokens = keep_all_tokens
+
+ self.rule_builders = list(self._init_builders(rules))
+
+ self.user_aliases = {}
+
+ def _init_builders(self, rules):
+ filter_out = set()
+ for rule in rules:
+ if rule.options and rule.options.filter_out:
+ assert rule.origin.startswith('_') # Just to make sure
+ filter_out.add(rule.origin)
+
+ for rule in rules:
+ options = rule.options
+ keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
+ expand1 = options.expand1 if options else False
+ create_token = options.create_token if options else False
+
+ wrapper_chain = filter(None, [
+ (expand1 and not rule.alias) and Expand1,
+ create_token and Factory(TokenWrapper, create_token),
+ create_rule_handler(rule.expansion, keep_all_tokens, filter_out),
+ self.propagate_positions and PropagatePositions,
+ ])
+
+ yield rule, wrapper_chain
+
+
+ def create_callback(self, transformer=None):
+ callback = Callback()
+
+ for rule, wrapper_chain in self.rule_builders:
+ internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion))
+
+ user_callback_name = rule.alias or rule.origin
+ try:
+ f = transformer._get_func(user_callback_name)
+ except AttributeError:
+ f = NodeBuilder(self.tree_class, user_callback_name)
+
+ self.user_aliases[rule] = rule.alias
+ rule.alias = internal_callback_name
+
+ for w in wrapper_chain:
+ f = w(f)
+
+ if hasattr(callback, internal_callback_name):
+ raise GrammarError("Rule '%s' already exists" % (rule,))
+ setattr(callback, internal_callback_name, f)
+
+ return callback
+
+
+
+class _Parser:
+ def __init__(self, parse_table, callbacks):
+ self.states = parse_table.states
+ self.start_state = parse_table.start_state
+ self.end_state = parse_table.end_state
+ self.callbacks = callbacks
+
+ def parse(self, seq, set_state=None):
+ i = 0
+ token = None
+ stream = iter(seq)
+ states = self.states
+
+ state_stack = [self.start_state]
+ value_stack = []
+
+ if set_state: set_state(self.start_state)
+
+ def get_action(key):
+ state = state_stack[-1]
+ try:
+ return states[state][key]
+ except KeyError:
+ expected = states[state].keys()
+
+ raise UnexpectedToken(token, expected, seq, i)
+
+ def reduce(rule):
+ size = len(rule.expansion)
+ if size:
+ s = value_stack[-size:]
+ del state_stack[-size:]
+ del value_stack[-size:]
+ else:
+ s = []
+
+ value = self.callbacks[rule](s)
+
+ _action, new_state = get_action(rule.origin)
+ assert _action is Shift
+ state_stack.append(new_state)
+ value_stack.append(value)
+
+ # Main LALR-parser loop
+ try:
+ token = next(stream)
+ i += 1
+ while True:
+ action, arg = get_action(token.type)
+ assert arg != self.end_state
+
+ if action is Shift:
+ state_stack.append(arg)
+ value_stack.append(token)
+ if set_state: set_state(arg)
+ token = next(stream)
+ i += 1
+ else:
+ reduce(arg)
+ except StopIteration:
+ pass
+
+ while True:
+ _action, arg = get_action('$END')
+ if _action is Shift:
+ assert arg == self.end_state
+ val ,= value_stack
+ return val
+ else:
+ reduce(arg)
+
+
+
+class Rule(object):
+ """
+ origin : a symbol
+ expansion : a list of symbols
+ """
+ def __init__(self, origin, expansion, alias=None, options=None):
+ self.origin = origin
+ self.expansion = expansion
+ self.alias = alias
+ self.options = options
+
+ def __str__(self):
+ return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
+
+ def __repr__(self):
+ return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
+
+
+class RuleOptions:
+ def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
+ self.keep_all_tokens = keep_all_tokens
+ self.expand1 = expand1
+ self.create_token = create_token # used for scanless postprocessing
+ self.priority = priority
+
+ self.filter_out = filter_out # remove this rule from the tree
+ # used for "token"-rules in scanless
+
+ def __repr__(self):
+ return 'RuleOptions(%r, %r, %r, %r, %r)' % (
+ self.keep_all_tokens,
+ self.expand1,
+ self.create_token,
+ self.priority,
+ self.filter_out
+ )
+
+Shift = 0
+Reduce = 1
+import re
+MRES = (
+[('(?P<SIGNED_NUMBER>(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P<ESCAPED_STRING>\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P<WS>(?:[ \t\x0c'
+ '\r\n'
+ '])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])',
+ {1: 'SIGNED_NUMBER',
+ 2: 'ESCAPED_STRING',
+ 3: 'WS',
+ 4: '__FALSE1',
+ 5: '__NULL2',
+ 6: '__TRUE0',
+ 7: '__COLON',
+ 8: '__COMMA',
+ 9: '__LBRACE',
+ 10: '__LSQB',
+ 11: '__RBRACE',
+ 12: '__RSQB'})]
+)
+LEXER_CALLBACK = (
+{}
+)
+NEWLINE_TYPES = ['WS']
+IGNORE_TYPES = ['WS']
+class LexerRegexps: pass
+lexer_regexps = LexerRegexps()
+lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]
+lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])
+ for n, mres in LEXER_CALLBACK.items()}
+lexer = _Lex(lexer_regexps)
+def lex(stream):
+ return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)
+RULES = {
+ 0: Rule('start', ['value'], None, RuleOptions(False, True, None, None, False)),
+ 1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)),
+ 2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)),
+ 3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)),
+ 4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)),
+ 5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)),
+ 6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)),
+ 7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)),
+ 8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
+ 9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
+ 10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
+ 11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
+ 12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
+ 13: Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
+ 14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)),
+ 15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)),
+ 16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None),
+ 17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None),
+ 18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None),
+ 19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None),
+}
+parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)
+class ParseTable: pass
+parse_table = ParseTable()
+STATES = {
+ 0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)},
+ 1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)},
+ 2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)},
+ 3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)},
+ 4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)},
+ 5: {12: (0, 16)},
+ 6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)},
+ 7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)},
+ 8: {12: (1, 0)},
+ 9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)},
+ 10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)},
+ 11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)},
+ 12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)},
+ 13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)},
+ 14: {14: (0, 19), 13: (0, 20), 18: (0, 21)},
+ 15: {17: (0, 22)},
+ 16: {},
+ 17: {19: (0, 23), 15: (0, 24), 13: (0, 25)},
+ 18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)},
+ 19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)},
+ 20: {9: (0, 10), 11: (0, 15), 16: (0, 26)},
+ 21: {14: (0, 27), 13: (0, 28)},
+ 22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)},
+ 23: {15: (0, 30), 13: (0, 31)},
+ 24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)},
+ 25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)},
+ 26: {13: (1, 18), 14: (1, 18)},
+ 27: {13: (1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)},
+ 28: {16: (0, 33), 9: (0, 10), 11: (0, 15)},
+ 29: {13: (1, 14), 14: (1, 14)},
+ 30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)},
+ 31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)},
+ 32: {15: (1, 16), 13: (1, 16)},
+ 33: {13: (1, 19), 14: (1, 19)},
+ 34: {15: (1, 17), 13: (1, 17)},
+}
+TOKEN_TYPES = (
+{0: '__TRUE0',
+ 1: '__LBRACE',
+ 2: 'array',
+ 3: 'object',
+ 4: 'start',
+ 5: '__LSQB',
+ 6: 'SIGNED_NUMBER',
+ 7: 'value',
+ 8: '__NULL2',
+ 9: 'ESCAPED_STRING',
+ 10: '__FALSE1',
+ 11: 'string',
+ 12: '$END',
+ 13: '__COMMA',
+ 14: '__RBRACE',
+ 15: '__RSQB',
+ 16: 'pair',
+ 17: '__COLON',
+ 18: '__anon_star_1',
+ 19: '__anon_star_0'}
+)
+parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}
+ for s, acts in STATES.items()}
+parse_table.start_state = 0
+parse_table.end_state = 16
+class Lark_StandAlone:
+ def __init__(self, transformer=None, postlex=None):
+ callback = parse_tree_builder.create_callback(transformer=transformer)
+ callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}
+ self.parser = _Parser(parse_table, callbacks)
+ self.postlex = postlex
+ def parse(self, stream):
+ tokens = lex(stream)
+ if self.postlex: tokens = self.postlex.process(tokens)
+ return self.parser.parse(tokens)
diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py
new file mode 100644
index 0000000..47c1bb1
--- /dev/null
+++ b/examples/standalone/json_parser_main.py
@@ -0,0 +1,25 @@
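+# Example driver for the generated stand-alone parser (json_parser.py, produced
+# by create_standalone.sh). It reads a JSON file named on the command line and
+# prints the resulting Python object.
+#
+# Usage: python json_parser_main.py <file.json>
+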
+import sys
+
+from json_parser import Lark_StandAlone, Transformer, inline_args
+
+class TreeToJson(Transformer):
+ @inline_args
+ def string(self, s):
+ return s[1:-1].replace('\\"', '"')
+
+ array = list
+ pair = tuple
+ object = dict
+ number = inline_args(float)
+
+ null = lambda self, _: None
+ true = lambda self, _: True
+ false = lambda self, _: False
+
+
+parser = Lark_StandAlone(transformer=TreeToJson())
+
+if __name__ == '__main__':
+ with open(sys.argv[1]) as f:
+ print(parser.parse(f.read()))
+
diff --git a/lark/__init__.py b/lark/__init__.py
index 930fa01..1637a75 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError
from .lark import Lark
from .utils import inline_args
-__version__ = "0.5.1"
+__version__ = "0.5.2"
diff --git a/lark/common.py b/lark/common.py
index 55e9d28..f745018 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -1,16 +1,21 @@
import re
-import sre_parse
import sys
+from .utils import get_regexp_width
+
Py36 = (sys.version_info[:2] >= (3, 6))
+
+###{standalone
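+# Code between '###{standalone' and '###}' markers is embedded into parsers
+# generated by lark.tools.standalone (see examples/standalone/json_parser.py).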
+def is_terminal(sym):
+ return sym.isupper()
+
class GrammarError(Exception):
pass
class ParseError(Exception):
pass
-
class UnexpectedToken(ParseError):
def __init__(self, token, expected, seq, index):
self.token = token
@@ -31,9 +36,8 @@ class UnexpectedToken(ParseError):
super(UnexpectedToken, self).__init__(message)
+###}
-def is_terminal(sym):
- return isinstance(sym, Terminal) or sym.isupper() or sym == '$end'
class LexerConf:
@@ -44,7 +48,6 @@ class LexerConf:
class ParserConf:
def __init__(self, rules, callback, start):
- assert all(len(r) == 4 for r in rules)
self.rules = rules
self.callback = callback
self.start = start
@@ -93,10 +96,10 @@ class PatternRE(Pattern):
@property
def min_width(self):
- return sre_parse.parse(self.to_regexp()).getwidth()[0]
+ return get_regexp_width(self.to_regexp())[0]
@property
def max_width(self):
- return sre_parse.parse(self.to_regexp()).getwidth()[1]
+ return get_regexp_width(self.to_regexp())[1]
class TokenDef(object):
def __init__(self, name, pattern, priority=1):
@@ -108,27 +111,3 @@ class TokenDef(object):
def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
-
-class Terminal:
- def __init__(self, data):
- self.data = data
-
- def __repr__(self):
- return '%r' % self.data
-
- def __eq__(self, other):
- return isinstance(other, type(self)) and self.data == other.data
- def __hash__(self):
- return hash(self.data)
-
-
-class Terminal_Regexp(Terminal):
- def __init__(self, name, regexp):
- Terminal.__init__(self, regexp)
- self.name = name
- self.match = re.compile(regexp).match
-
-class Terminal_Token(Terminal):
- def match(self, other):
- return self.data == other.type
-
diff --git a/lark/grammar.py b/lark/grammar.py
new file mode 100644
index 0000000..d257bc4
--- /dev/null
+++ b/lark/grammar.py
@@ -0,0 +1,37 @@
+
+class Rule(object):
+ """
+ origin : a symbol
+ expansion : a list of symbols
+ """
+ def __init__(self, origin, expansion, alias=None, options=None):
+ self.origin = origin
+ self.expansion = expansion
+ self.alias = alias
+ self.options = options
+
+ def __str__(self):
+ return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
+
+ def __repr__(self):
+ return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
+
+
+class RuleOptions:
+ def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
+ self.keep_all_tokens = keep_all_tokens
+ self.expand1 = expand1
+ self.create_token = create_token # used for scanless postprocessing
+ self.priority = priority
+
+ self.filter_out = filter_out # remove this rule from the tree
+ # used for "token"-rules in scanless
+
+ def __repr__(self):
+ return 'RuleOptions(%r, %r, %r, %r, %r)' % (
+ self.keep_all_tokens,
+ self.expand1,
+ self.create_token,
+ self.priority,
+ self.filter_out
+ )
diff --git a/lark/grammars/common.g b/lark/grammars/common.g
index a54d49d..c38f485 100644
--- a/lark/grammars/common.g
+++ b/lark/grammars/common.g
@@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
+SIGNED_FLOAT: ["+"|"-"] FLOAT
NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER
diff --git a/lark/indenter.py b/lark/indenter.py
index a5f107d..34e61a0 100644
--- a/lark/indenter.py
+++ b/lark/indenter.py
@@ -2,6 +2,7 @@
from .lexer import Token
+###{standalone
class Indenter:
def __init__(self):
self.paren_level = 0
@@ -50,3 +51,5 @@ class Indenter:
@property
def always_accept(self):
return (self.NL_type,)
+
+###}
diff --git a/lark/lark.py b/lark/lark.py
index 8029638..fa564ed 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -169,13 +169,15 @@ class Lark:
def _build_parser(self):
self.parser_class = get_frontend(self.options.parser, self.options.lexer)
- self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
- rules, callback = self.parse_tree_builder.apply(self.options.transformer)
+
+ self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+ callback = self._parse_tree_builder.create_callback(self.options.transformer)
if self.profiler:
for f in dir(callback):
if not (f.startswith('__') and f.endswith('__')):
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
- parser_conf = ParserConf(rules, callback, self.options.start)
+
+ parser_conf = ParserConf(self.rules, callback, self.options.start)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
diff --git a/lark/lexer.py b/lark/lexer.py
index 2741af0..64cfb46 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -5,6 +5,7 @@ import re
from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef
+###{standalone
class LexError(Exception):
pass
@@ -48,27 +49,75 @@ class Token(Str):
__hash__ = Str.__hash__
-class Regex:
- def __init__(self, pattern, flags=()):
- self.pattern = pattern
- self.flags = flags
-def _regexp_has_newline(r):
- return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
+class LineCounter:
+ def __init__(self):
+ self.newline_char = '\n'
+ self.char_pos = 0
+ self.line = 1
+ self.column = 0
+ self.line_start_pos = 0
+
+ def feed(self, token, test_newline=True):
+ """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
+ """
+ if test_newline:
+ newlines = token.count(self.newline_char)
+ if newlines:
+ self.line += newlines
+ self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+ self.char_pos += len(token)
+ self.column = self.char_pos - self.line_start_pos
+
+class _Lex:
+ "Built to serve both Lexer and ContextualLexer"
+ def __init__(self, lexer):
+ self.lexer = lexer
+
+ def lex(self, stream, newline_types, ignore_types):
+ newline_types = list(newline_types)
+ ignore_types = list(ignore_types)
+ line_ctr = LineCounter()
-def _create_unless_callback(strs):
- mres = build_mres(strs, match_whole=True)
- def unless_callback(t):
- # if t in strs:
- # t.type = strs[t]
- for mre, type_from_index in mres:
+ while True:
+ lexer = self.lexer
+ for mre, type_from_index in lexer.mres:
+ m = mre.match(stream, line_ctr.char_pos)
+ if m:
+ value = m.group(0)
+ type_ = type_from_index[m.lastindex]
+ if type_ not in ignore_types:
+ t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+ if t.type in lexer.callback:
+ t = lexer.callback[t.type](t)
+ yield t
+
+ line_ctr.feed(value, type_ in newline_types)
+ break
+ else:
+ if line_ctr.char_pos < len(stream):
+ raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+ break
+
+class UnlessCallback:
+ def __init__(self, mres):
+ self.mres = mres
+
+ def __call__(self, t):
+ for mre, type_from_index in self.mres:
m = mre.match(t.value)
if m:
value = m.group(0)
t.type = type_from_index[m.lastindex]
break
return t
- return unless_callback
+
+###}
+
+
def _create_unless(tokens):
tokens_by_type = classify(tokens, lambda t: type(t.pattern))
@@ -85,7 +134,7 @@ def _create_unless(tokens):
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
- callback[retok.name] = _create_unless_callback(unless)
+ callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))
tokens = [t for t in tokens if t not in embedded_strs]
return tokens, callback
@@ -110,13 +159,13 @@ def _build_mres(tokens, max_size, match_whole):
def build_mres(tokens, match_whole=False):
return _build_mres(tokens, len(tokens), match_whole)
+def _regexp_has_newline(r):
+ return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
-class Lexer(object):
+class Lexer:
def __init__(self, tokens, ignore=()):
assert all(isinstance(t, TokenDef) for t in tokens), tokens
- self.ignore = ignore
- self.newline_char = '\n'
tokens = list(tokens)
# Sanitization
@@ -129,14 +178,11 @@ class Lexer(object):
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
- token_names = {t.name for t in tokens}
- for t in ignore:
- if t not in token_names:
- raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
+ assert set(ignore) <= {t.name for t in tokens}
# Init
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
- self.ignore_types = [t for t in ignore]
+ self.ignore_types = list(ignore)
tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
@@ -147,46 +193,8 @@ class Lexer(object):
self.mres = build_mres(tokens)
-
def lex(self, stream):
- lex_pos = 0
- line = 1
- col_start_pos = 0
- newline_types = list(self.newline_types)
- ignore_types = list(self.ignore_types)
- while True:
- for mre, type_from_index in self.mres:
- m = mre.match(stream, lex_pos)
- if m:
- value = m.group(0)
- type_ = type_from_index[m.lastindex]
- to_yield = type_ not in ignore_types
-
- if to_yield:
- t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
- end_col = t.column + len(value)
- if t.type in self.callback:
- t = self.callback[t.type](t)
-
- if type_ in newline_types:
- newlines = value.count(self.newline_char)
- if newlines:
- line += newlines
- last_newline_index = value.rindex(self.newline_char) + 1
- col_start_pos = lex_pos + last_newline_index
- end_col = len(value) - last_newline_index
-
- if to_yield:
- t.end_line = line
- t.end_col = end_col
- yield t
-
- lex_pos += len(value)
- break
- else:
- if lex_pos < len(stream):
- raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
- break
+ return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
class ContextualLexer:
@@ -204,7 +212,7 @@ class ContextualLexer:
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) | set(ignore) | set(always_accept)
- state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
+ state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END']
lexer = Lexer(state_tokens, ignore=ignore)
lexer_by_tokens[key] = lexer
@@ -218,33 +226,9 @@ class ContextualLexer:
self.parser_state = state
def lex(self, stream):
- lex_pos = 0
- line = 1
- col_start_pos = 0
- newline_types = list(self.root_lexer.newline_types)
- ignore_types = list(self.root_lexer.ignore_types)
- while True:
- lexer = self.lexers[self.parser_state]
- for mre, type_from_index in lexer.mres:
- m = mre.match(stream, lex_pos)
- if m:
- value = m.group(0)
- type_ = type_from_index[m.lastindex]
- if type_ not in ignore_types:
- t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
- if t.type in lexer.callback:
- t = lexer.callback[t.type](t)
- yield t
+ l = _Lex(self.lexers[self.parser_state])
+ for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
+ yield x
+ l.lexer = self.lexers[self.parser_state]
- if type_ in newline_types:
- newlines = value.count(lexer.newline_char)
- if newlines:
- line += newlines
- col_start_pos = lex_pos + value.rindex(lexer.newline_char)
- lex_pos += len(value)
- break
- else:
- if lex_pos < len(stream):
- raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
- break
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 72e2e22..2d01277 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
+from .grammar import RuleOptions, Rule
from .tree import Tree as T, Transformer, InlineTransformer, Visitor
@@ -127,7 +128,7 @@ RULES = {
class EBNF_to_BNF(InlineTransformer):
def __init__(self):
- self.new_rules = {}
+ self.new_rules = []
self.rules_by_expr = {}
self.prefix = 'anon'
self.i = 0
@@ -140,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer):
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
self.i += 1
t = Token('RULE', new_name, -1)
- self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
+ tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+ self.new_rules.append((new_name, tree, self.rule_options))
self.rules_by_expr[expr] = t
return t
@@ -174,7 +176,6 @@ class SimplifyRule_Visitor(Visitor):
break
tree.expand_kids_by_index(*to_expand)
-
def expansion(self, tree):
# rules_list unpacking
# a : b (c|d) e
@@ -194,7 +195,7 @@ class SimplifyRule_Visitor(Visitor):
tree.data = 'expansions'
tree.children = [self.visit(T('expansion', [option if i==j else other
for j, other in enumerate(tree.children)]))
- for option in child.children]
+ for option in set(child.children)]
break
else:
break
@@ -208,7 +209,10 @@ class SimplifyRule_Visitor(Visitor):
tree.data = 'expansions'
tree.children = aliases
- expansions = _flatten
+ def expansions(self, tree):
+ self._flatten(tree)
+ tree.children = list(set(tree.children))
+
class RuleTreeToText(Transformer):
def expansions(self, x):
@@ -389,12 +393,6 @@ def _interleave(l, item):
def _choice_of_rules(rules):
return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
-def dict_update_safe(d1, d2):
- for k, v in d2.items():
- assert k not in d1
- d1[k] = v
-
-
class Grammar:
def __init__(self, rule_defs, token_defs, ignore):
self.token_defs = token_defs
@@ -411,6 +409,7 @@ class Grammar:
terms_to_ignore = {name:'__'+name for name in self.ignore}
if terms_to_ignore:
assert set(terms_to_ignore) <= {name for name, _t in term_defs}
+
term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
expr = Token('RULE', '__ignore')
for r, tree, _o in rule_defs:
@@ -466,57 +465,41 @@ class Grammar:
# =================
# Compile Rules
# =================
- ebnf_to_bnf = EBNF_to_BNF()
- simplify_rule = SimplifyRule_Visitor()
+ # 1. Pre-process terminals
transformer = PrepareLiterals()
if not lexer:
transformer *= SplitLiterals()
transformer *= ExtractAnonTokens(tokens) # Adds to tokens
- rules = {}
+ # 2. Convert EBNF to BNF (and apply step 1)
+ ebnf_to_bnf = EBNF_to_BNF()
+ rules = []
for name, rule_tree, options in rule_defs:
- assert name not in rules, name
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
tree = transformer.transform(rule_tree)
- rules[name] = ebnf_to_bnf.transform(tree), options
+ rules.append((name, ebnf_to_bnf.transform(tree), options))
+ rules += ebnf_to_bnf.new_rules
- dict_update_safe(rules, ebnf_to_bnf.new_rules)
-
- for tree, _o in rules.values():
- simplify_rule.visit(tree)
+ assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
+ # 3. Compile tree to Rule objects
rule_tree_to_text = RuleTreeToText()
- rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()}
-
- return tokens, rules, self.ignore
+ simplify_rule = SimplifyRule_Visitor()
+ compiled_rules = []
+ for name, tree, options in rules:
+ simplify_rule.visit(tree)
+ expansions = rule_tree_to_text.transform(tree)
+ for expansion, alias in expansions:
+ if alias and name.startswith('_'):
+ raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
-class RuleOptions:
- def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
- self.keep_all_tokens = keep_all_tokens
- self.expand1 = expand1
- self.create_token = create_token # used for scanless postprocessing
- self.priority = priority
-
- self.filter_out = filter_out # remove this rule from the tree
- # used for "token"-rules in scanless
- @classmethod
- def from_rule(cls, name, *x):
- if len(x) > 1:
- priority, expansions = x
- priority = int(priority)
- else:
- expansions ,= x
- priority = None
-
- keep_all_tokens = name.startswith('!')
- name = name.lstrip('!')
- expand1 = name.startswith('?')
- name = name.lstrip('?')
+ rule = Rule(name, expansion, alias, options)
+ compiled_rules.append(rule)
- return name, expansions, cls(keep_all_tokens, expand1, priority=priority)
+ return tokens, compiled_rules, self.ignore
@@ -553,15 +536,30 @@ def resolve_token_references(token_defs):
if not changed:
break
+def options_from_rule(name, *x):
+ if len(x) > 1:
+ priority, expansions = x
+ priority = int(priority)
+ else:
+ expansions ,= x
+ priority = None
+
+ keep_all_tokens = name.startswith('!')
+ name = name.lstrip('!')
+ expand1 = name.startswith('?')
+ name = name.lstrip('?')
+
+ return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)
class GrammarLoader:
def __init__(self):
tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]
- rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()]
- d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
- rules, callback = ParseTreeBuilder(d, T).apply()
+ rules = [options_from_rule(name, x) for name, x in RULES.items()]
+ rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
+ callback = ParseTreeBuilder(rules, T).create_callback()
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
+
parser_conf = ParserConf(rules, callback, 'start')
self.parser = LALR(lexer_conf, parser_conf)
@@ -636,7 +634,6 @@ class GrammarLoader:
ignore_names.append(name)
token_defs.append((name, (t, 0)))
-
# Verify correctness 2
token_names = set()
for name, _ in token_defs:
@@ -644,10 +641,13 @@ class GrammarLoader:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)
+ if set(ignore_names) > token_names:
+ raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
+
# Resolve token references
resolve_token_references(token_defs)
- rules = [RuleOptions.from_rule(*x) for x in rule_defs]
+ rules = [options_from_rule(*x) for x in rule_defs]
rule_names = set()
for name, _x, _o in rules:
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 975121d..7e52125 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -1,6 +1,9 @@
from .common import is_terminal, GrammarError
from .utils import suppress
from .lexer import Token
+from .grammar import Rule
+
+###{standalone
class NodeBuilder:
def __init__(self, tree_class, name):
@@ -27,7 +30,7 @@ class Factory:
def __call__(self, node_builder):
return self.cls(node_builder, *self.args)
-
+
class TokenWrapper:
"Used for fixing the results of scanless parsing"
@@ -106,51 +109,53 @@ class ParseTreeBuilder:
self.rule_builders = list(self._init_builders(rules))
+ self.user_aliases = {}
+
def _init_builders(self, rules):
filter_out = set()
- for origin, (expansions, options) in rules.items():
- if options and options.filter_out:
- assert origin.startswith('_') # Just to make sure
- filter_out.add(origin)
+ for rule in rules:
+ if rule.options and rule.options.filter_out:
+ assert rule.origin.startswith('_') # Just to make sure
+ filter_out.add(rule.origin)
- for origin, (expansions, options) in rules.items():
+ for rule in rules:
+ options = rule.options
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
expand1 = options.expand1 if options else False
create_token = options.create_token if options else False
- for expansion, alias in expansions:
- if alias and origin.startswith('_'):
- raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
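+            # Compose the callback wrappers for this rule: Expand1 collapses
+            # single-child trees (for '?rules' without an alias), TokenWrapper
+            # re-joins scanless matches into a Token, the child filter drops
+            # filtered-out/underscore symbols, and PropagatePositions copies
+            # line/column info onto the resulting tree.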
+ wrapper_chain = filter(None, [
+ (expand1 and not rule.alias) and Expand1,
+ create_token and Factory(TokenWrapper, create_token),
+ create_rule_handler(rule.expansion, keep_all_tokens, filter_out),
+ self.propagate_positions and PropagatePositions,
+ ])
- wrapper_chain = filter(None, [
- (expand1 and not alias) and Expand1,
- create_token and Factory(TokenWrapper, create_token),
- create_rule_handler(expansion, keep_all_tokens, filter_out),
- self.propagate_positions and PropagatePositions,
- ])
+ yield rule, wrapper_chain
- yield origin, expansion, options, alias or origin, wrapper_chain
-
- def apply(self, transformer=None):
+ def create_callback(self, transformer=None):
callback = Callback()
- new_rules = []
- for origin, expansion, options, alias, wrapper_chain in self.rule_builders:
- callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion))
+ for rule, wrapper_chain in self.rule_builders:
+ internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion))
+ user_callback_name = rule.alias or rule.origin
try:
- f = transformer._get_func(alias)
+ f = transformer._get_func(user_callback_name)
except AttributeError:
- f = NodeBuilder(self.tree_class, alias)
+ f = NodeBuilder(self.tree_class, user_callback_name)
+
+ self.user_aliases[rule] = rule.alias
+ rule.alias = internal_callback_name
for w in wrapper_chain:
f = w(f)
- if hasattr(callback, callback_name):
- raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
- setattr(callback, callback_name, f)
+ if hasattr(callback, internal_callback_name):
+ raise GrammarError("Rule '%s' already exists" % (rule,))
+ setattr(callback, internal_callback_name, f)
- new_rules.append(( origin, expansion, callback_name, options ))
+ return callback
- return new_rules, callback
+###}
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 37c6dd0..bc87921 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,5 +1,5 @@
import re
-import sre_parse
+from .utils import get_regexp_width
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import Lexer, ContextualLexer, Token
@@ -9,10 +9,16 @@ from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .tree import Tree
class WithLexer:
- def __init__(self, lexer_conf):
+ def init_traditional_lexer(self, lexer_conf):
self.lexer_conf = lexer_conf
self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)
+ def init_contextual_lexer(self, lexer_conf, parser_conf):
+ self.lexer_conf = lexer_conf
+ d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
+ always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
+ self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)
+
def lex(self, text):
stream = self.lexer.lex(text)
if self.lexer_conf.postlex:
@@ -23,32 +29,22 @@ class WithLexer:
class LALR(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
- WithLexer.__init__(self, lexer_conf)
-
- self.parser_conf = parser_conf
self.parser = lalr_parser.Parser(parser_conf)
+ self.init_traditional_lexer(lexer_conf)
def parse(self, text):
- tokens = self.lex(text)
- return self.parser.parse(tokens)
+ token_stream = self.lex(text)
+ return self.parser.parse(token_stream)
-class LALR_ContextualLexer:
+class LALR_ContextualLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
- self.lexer_conf = lexer_conf
- self.parser_conf = parser_conf
-
self.parser = lalr_parser.Parser(parser_conf)
-
- d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()}
- always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
- self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)
+ self.init_contextual_lexer(lexer_conf, parser_conf)
def parse(self, text):
- tokens = self.lexer.lex(text)
- if self.lexer_conf.postlex:
- tokens = self.lexer_conf.postlex.process(tokens)
- return self.parser.parse(tokens, self.lexer.set_parser_state)
+ token_stream = self.lex(text)
+ return self.parser.parse(token_stream, self.lexer.set_parser_state)
def get_ambiguity_resolver(options):
if not options or options.ambiguity == 'resolve':
@@ -60,55 +56,47 @@ def get_ambiguity_resolver(options):
raise ValueError(options)
def tokenize_text(text):
- new_text = []
line = 1
col_start_pos = 0
for i, ch in enumerate(text):
if '\n' in ch:
line += ch.count('\n')
col_start_pos = i + ch.rindex('\n')
- new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
- return new_text
+ yield Token('CHAR', ch, line=line, column=i - col_start_pos)
class Earley_NoLex:
def __init__(self, lexer_conf, parser_conf, options=None):
- self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+ self._prepare_match(lexer_conf)
- rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
-
- self.parser = earley.Parser(rules,
- parser_conf.start,
- parser_conf.callback,
+ self.parser = earley.Parser(parser_conf, self.match,
resolve_ambiguity=get_ambiguity_resolver(options))
- def _prepare_expansion(self, expansion):
- for sym in expansion:
- if is_terminal(sym):
- regexp = self.token_by_name[sym].pattern.to_regexp()
- width = sre_parse.parse(regexp).getwidth()
- if width != (1,1):
- raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
- yield Terminal_Regexp(sym, regexp)
- else:
- yield sym
+
+ def match(self, term, text, index=0):
+ return self.regexps[term].match(text, index)
+
+ def _prepare_match(self, lexer_conf):
+ self.regexps = {}
+ for t in lexer_conf.tokens:
+ regexp = t.pattern.to_regexp()
+ width = get_regexp_width(regexp)
+ if width != (1,1):
+                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width))
+ self.regexps[t.name] = re.compile(regexp)
def parse(self, text):
- new_text = tokenize_text(text)
- return self.parser.parse(new_text)
+ token_stream = tokenize_text(text)
+ return self.parser.parse(token_stream)
class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
- WithLexer.__init__(self, lexer_conf)
-
- rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]
+ self.init_traditional_lexer(lexer_conf)
- self.parser = earley.Parser(rules,
- parser_conf.start,
- parser_conf.callback,
+ self.parser = earley.Parser(parser_conf, self.match,
resolve_ambiguity=get_ambiguity_resolver(options))
- def _prepare_expansion(self, expansion):
- return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
+ def match(self, term, token):
+ return term == token.type
def parse(self, text):
tokens = self.lex(text)
@@ -119,27 +107,31 @@ class XEarley:
def __init__(self, lexer_conf, parser_conf, options=None):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
- rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
+ self._prepare_match(lexer_conf)
- ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
-
- self.parser = xearley.Parser(rules,
- parser_conf.start,
- parser_conf.callback,
+ self.parser = xearley.Parser(parser_conf,
+ self.match,
resolve_ambiguity=get_ambiguity_resolver(options),
- ignore=ignore,
+ ignore=lexer_conf.ignore,
predict_all=options.earley__predict_all
)
- def _prepare_expansion(self, expansion):
- for sym in expansion:
- if is_terminal(sym):
- regexp = self.token_by_name[sym].pattern.to_regexp()
- width = sre_parse.parse(regexp).getwidth()
- assert width
- yield Terminal_Regexp(sym, regexp)
+ def match(self, term, text, index=0):
+ return self.regexps[term].match(text, index)
+
+ def _prepare_match(self, lexer_conf):
+ self.regexps = {}
+ for t in lexer_conf.tokens:
+ regexp = t.pattern.to_regexp()
+ try:
+ width = get_regexp_width(regexp)[0]
+ except ValueError:
+ raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
else:
- yield sym
+ if width == 0:
+ raise ValueError("Dynamic Earley doesn't allow zero-width regexps")
+
+ self.regexps[t.name] = re.compile(regexp)
def parse(self, text):
return self.parser.parse(text)
diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index dbe6834..3586c22 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -13,14 +13,11 @@
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
-from ..common import ParseError, UnexpectedToken, Terminal
+from ..common import ParseError, UnexpectedToken, is_terminal
from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
from .grammar_analysis import GrammarAnalyzer
-class EndToken:
- type = '$end'
-
class Derivation(Tree):
_hash = None
@@ -35,8 +32,6 @@ class Derivation(Tree):
self._hash = Tree.__hash__(self)
return self._hash
-END_TOKEN = EndToken()
-
class Item(object):
"An Earley Item, the atom of the algorithm."
@@ -59,11 +54,8 @@ class Item(object):
new_tree = Derivation(self.rule, self.tree.children + [tree])
return self.__class__(self.rule, self.ptr+1, self.start, new_tree)
- def similar(self, other):
- return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
-
def __eq__(self, other):
- return self.similar(other) #and (self.tree == other.tree)
+ return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
def __hash__(self):
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__
@@ -134,7 +126,7 @@ class Column:
self.completed[item_key] = item
self.to_reduce.append(item)
else:
- if isinstance(item.expect, Terminal):
+ if is_terminal(item.expect):
self.to_scan.append(item)
else:
k = item_key if self.predict_all else item
@@ -151,31 +143,30 @@ class Column:
__nonzero__ = __bool__ # Py2 backwards-compatibility
class Parser:
- def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None):
- self.analysis = GrammarAnalyzer(rules, start_symbol)
- self.start_symbol = start_symbol
+ def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
+ self.analysis = GrammarAnalyzer(parser_conf)
+ self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
+ self.FIRST = self.analysis.FIRST
self.postprocess = {}
self.predictions = {}
- self.FIRST = {}
- for rule in self.analysis.rules:
- if rule.origin != '$root': # XXX kinda ugly
- a = rule.alias
- self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
- self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+ for rule in parser_conf.rules:
+ self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
+ self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
- self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+ self.term_matcher = term_matcher
def parse(self, stream, start_symbol=None):
# Define parser functions
- start_symbol = start_symbol or self.start_symbol
+ start_symbol = start_symbol or self.parser_conf.start
_Item = Item
+ match = self.term_matcher
def predict(nonterm, column):
- assert not isinstance(nonterm, Terminal), nonterm
+ assert not is_terminal(nonterm), nonterm
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
def complete(item):
@@ -195,14 +186,13 @@ class Parser:
for item in to_reduce:
new_items = list(complete(item))
- for new_item in new_items:
- if new_item.similar(item):
- raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule)
+ if item in new_items:
+ raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items)
def scan(i, token, column):
next_set = Column(i, self.FIRST)
- next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token))
+ next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))
if not next_set:
expect = {i.expect for i in column.to_scan}
@@ -249,24 +239,3 @@ class ApplyCallbacks(Transformer_NoRecurse):
return callback(children)
else:
return Tree(rule.origin, children)
-
-# RULES = [
-# ('a', ['d']),
-# ('d', ['b']),
-# ('b', ['C']),
-# ('b', ['b', 'C']),
-# ('b', ['C', 'b']),
-# ]
-# p = Parser(RULES, 'a')
-# for x in p.parse('CC'):
-# print x.pretty()
-
-#---------------
-# RULES = [
-# ('s', ['a', 'a']),
-# ('a', ['b', 'b']),
-# ('b', ['C'], lambda (x,): x),
-# ('b', ['b', 'C']),
-# ]
-# p = Parser(RULES, 's', {})
-# print p.parse('CCCCC').pretty()
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 9250c47..a8c7757 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -1,20 +1,8 @@
from ..utils import bfs, fzset
from ..common import GrammarError, is_terminal
+from ..grammar import Rule
-class Rule(object):
- """
- origin : a symbol
- expansion : a list of symbols
- """
- def __init__(self, origin, expansion, alias=None, options=None):
- self.origin = origin
- self.expansion = expansion
- self.alias = alias
- self.options = options
-
- def __repr__(self):
- return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
class RulePtr(object):
def __init__(self, rule, index):
@@ -106,28 +94,30 @@ def calculate_sets(rules):
class GrammarAnalyzer(object):
- def __init__(self, rule_tuples, start_symbol, debug=False):
- self.start_symbol = start_symbol
+ def __init__(self, parser_conf, debug=False):
+ rules = parser_conf.rules
+ assert len(rules) == len(set(rules))
+
+ self.start_symbol = parser_conf.start
self.debug = debug
- rule_tuples = list(rule_tuples)
- rule_tuples.append(('$root', [start_symbol, '$end']))
- rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples]
-
- self.rules = set()
- self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples}
- for origin, exp, alias, options in rule_tuples:
- r = Rule( origin, exp, alias, options )
- self.rules.add(r)
- self.rules_by_origin[origin].append(r)
-
- for r in self.rules:
+
+ root_rule = Rule('$root', [self.start_symbol, '$END'])
+
+ self.rules_by_origin = {r.origin: [] for r in rules}
+ for r in rules:
+ self.rules_by_origin[r.origin].append(r)
+
+ self.rules_by_origin[root_rule.origin] = [root_rule]
+
+ for r in rules:
for sym in r.expansion:
if not (is_terminal(sym) or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym)
- self.init_state = self.expand_rule('$root')
+ self.start_state = self.expand_rule('$root')
+ self.rules = rules
- self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)
+ self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule])
def expand_rule(self, rule):
"Returns all init_ptrs accessible by rule (recursive)"
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index e763b08..6eb3fdf 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -14,7 +14,43 @@ from ..common import GrammarError, is_terminal
from .grammar_analysis import GrammarAnalyzer
-ACTION_SHIFT = 0
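+# Shift and Reduce are singleton Action objects that replace the old integer/string
+# action codes; the parsers compare them by identity (`action is Shift`).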
+class Action:
+ def __init__(self, name):
+ self.name = name
+ def __str__(self):
+ return self.name
+ def __repr__(self):
+ return str(self)
+
+Shift = Action('Shift')
+Reduce = Action('Reduce')
+
+class ParseTable:
+ def __init__(self, states, start_state, end_state):
+ self.states = states
+ self.start_state = start_state
+ self.end_state = end_state
+
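+# Maps the frozenset-based states to plain integers, producing a compact table that
+# is easy to print or serialize (this is what the stand-alone generator emits).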
+class IntParseTable(ParseTable):
+
+ @classmethod
+ def from_ParseTable(cls, parse_table):
+ enum = list(parse_table.states)
+ state_to_idx = {s:i for i,s in enumerate(enum)}
+ int_states = {}
+
+ for s, la in parse_table.states.items():
+ la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
+ for k,v in la.items()}
+ int_states[ state_to_idx[s] ] = la
+
+
+ start_state = state_to_idx[parse_table.start_state]
+ end_state = state_to_idx[parse_table.end_state]
+ return cls(int_states, start_state, end_state)
+
+
+
class LALR_Analyzer(GrammarAnalyzer):
@@ -27,7 +63,7 @@ class LALR_Analyzer(GrammarAnalyzer):
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
for rp in sat:
for term in self.FOLLOW.get(rp.rule.origin, ()):
- lookahead[term].append(('reduce', rp.rule))
+ lookahead[term].append((Reduce, rp.rule))
d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items():
@@ -38,8 +74,8 @@ class LALR_Analyzer(GrammarAnalyzer):
rps |= self.expand_rule(rp.next)
new_state = fzset(rps)
- lookahead[sym].append(('shift', new_state))
- if sym == '$end':
+ lookahead[sym].append((Shift, new_state))
+ if sym == '$END':
self.end_states.append( new_state )
yield fzset(rps)
@@ -50,7 +86,7 @@ class LALR_Analyzer(GrammarAnalyzer):
for x in v:
# XXX resolving shift/reduce into shift, like PLY
# Give a proper warning
- if x[0] == 'shift':
+ if x[0] is Shift:
lookahead[k] = [x]
for k, v in lookahead.items():
@@ -59,22 +95,15 @@ class LALR_Analyzer(GrammarAnalyzer):
self.states[state] = {k:v[0] for k, v in lookahead.items()}
- for _ in bfs([self.init_state], step):
+ for _ in bfs([self.start_state], step):
pass
self.end_state ,= self.end_states
- # --
- self.enum = list(self.states)
- self.enum_rev = {s:i for i,s in enumerate(self.enum)}
- self.states_idx = {}
-
- for s, la in self.states.items():
- la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift'
- else (v[0], (v[1], len(v[1].expansion))) # Reduce
- for k,v in la.items()}
- self.states_idx[ self.enum_rev[s] ] = la
+ self._parse_table = ParseTable(self.states, self.start_state, self.end_state)
+ if self.debug:
+ self.parse_table = self._parse_table
+ else:
+ self.parse_table = IntParseTable.from_ParseTable(self._parse_table)
- self.init_state_idx = self.enum_rev[self.init_state]
- self.end_state_idx = self.enum_rev[self.end_state]
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index f224bec..eafc4ea 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -3,30 +3,30 @@
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
-from ..common import ParseError, UnexpectedToken
+from ..common import UnexpectedToken
-from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT
-
-class FinalReduce:
- def __init__(self, value):
- self.value = value
+from .lalr_analysis import LALR_Analyzer, Shift
class Parser:
def __init__(self, parser_conf):
- assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
- self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
+ assert all(r.options is None or r.options.priority is None
+ for r in parser_conf.rules), "LALR doesn't yet support prioritization"
+ self.analysis = analysis = LALR_Analyzer(parser_conf)
analysis.compute_lookahead()
callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
for rule in analysis.rules}
- self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks)
+ self.parser_conf = parser_conf
+ self.parser = _Parser(analysis.parse_table, callbacks)
self.parse = self.parser.parse
+###{standalone
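+# Everything between `###{standalone` and `###}` is copied verbatim into the parser
+# generated by lark/tools/standalone.py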
+
class _Parser:
- def __init__(self, states, init_state, end_state, callbacks):
- self.states = states
- self.init_state = init_state
- self.end_state = end_state
+ def __init__(self, parse_table, callbacks):
+ self.states = parse_table.states
+ self.start_state = parse_table.start_state
+ self.end_state = parse_table.end_state
self.callbacks = callbacks
def parse(self, seq, set_state=None):
@@ -35,10 +35,10 @@ class _Parser:
stream = iter(seq)
states = self.states
- state_stack = [self.init_state]
+ state_stack = [self.start_state]
value_stack = []
- if set_state: set_state(self.init_state)
+ if set_state: set_state(self.start_state)
def get_action(key):
state = state_stack[-1]
@@ -49,7 +49,8 @@ class _Parser:
raise UnexpectedToken(token, expected, seq, i)
- def reduce(rule, size):
+ def reduce(rule):
+ size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
@@ -60,7 +61,7 @@ class _Parser:
value = self.callbacks[rule](s)
_action, new_state = get_action(rule.origin)
- assert _action == ACTION_SHIFT
+ assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)
@@ -72,22 +73,24 @@ class _Parser:
action, arg = get_action(token.type)
assert arg != self.end_state
- if action == ACTION_SHIFT:
+ if action is Shift:
state_stack.append(arg)
value_stack.append(token)
if set_state: set_state(arg)
token = next(stream)
i += 1
else:
- reduce(*arg)
+ reduce(arg)
except StopIteration:
pass
while True:
- _action, arg = get_action('$end')
- if _action == ACTION_SHIFT:
+ _action, arg = get_action('$END')
+ if _action is Shift:
assert arg == self.end_state
val ,= value_stack
return val
else:
- reduce(*arg)
+ reduce(arg)
+
+###}
diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py
index 9b26190..420c469 100644
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -20,7 +20,7 @@
from collections import defaultdict
-from ..common import ParseError, UnexpectedToken, Terminal
+from ..common import ParseError, UnexpectedToken, is_terminal
from ..lexer import Token, UnexpectedInput
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
@@ -28,37 +28,34 @@ from .grammar_analysis import GrammarAnalyzer
from .earley import ApplyCallbacks, Item, Column
class Parser:
- def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
- self.analysis = GrammarAnalyzer(rules, start_symbol)
- self.start_symbol = start_symbol
+ def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False):
+ self.analysis = GrammarAnalyzer(parser_conf)
+ self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.ignore = list(ignore)
self.predict_all = predict_all
-
+ self.FIRST = self.analysis.FIRST
self.postprocess = {}
self.predictions = {}
- self.FIRST = {}
-
- for rule in self.analysis.rules:
- if rule.origin != '$root': # XXX kinda ugly
- a = rule.alias
- self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
- self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+ for rule in parser_conf.rules:
+ self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
+ self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
- self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+ self.term_matcher = term_matcher
def parse(self, stream, start_symbol=None):
# Define parser functions
- start_symbol = start_symbol or self.start_symbol
+ start_symbol = start_symbol or self.parser_conf.start
delayed_matches = defaultdict(list)
+ match = self.term_matcher
text_line = 1
text_column = 0
def predict(nonterm, column):
- assert not isinstance(nonterm, Terminal), nonterm
+ assert not is_terminal(nonterm), nonterm
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
def complete(item):
@@ -77,16 +74,15 @@ class Parser:
column.add( predict(nonterm, column) )
for item in to_reduce:
new_items = list(complete(item))
- for new_item in new_items:
- if new_item.similar(item):
- raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule)
+ if item in new_items:
+ raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items)
def scan(i, token, column):
to_scan = column.to_scan
for x in self.ignore:
- m = x.match(stream, i)
+ m = match(x, stream, i)
if m:
delayed_matches[m.end()] += set(to_scan)
delayed_matches[m.end()] += set(column.to_reduce)
@@ -99,16 +95,16 @@ class Parser:
# delayed_matches[m.end()] += to_scan
for item in to_scan:
- m = item.expect.match(stream, i)
+ m = match(item.expect, stream, i)
if m:
- t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+ t = Token(item.expect, m.group(0), i, text_line, text_column)
delayed_matches[m.end()].append(item.advance(t))
s = m.group(0)
for j in range(1, len(s)):
- m = item.expect.match(s[:-j])
+ m = match(item.expect, s[:-j])
if m:
- t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+ t = Token(item.expect, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append(item.advance(t))
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
@@ -131,7 +127,7 @@ class Parser:
if token == '\n':
text_line += 1
- text_column = 1
+ text_column = 0
else:
text_column += 1
@@ -143,7 +139,7 @@ class Parser:
if n.rule.origin==start_symbol and n.start is column0]
if not solutions:
- expected_tokens = [t.expect.name for t in column.to_scan]
+ expected_tokens = [t.expect for t in column.to_scan]
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
elif len(solutions) == 1:
diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py
new file mode 100644
index 0000000..0444614
--- /dev/null
+++ b/lark/tools/standalone.py
@@ -0,0 +1,203 @@
+###{standalone
+#
+#
+# Lark Stand-alone Generator Tool
+# ----------------------------------
+# Generates a stand-alone LALR(1) parser with a standard lexer
+#
+# Git: https://github.com/erezsh/lark
+# Author: Erez Shinan (erezshin@gmail.com)
+#
+#
+# >>> LICENSE
+#
+# This tool and its generated code use a separate license from Lark.
+#
+# It is licensed under GPLv2 or above.
+#
+# If you wish to purchase a commercial license for this tool and its
+# generated code, contact me via email.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# See <http://www.gnu.org/licenses/>.
+#
+#
+###}
+
+import codecs
+import sys
+import os
+from pprint import pprint
+from os import path
+from collections import defaultdict
+
+import lark
+from lark import Lark
+from lark.parsers.lalr_analysis import Shift, Reduce
+
+from ..grammar import Rule
+
+__dir__ = path.dirname(__file__)
+__larkdir__ = path.join(__dir__, path.pardir)
+
+
+EXTRACT_STANDALONE_FILES = [
+ 'tools/standalone.py',
+ 'utils.py',
+ 'common.py',
+ 'tree.py',
+ 'indenter.py',
+ 'lexer.py',
+ 'parse_tree_builder.py',
+ 'parsers/lalr_parser.py',
+]
+
+
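+# Collects the code delimited by `###{<name>` ... `###}` markers from a source file,
+# keyed by section name.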
+def extract_sections(lines):
+ section = None
+ text = []
+ sections = defaultdict(list)
+ for l in lines:
+ if l.startswith('###'):
+ if l[3] == '{':
+ section = l[4:].strip()
+ elif l[3] == '}':
+ sections[section] += text
+ section = None
+ text = []
+ else:
+ raise ValueError(l)
+ elif section:
+ text.append(l)
+
+ return {name:''.join(text) for name, text in sections.items()}
+
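+# Captures the compiled lexer (regexps, callbacks, newline/ignore types) as printable
+# Python literals and emits the code that rebuilds it in the generated module.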
+class LexerAtoms:
+ def __init__(self, lexer):
+ self.mres = [(p.pattern,d) for p,d in lexer.mres]
+ self.newline_types = lexer.newline_types
+ self.ignore_types = lexer.ignore_types
+ self.callback = {name:[(p.pattern,d) for p,d in c.mres]
+ for name, c in lexer.callback.items()}
+
+ def print_python(self):
+ print('import re')
+ print('MRES = (')
+ pprint(self.mres)
+ print(')')
+ print('LEXER_CALLBACK = (')
+ pprint(self.callback)
+ print(')')
+ print('NEWLINE_TYPES = %s' % self.newline_types)
+ print('IGNORE_TYPES = %s' % self.ignore_types)
+ print('class LexerRegexps: pass')
+ print('lexer_regexps = LexerRegexps()')
+ print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]')
+ print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])')
+ print(' for n, mres in LEXER_CALLBACK.items()}')
+ print('lexer = _Lex(lexer_regexps)')
+ print('def lex(stream):')
+ print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)')
+
+
+class GetRule:
+ def __init__(self, rule_id):
+ self.rule_id = rule_id
+
+ def __repr__(self):
+ return 'RULES[%d]' % self.rule_id
+
+rule_ids = {}
+token_types = {}
+
+def _get_token_type(token_type):
+ if token_type not in token_types:
+ token_types[token_type] = len(token_types)
+ return token_types[token_type]
+
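+# Emits the LALR parse table, with tokens and rules referenced by numeric id, plus the
+# Lark_StandAlone entry point of the generated module.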
+class ParserAtoms:
+ def __init__(self, parser):
+ self.parse_table = parser.analysis.parse_table
+
+ def print_python(self):
+ print('class ParseTable: pass')
+ print('parse_table = ParseTable()')
+ print('STATES = {')
+ for state, actions in self.parse_table.states.items():
+ print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg))
+ for token, (action, arg) in actions.items()}))
+ print('}')
+ print('TOKEN_TYPES = (')
+ pprint({v:k for k, v in token_types.items()})
+ print(')')
+ print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}')
+ print(' for s, acts in STATES.items()}')
+ print('parse_table.start_state = %s' % self.parse_table.start_state)
+ print('parse_table.end_state = %s' % self.parse_table.end_state)
+ print('class Lark_StandAlone:')
+ print(' def __init__(self, transformer=None, postlex=None):')
+ print(' callback = parse_tree_builder.create_callback(transformer=transformer)')
+ print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}')
+ print(' self.parser = _Parser(parse_table, callbacks)')
+ print(' self.postlex = postlex')
+ print(' def parse(self, stream):')
+ print(' tokens = lex(stream)')
+ print(' if self.postlex: tokens = self.postlex.process(tokens)')
+ print(' return self.parser.parse(tokens)')
+
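+# Emits the grammar rules as a RULES dict, assigning each rule the numeric id that the
+# printed parse table refers to.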
+class TreeBuilderAtoms:
+ def __init__(self, lark):
+ self.rules = lark.rules
+ self.ptb = lark._parse_tree_builder
+
+ def print_python(self):
+ print('RULES = {')
+ for i, r in enumerate(self.rules):
+ rule_ids[r] = i
+ print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options ))
+ print('}')
+ print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)')
+
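+# Builds a LALR(1) parser for the given grammar and writes a self-contained Python
+# module to stdout.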
+def main(fn, start):
+ with codecs.open(fn, encoding='utf8') as f:
+ lark_inst = Lark(f, parser="lalr", start=start)
+
+ lexer_atoms = LexerAtoms(lark_inst.parser.lexer)
+ parser_atoms = ParserAtoms(lark_inst.parser.parser)
+ tree_builder_atoms = TreeBuilderAtoms(lark_inst)
+
+ print('# The file was automatically generated by Lark v%s' % lark.__version__)
+
+ for pyfile in EXTRACT_STANDALONE_FILES:
+ print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone'])
+
+ print(open(os.path.join(__larkdir__, 'grammar.py')).read())
+ print('Shift = 0')
+ print('Reduce = 1')
+ lexer_atoms.print_python()
+ tree_builder_atoms.print_python()
+ parser_atoms.print_python()
+
+if __name__ == '__main__':
+ if len(sys.argv) < 2:
+ print("Lark Stand-alone Generator Tool")
+ print("Usage: python -m lark.tools.standalone []")
+ sys.exit(1)
+
+ if len(sys.argv) == 3:
+ fn, start = sys.argv[1:]
+ elif len(sys.argv) == 2:
+ fn, start = sys.argv[1], 'start'
+ else:
+ assert False, sys.argv
+
+ main(fn, start)
diff --git a/lark/tree.py b/lark/tree.py
index 7251ce6..28f4fb6 100644
--- a/lark/tree.py
+++ b/lark/tree.py
@@ -7,6 +7,7 @@ from copy import deepcopy
from .utils import inline_args
+###{standalone
class Tree(object):
def __init__(self, data, children, rule=None):
self.data = data
@@ -34,6 +35,7 @@ class Tree(object):
def pretty(self, indent_str=' '):
return ''.join(self._pretty(0, indent_str))
+###}
def expand_kids_by_index(self, *indices):
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
@@ -100,6 +102,7 @@ class Tree(object):
+###{standalone
class Transformer(object):
def _get_func(self, name):
return getattr(self, name)
@@ -139,7 +142,7 @@ class TransformerChain(object):
def __mul__(self, other):
return TransformerChain(*self.transformers + (other,))
-
+
class InlineTransformer(Transformer):
@@ -196,6 +199,7 @@ class Transformer_NoRecurse(Transformer):
def __default__(self, t):
return t
+###}
def pydot__tree_to_png(tree, filename):
diff --git a/lark/utils.py b/lark/utils.py
index d984400..abe036f 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -1,7 +1,4 @@
-import functools
-import types
from collections import deque
-from contextlib import contextmanager
class fzset(frozenset):
def __repr__(self):
@@ -49,8 +46,13 @@ try:
except NameError: # Python 3
STRING_TYPE = str
-Str = type(u'')
+###{standalone
+import types
+import functools
+from contextlib import contextmanager
+
+Str = type(u'')
def inline_args(f):
# print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType)
@@ -76,19 +78,6 @@ def inline_args(f):
return _f
-
-try:
- compare = cmp
-except NameError:
- def compare(a, b):
- if a == b:
- return 0
- elif a > b:
- return 1
- else:
- return -1
-
-
try:
from contextlib import suppress # Python 3
except ImportError:
@@ -107,6 +96,26 @@ except ImportError:
except excs:
pass
+###}
+try:
+ compare = cmp
+except NameError:
+ def compare(a, b):
+ if a == b:
+ return 0
+ elif a > b:
+ return 1
+ else:
+ return -1
+
+
+import sre_parse
+import sre_constants
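+# Returns the (min, max) match width of a regexp, raising ValueError if the pattern is invalid.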
+def get_regexp_width(regexp):
+ try:
+ return sre_parse.parse(regexp).getwidth()
+ except sre_constants.error:
+ raise ValueError(regexp)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index d93e33b..8e954e2 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("x"))
self.assertEqual( r.children, [""] )
-
+
g = Lark("""start: a
?a : b
b : "x"
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("xx"))
self.assertEqual( r.children, [""] )
-
+
g = Lark("""start: a
?a : b b -> c
b : "x"
""", parser='lalr', transformer=T())
r = g.parse("xx")
self.assertEqual( r.children, [""] )
-
+
@@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER):
# Fails an Earley implementation without special handling for empty rules,
# or re-processing of already completed rules.
g = Lark(r"""start: B
- B: ("ab"|/[^b]/)*
+ B: ("ab"|/[^b]/)+
""", lexer=LEXER)
self.assertEqual( g.parse('abc').children[0], 'abc')
@@ -796,6 +796,49 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, ['a', 'A'])
+ def test_twice_empty(self):
+ g = """!start: [["A"]]
+ """
+ l = _Lark(g)
+ tree = l.parse('A')
+ self.assertEqual(tree.children, ['A'])
+
+ tree = l.parse('')
+ self.assertEqual(tree.children, [])
+
+ def test_undefined_ignore(self):
+ g = """!start: "A"
+
+ %ignore B
+ """
+ self.assertRaises( GrammarError, _Lark, g)
+
+ @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
+ def test_line_and_column(self):
+ g = r"""!start: "A" bc "D"
+ !bc: "B\nC"
+ """
+ l = _Lark(g)
+ a, bc, d = l.parse("AB\nCD").children
+ self.assertEqual(a.line, 1)
+ self.assertEqual(a.column, 0)
+
+ bc ,= bc.children
+ self.assertEqual(bc.line, 1)
+ self.assertEqual(bc.column, 1)
+
+ self.assertEqual(d.line, 2)
+ self.assertEqual(d.column, 1)
+
+ # self.assertEqual(a.end_line, 1)
+ # self.assertEqual(a.end_col, 1)
+ # self.assertEqual(bc.end_line, 2)
+ # self.assertEqual(bc.end_col, 1)
+ # self.assertEqual(d.end_line, 2)
+ # self.assertEqual(d.end_col, 2)
+
+
+
def test_reduce_cycle(self):
"""Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
        It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
@@ -969,7 +1012,7 @@ def _make_parser_test(LEXER, PARSER):
parser = _Lark(grammar)
- tree = parser.parse("int 1 ! This is a comment\n")
+ tree = parser.parse("int 1 ! This is a comment\n")
self.assertEqual(tree.children, ['1'])
tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
@@ -983,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, [])
+
@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions")
def test_regex_escaping(self):
g = _Lark("start: /[ab]/")