
Merge branch 'lark-parser:master' into master

ornariece authored 3 years ago, committed by GitHub
parent
commit 99ced1fbe3
11 changed files with 180 additions and 138 deletions
  1. docs/json_tutorial.md (+4, -4)
  2. lark-stubs/lark.pyi (+2, -2)
  3. lark/common.py (+12, -0)
  4. lark/exceptions.py (+9, -6)
  5. lark/lark.py (+8, -8)
  6. lark/lexer.py (+74, -60)
  7. lark/parse_tree_builder.py (+33, -28)
  8. lark/parser_frontends.py (+9, -10)
  9. lark/parsers/lalr_parser.py (+2, -2)
  10. lark/utils.py (+7, -18)
  11. tests/test_parser.py (+20, -0)

docs/json_tutorial.md (+4, -4)

@@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist
| Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M |
| Lark - LALR(1) | 8s | 1.53s | 453M | 266M |
| Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M |
-| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
-| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
-| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M |
+| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
+| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
+| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? | 1545M |

I added a few other parsers for comparison. PyParsing and funcparselib fair pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1).

@@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective.

This is the end of the tutorial. I hoped you liked it and learned a little about Lark.

-To see what else you can do with Lark, check out the [examples](examples).
+To see what else you can do with Lark, check out the [examples](/examples).

For questions or any other subject, feel free to email me at erezshin at gmail dot com.



lark-stubs/lark.pyi (+2, -2)

@@ -33,7 +33,7 @@ class LarkOptions:
    regex: bool
    debug: bool
    keep_all_tokens: bool
-   propagate_positions: Union[bool, str]
+   propagate_positions: Union[bool, Callable]
    maybe_placeholders: bool
    lexer_callbacks: Dict[str, Callable[[Token], Token]]
    cache: Union[bool, str]

@@ -77,7 +77,7 @@ class Lark:
        regex: bool = False,
        debug: bool = False,
        keep_all_tokens: bool = False,
-       propagate_positions: Union[bool, str] = False,
+       propagate_positions: Union[bool, Callable] = False,
        maybe_placeholders: bool = False,
        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
        cache: Union[bool, str] = False,


lark/common.py (+12, -0)

@@ -1,4 +1,5 @@
from warnings import warn
+from copy import deepcopy

from .utils import Serialize
from .lexer import TerminalDef

@@ -31,6 +32,17 @@ class LexerConf(Serialize):
    def _deserialize(self):
        self.terminals_by_name = {t.name: t for t in self.terminals}

+   def __deepcopy__(self, memo=None):
+       return type(self)(
+           deepcopy(self.terminals, memo),
+           self.re_module,
+           deepcopy(self.ignore, memo),
+           deepcopy(self.postlex, memo),
+           deepcopy(self.callbacks, memo),
+           deepcopy(self.g_regex_flags, memo),
+           deepcopy(self.skip_validation, memo),
+           deepcopy(self.use_bytes, memo),
+       )


class ParserConf(Serialize):
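
The new __deepcopy__ copies every field of LexerConf except re_module, which is passed through by reference, since a module object cannot (and need not) be duplicated. A minimal standalone sketch of the same pattern, using a made-up Conf class rather than lark's internals:

import re
from copy import deepcopy

class Conf:
    # Hypothetical config object mirroring the LexerConf.__deepcopy__ pattern above.
    def __init__(self, terminals, re_module, callbacks):
        self.terminals = terminals      # plain data: safe to deep-copy
        self.re_module = re_module      # a module object: share it, don't copy it
        self.callbacks = callbacks

    def __deepcopy__(self, memo=None):
        return type(self)(
            deepcopy(self.terminals, memo),
            self.re_module,
            deepcopy(self.callbacks, memo),
        )

original = Conf(['NUMBER', 'WORD'], re, {'NUMBER': lambda t: t})
clone = deepcopy(original)
assert clone.terminals is not original.terminals   # copied
assert clone.re_module is original.re_module       # shared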


lark/exceptions.py (+9, -6)

@@ -129,6 +129,8 @@ class UnexpectedInput(LarkError):


class UnexpectedEOF(ParseError, UnexpectedInput):
    def __init__(self, expected, state=None, terminals_by_name=None):
+       super(UnexpectedEOF, self).__init__()
+
        self.expected = expected
        self.state = state
        from .lexer import Token

@@ -138,7 +140,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
        self.column = -1
        self._terminals_by_name = terminals_by_name

-       super(UnexpectedEOF, self).__init__()

    def __str__(self):
        message = "Unexpected end-of-input. "

@@ -149,6 +150,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
class UnexpectedCharacters(LexError, UnexpectedInput):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
                 terminals_by_name=None, considered_rules=None):
+       super(UnexpectedCharacters, self).__init__()
+
        # TODO considered_tokens and allowed can be figured out using state
        self.line = line
        self.column = column

@@ -167,7 +170,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
        self.char = seq[lex_pos]
        self._context = self.get_context(seq)

-       super(UnexpectedCharacters, self).__init__()

    def __str__(self):
        message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column)

@@ -190,6 +192,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
    """

    def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
+       super(UnexpectedToken, self).__init__()
+
        # TODO considered_rules and expected can be figured out using state
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')

@@ -204,7 +208,6 @@ class UnexpectedToken(ParseError, UnexpectedInput):
        self._terminals_by_name = terminals_by_name
        self.token_history = token_history

-       super(UnexpectedToken, self).__init__()

    @property
    def accepts(self):

@@ -236,10 +239,10 @@ class VisitError(LarkError):
    """

    def __init__(self, rule, obj, orig_exc):
+       self.obj = obj
+       self.orig_exc = orig_exc
+
        message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
        super(VisitError, self).__init__(message)

-       self.obj = obj
-       self.orig_exc = orig_exc
-
###}
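
These constructors now call super().__init__() first and assign their attributes afterwards (VisitError likewise sets obj and orig_exc before building the message); the attributes exposed to callers are unchanged. A minimal usage sketch, assuming a toy grammar:

from lark import Lark
from lark.exceptions import UnexpectedInput

parser = Lark('start: "a" "b"', parser='lalr')   # toy grammar for illustration

try:
    parser.parse("ax")
except UnexpectedInput as e:
    # line/column are set by the constructors shown above
    print(e.line, e.column)
    print(e.get_context("ax"))   # documented helper on UnexpectedInput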

lark/lark.py (+8, -8)

@@ -44,7 +44,7 @@ class LarkOptions(Serialize):
        Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
    propagate_positions
        Propagates (line, column, end_line, end_column) attributes into all tree branches.
-       Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees.
+       Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
    maybe_placeholders
        When ``True``, the ``[]`` operator returns ``None`` when not matched.

@@ -162,7 +162,7 @@ class LarkOptions(Serialize):
        assert_config(self.parser, ('earley', 'lalr', 'cyk', None))

        if self.parser == 'earley' and self.transformer:
-           raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.'
+           raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
                                     'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

        if o:

@@ -451,11 +451,11 @@ class Lark(Serialize):
            d = f
        else:
            d = pickle.load(f)
-       memo = d['memo']
+       memo_json = d['memo']
        data = d['data']

-       assert memo
-       memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
+       assert memo_json
+       memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
        options = dict(data['options'])
        if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
            raise ConfigurationError("Some options are not allowed when loading a Parser: {}"

@@ -512,11 +512,11 @@ class Lark(Serialize):

            Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
        """
-       package = FromPackageLoader(package, search_paths)
-       full_path, text = package(None, grammar_path)
+       package_loader = FromPackageLoader(package, search_paths)
+       full_path, text = package_loader(None, grammar_path)
        options.setdefault('source_path', full_path)
        options.setdefault('import_paths', [])
-       options['import_paths'].append(package)
+       options['import_paths'].append(package_loader)
        return cls(text, **options)

    def __repr__(self):
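
The memo → memo_json and package → package_loader changes are pure renames for clarity; the save/load round-trip they belong to is unchanged. A short sketch of that round-trip, assuming lark's documented Lark.save / Lark.load pair (which writes and reads the data/memo payload unpacked above):

from lark import Lark

parser = Lark('start: "a"+', parser='lalr')

with open('parser.bin', 'wb') as f:
    parser.save(f)            # writes the {'data': ..., 'memo': ...} payload read back in Lark.load
with open('parser.bin', 'rb') as f:
    parser2 = Lark.load(f)    # deserializes; memo_json is just a clearer local name for d['memo']

print(parser2.parse("aaa"))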


lark/lexer.py (+74, -60)

@@ -120,33 +120,33 @@ class Token(Str):
    Attributes:
        type: Name of the token (as specified in grammar)
        value: Value of the token (redundant, as ``token.value == token`` will always be true)
-       pos_in_stream: The index of the token in the text
+       start_pos: The index of the token in the text
        line: The line of the token in the text (starting with 1)
        column: The column of the token in the text (starting with 1)
        end_line: The line where the token ends
        end_column: The next column after the end of the token. For example,
            if the token is a single character with a column value of 4,
            end_column will be 5.
-       end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``)
+       end_pos: the index where the token ends (basically ``start_pos + len(token)``)
    """
    __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

    def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None):
        try:
-           self = super(Token, cls).__new__(cls, value)
+           inst = super(Token, cls).__new__(cls, value)
        except UnicodeDecodeError:
            value = value.decode('latin1')
-           self = super(Token, cls).__new__(cls, value)
-       self.type = type_
-       self.start_pos = start_pos if start_pos is not None else pos_in_stream
-       self.value = value
-       self.line = line
-       self.column = column
-       self.end_line = end_line
-       self.end_column = end_column
-       self.end_pos = end_pos
-       return self
+           inst = super(Token, cls).__new__(cls, value)
+       inst.type = type_
+       inst.start_pos = start_pos if start_pos is not None else pos_in_stream
+       inst.value = value
+       inst.line = line
+       inst.column = column
+       inst.end_line = end_line
+       inst.end_column = end_column
+       inst.end_pos = end_pos
+       return inst

    @property
    def pos_in_stream(self):
@@ -214,15 +214,13 @@ class LineCounter:




class UnlessCallback:
-   def __init__(self, mres):
-       self.mres = mres
+   def __init__(self, scanner):
+       self.scanner = scanner

    def __call__(self, t):
-       for mre, type_from_index in self.mres:
-           m = mre.match(t.value)
-           if m:
-               t.type = type_from_index[m.lastindex]
-               break
+       res = self.scanner.match(t.value, 0)
+       if res:
+           _value, t.type = res
        return t




@@ -237,6 +235,11 @@ class CallChain:
        return self.callback2(t) if self.cond(t2) else t2


+def _get_match(re_, regexp, s, flags):
+    m = re_.match(regexp, s, flags)
+    if m:
+        return m.group(0)
+
def _create_unless(terminals, g_regex_flags, re_, use_bytes):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
@@ -248,40 +251,54 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
        if strtok.priority > retok.priority:
            continue
        s = strtok.pattern.value
-       m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
-       if m and m.group(0) == s:
+       if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
            unless.append(strtok)
            if strtok.pattern.flags <= retok.pattern.flags:
                embedded_strs.add(strtok)
    if unless:
-       callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+       callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

-   terminals = [t for t in terminals if t not in embedded_strs]
-   return terminals, callback
+   new_terminals = [t for t in terminals if t not in embedded_strs]
+   return new_terminals, callback


-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
-   # Python sets an unreasonable group limit (currently 100) in its re module
-   # Worse, the only way to know we reached it is by catching an AssertionError!
-   # This function recursively tries less and less groups until it's successful.
-   postfix = '$' if match_whole else ''
-   mres = []
-   while terminals:
-       pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
-       if use_bytes:
-           pattern = pattern.encode('latin-1')
-       try:
-           mre = re_.compile(pattern, g_regex_flags)
-       except AssertionError:  # Yes, this is what Python provides us.. :/
-           return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
-
-       mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
-       terminals = terminals[max_size:]
-   return mres
-
-
-def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
-   return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
+class Scanner:
+   def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+       self.terminals = terminals
+       self.g_regex_flags = g_regex_flags
+       self.re_ = re_
+       self.use_bytes = use_bytes
+       self.match_whole = match_whole
+
+       self.allowed_types = {t.name for t in self.terminals}
+
+       self._mres = self._build_mres(terminals, len(terminals))
+
+   def _build_mres(self, terminals, max_size):
+       # Python sets an unreasonable group limit (currently 100) in its re module
+       # Worse, the only way to know we reached it is by catching an AssertionError!
+       # This function recursively tries less and less groups until it's successful.
+       postfix = '$' if self.match_whole else ''
+       mres = []
+       while terminals:
+           pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+           if self.use_bytes:
+               pattern = pattern.encode('latin-1')
+           try:
+               mre = self.re_.compile(pattern, self.g_regex_flags)
+           except AssertionError:  # Yes, this is what Python provides us.. :/
+               return self._build_mres(terminals, max_size//2)
+
+           mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
+           terminals = terminals[max_size:]
+       return mres
+
+   def match(self, text, pos):
+       for mre, type_from_index in self._mres:
+           m = mre.match(text, pos)
+           if m:
+               return m.group(0), type_from_index[m.lastindex]


def _regexp_has_newline(r):
@@ -341,9 +358,9 @@ class TraditionalLexer(Lexer):
        self.use_bytes = conf.use_bytes
        self.terminals_by_name = conf.terminals_by_name

-       self._mres = None
+       self._scanner = None

-   def _build(self):
+   def _build_scanner(self):
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
        assert all(self.callback.values())

@@ -354,19 +371,16 @@ class TraditionalLexer(Lexer):
        else:
            self.callback[type_] = f

-       self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
+       self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

    @property
-   def mres(self):
-       if self._mres is None:
-           self._build()
-       return self._mres
+   def scanner(self):
+       if self._scanner is None:
+           self._build_scanner()
+       return self._scanner

    def match(self, text, pos):
-       for mre, type_from_index in self.mres:
-           m = mre.match(text, pos)
-           if m:
-               return m.group(0), type_from_index[m.lastindex]
+       return self.scanner.match(text, pos)

    def lex(self, state, parser_state):
        with suppress(EOFError):

@@ -378,7 +392,7 @@ class TraditionalLexer(Lexer):
        while line_ctr.char_pos < len(lex_state.text):
            res = self.match(lex_state.text, line_ctr.char_pos)
            if not res:
-               allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+               allowed = self.scanner.allowed_types - self.ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
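
Scanner packages the old build_mres/_build_mres helpers as a class: all terminal regexps are joined into one alternation of named groups (split across several compiled patterns when Python's 100-group limit bites), allowed_types replaces the old set comprehension over self.mres, and match() returns a (value, terminal_name) pair. A standalone sketch of the named-group technique, independent of lark's terminal objects:

import re

# Each terminal becomes a named group; the group that matched identifies the terminal.
terminals = {
    'NUMBER': r'\d+',
    'NAME':   r'[A-Za-z_]\w*',
    'PLUS':   r'\+',
}
pattern = '|'.join('(?P<%s>%s)' % (name, regexp) for name, regexp in terminals.items())
mre = re.compile(pattern)

def match(text, pos):
    m = mre.match(text, pos)
    if m:
        return m.group(0), m.lastgroup    # (matched value, terminal name)

print(match("x1 + 2", 0))   # ('x1', 'NAME')
print(match("x1 + 2", 3))   # ('+', 'PLUS')

(The real Scanner maps m.lastindex back to the terminal name via groupindex rather than using m.lastgroup.)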


lark/parse_tree_builder.py (+33, -28)

@@ -23,54 +23,59 @@ class ExpandSingleChild:




class PropagatePositions:
-   def __init__(self, node_builder):
+   def __init__(self, node_builder, node_filter=None):
        self.node_builder = node_builder
+       self.node_filter = node_filter

    def __call__(self, children):
        res = self.node_builder(children)

-       # local reference to Tree.meta reduces number of presence checks
        if isinstance(res, Tree):
-           res_meta = res.meta
+           # Calculate positions while the tree is streaming, according to the rule:
+           # - nodes start at the start of their first child's container,
+           #   and end at the end of their last child's container.
+           # Containers are nodes that take up space in text, but have been inlined in the tree.

-           src_meta = self._pp_get_meta(children)
-           if src_meta is not None:
-               res_meta.line = src_meta.line
-               res_meta.column = src_meta.column
-               res_meta.start_pos = src_meta.start_pos
-               res_meta.empty = False
+           res_meta = res.meta

-           src_meta = self._pp_get_meta(reversed(children))
-           if src_meta is not None:
-               res_meta.end_line = src_meta.end_line
-               res_meta.end_column = src_meta.end_column
-               res_meta.end_pos = src_meta.end_pos
-               res_meta.empty = False
+           first_meta = self._pp_get_meta(children)
+           if first_meta is not None:
+               if not hasattr(res_meta, 'line'):
+                   # meta was already set, probably because the rule has been inlined (e.g. `?rule`)
+                   res_meta.line = getattr(first_meta, 'container_line', first_meta.line)
+                   res_meta.column = getattr(first_meta, 'container_column', first_meta.column)
+                   res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos)
+                   res_meta.empty = False
+
+               res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line)
+               res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column)
+
+           last_meta = self._pp_get_meta(reversed(children))
+           if last_meta is not None:
+               if not hasattr(res_meta, 'end_line'):
+                   res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
+                   res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)
+                   res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos)
+                   res_meta.empty = False
+
+               res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
+               res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)

        return res

    def _pp_get_meta(self, children):
        for c in children:
+           if self.node_filter is not None and not self.node_filter(c):
+               continue
            if isinstance(c, Tree):
                if not c.meta.empty:
                    return c.meta
            elif isinstance(c, Token):
                return c

-class PropagatePositions_IgnoreWs(PropagatePositions):
-   def _pp_get_meta(self, children):
-       for c in children:
-           if isinstance(c, Tree):
-               if not c.meta.empty:
-                   return c.meta
-           elif isinstance(c, Token):
-               if c and not c.isspace():  # Disregard whitespace-only tokens
-                   return c
-

def make_propagate_positions(option):
-   if option == "ignore_ws":
-       return PropagatePositions_IgnoreWs
+   if callable(option):
+       return partial(PropagatePositions, node_filter=option)
    elif option is True:
        return PropagatePositions
    elif option is False:
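
The dedicated PropagatePositions_IgnoreWs subclass is gone; the same effect can be had by passing any callable, which make_propagate_positions now forwards to PropagatePositions as node_filter (the filter receives each child node and returns whether it may contribute positions, per _pp_get_meta above). A hedged sketch reproducing the old ignore_ws behaviour with a made-up grammar:

from lark import Lark, Token

def not_whitespace(node):
    # Disregard whitespace-only tokens when propagating positions,
    # mirroring the removed PropagatePositions_IgnoreWs.
    return not (isinstance(node, Token) and node.isspace())

# Toy grammar; `!` keeps the WS tokens in the tree so the filter has something to skip.
parser = Lark(r'''
    !start: WS? WORD WS?
    WORD: /\w+/
    WS: /\s+/
''', parser='lalr', propagate_positions=not_whitespace)

tree = parser.parse("  hello ")
print(tree.meta.column, tree.meta.end_column)   # spans "hello", not the surrounding spaces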


lark/parser_frontends.py (+9, -10)

@@ -39,8 +39,7 @@ class MakeParsingFrontend:
        lexer_conf.lexer_type = self.lexer_type
        return ParsingFrontend(lexer_conf, parser_conf, options)

-   @classmethod
-   def deserialize(cls, data, memo, lexer_conf, callbacks, options):
+   def deserialize(self, data, memo, lexer_conf, callbacks, options):
        parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
        parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
        parser_conf.callbacks = callbacks

@@ -92,26 +91,26 @@ class ParsingFrontend(Serialize):
    def _verify_start(self, start=None):
        if start is None:
-           start = self.parser_conf.start
-           if len(start) > 1:
-               raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
-           start ,= start
+           start_decls = self.parser_conf.start
+           if len(start_decls) > 1:
+               raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
+           start ,= start_decls
        elif start not in self.parser_conf.start:
            raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
        return start

    def parse(self, text, start=None, on_error=None):
-       start = self._verify_start(start)
+       chosen_start = self._verify_start(start)
        stream = text if self.skip_lexer else LexerThread(self.lexer, text)
        kw = {} if on_error is None else {'on_error': on_error}
-       return self.parser.parse(stream, start, **kw)
+       return self.parser.parse(stream, chosen_start, **kw)

    def parse_interactive(self, text=None, start=None):
-       start = self._verify_start(start)
+       chosen_start = self._verify_start(start)
        if self.parser_conf.parser_type != 'lalr':
            raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
        stream = text if self.skip_lexer else LexerThread(self.lexer, text)
-       return self.parser.parse_interactive(stream, start)
+       return self.parser.parse_interactive(stream, chosen_start)


def get_frontend(parser, lexer):
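
The renames here (start_decls, chosen_start) only stop the start parameter from being reused; behaviour is unchanged: with several declared start rules, parse() must be told which one to use, otherwise _verify_start raises the ConfigurationError above. A quick sketch with a made-up grammar:

from lark import Lark

parser = Lark('''
    greeting: "hello"
    farewell: "bye"
''', parser='lalr', start=['greeting', 'farewell'])

print(parser.parse("hello", start='greeting'))
print(parser.parse("bye", start='farewell'))

# parser.parse("hello") with no start= would raise:
# "Lark initialized with more than 1 possible start rule. Must specify which start rule to parse"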


lark/parsers/lalr_parser.py (+2, -2)

@@ -178,8 +178,8 @@ class _Parser(object):
            for token in state.lexer.lex(state):
                state.feed_token(token)

-           token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-           return state.feed_token(token, True)
+           end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+           return state.feed_token(end_token, True)
        except UnexpectedInput as e:
            try:
                e.interactive_parser = InteractiveParser(self, state, state.lexer)


lark/utils.py (+7, -18)

@@ -73,14 +73,13 @@ class Serialize(object):
        fields = getattr(self, '__serialize_fields__')
        res = {f: _serialize(getattr(self, f), memo) for f in fields}
        res['__type__'] = type(self).__name__
-       postprocess = getattr(self, '_serialize', None)
-       if postprocess:
-           postprocess(res, memo)
+       if hasattr(self, '_serialize'):
+           self._serialize(res, memo)
        return res

    @classmethod
    def deserialize(cls, data, memo):
-       namespace = getattr(cls, '__serialize_namespace__', {})
+       namespace = getattr(cls, '__serialize_namespace__', [])
        namespace = {c.__name__:c for c in namespace}

        fields = getattr(cls, '__serialize_fields__')

@@ -94,9 +93,10 @@ class Serialize(object):
                setattr(inst, f, _deserialize(data[f], namespace, memo))
            except KeyError as e:
                raise KeyError("Cannot find key for class", cls, e)
-       postprocess = getattr(inst, '_deserialize', None)
-       if postprocess:
-           postprocess()
+
+       if hasattr(inst, '_deserialize'):
+           inst._deserialize()
+
        return inst


@@ -241,17 +241,6 @@ except ImportError:
    pass


-try:
-   compare = cmp
-except NameError:
-   def compare(a, b):
-       if a == b:
-           return 0
-       elif a > b:
-           return 1
-       return -1
-
-
class Enumerator(Serialize):
    def __init__(self):
        self.enums = {}
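
The serializer hooks are now discovered with hasattr instead of getattr-with-a-default, and the Python-2 compare shim is dropped. For orientation, a simplified standalone sketch of the field-driven pattern Serialize implements (not lark's actual class; only the __serialize_fields__ / _serialize / _deserialize names are taken from the code above):

class MiniSerialize:
    # Simplified sketch: dump the declared fields into a dict and rebuild from it.
    __serialize_fields__ = ()

    def serialize(self):
        res = {f: getattr(self, f) for f in self.__serialize_fields__}
        res['__type__'] = type(self).__name__
        if hasattr(self, '_serialize'):       # optional post-processing hook
            self._serialize(res)
        return res

    @classmethod
    def deserialize(cls, data):
        inst = cls.__new__(cls)
        for f in cls.__serialize_fields__:
            setattr(inst, f, data[f])
        if hasattr(inst, '_deserialize'):     # optional post-processing hook
            inst._deserialize()
        return inst

class TerminalInfo(MiniSerialize):
    __serialize_fields__ = ('name', 'pattern')
    def __init__(self, name, pattern):
        self.name, self.pattern = name, pattern
    def _deserialize(self):
        self.display = '%s: %s' % (self.name, self.pattern)

t = TerminalInfo.deserialize(TerminalInfo('NUMBER', r'\d+').serialize())
print(t.display)   # NUMBER: \d+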


tests/test_parser.py (+20, -0)

@@ -94,6 +94,26 @@ class TestParsers(unittest.TestCase):
        r = g.parse('a')
        self.assertEqual( r.children[0].meta.line, 1 )

+   def test_propagate_positions2(self):
+       g = Lark("""start: a
+                   a: b
+                   ?b: "(" t ")"
+                   !t: "t"
+                   """, propagate_positions=True)
+
+       start = g.parse("(t)")
+       a ,= start.children
+       t ,= a.children
+       assert t.children[0] == "t"
+
+       assert t.meta.column == 2
+       assert t.meta.end_column == 3
+
+       assert start.meta.column == a.meta.column == 1
+       assert start.meta.end_column == a.meta.end_column == 4
+

    def test_expand1(self):

        g = Lark("""start: a

