Browse Source

Merge pull request #965 from MegaIng/dynamic_complete_example

remotes/origin/gm/2021-09-23T00Z/github.com--lark-parser-lark/master
Erez Shinan 3 years ago
committed by GitHub
parent
commit
8f914cf78b
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 144 additions and 0 deletions
  1. +144
    -0
      examples/advanced/dynamic_complete.py

+ 144
- 0
examples/advanced/dynamic_complete.py View File

@@ -0,0 +1,144 @@
"""
Using lexer dynamic_complete
============================

Demonstrates how to use ``lexer='dynamic_complete'`` and ``ambiguity='explicit'``

Sometimes you have data that is highly ambiguous or 'broken' in some sense.
When using ``parser='earley'`` and ``lexer='dynamic_complete'``, Lark will be able
to parse just about anything as long as there is a valid way to generate it from
the Grammar, including looking 'into' the Regexes.

This example shows how to parse a json input where the quotes have been
replaced by underscores: ``{_foo_:{}, _bar_: [], _baz_: __}``
Notice that underscores might still appear inside strings, so a potentially
valid reading of the above is:
``{"foo_:{}, _bar": [], "baz": ""}``
"""
from pprint import pprint

from lark import Lark, Tree, Transformer, v_args
from lark.visitors import Transformer_InPlace

# Lark grammar for JSON-like input in which strings are delimited by
# underscores (QUOTE_CHAR) instead of double quotes. Inline whitespace
# (WS_INLINE) is ignored between tokens.
#
# _STRING_INNER matches any run of characters, so a string may itself contain
# underscores; this is what makes the input ambiguous. _STRING_ESC_INNER
# appends a lazily matched run of backslash pairs that must not be preceded
# by another backslash.
#
# NOTE(review): common.NEWLINE is imported but no rule below references it.
GRAMMAR = r"""
%import common.SIGNED_NUMBER
%import common.WS_INLINE
%import common.NEWLINE
%ignore WS_INLINE

?start: value

?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value

string: STRING
STRING : ESCAPED_STRING

ESCAPED_STRING: QUOTE_CHAR _STRING_ESC_INNER QUOTE_CHAR
QUOTE_CHAR: "_"

_STRING_INNER: /.*/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/

"""


def score(tree: Tree):
    """
    Score a candidate parse by the total number of children found across
    all of its subtrees (children, grand-children, and so on).

    A derivation that swallows a lot of input into a few large terminals has
    fewer nodes, so it scores lower than one that splits the same input into
    more structure.

    For example, between

        object
          pair
            string  _foo_
            object
              pair
                string  _bar_: [], _baz_
                string  __

    and

        object
          pair
            string  _foo_
            object
              pair
                string  _bar_
                array
              pair
                string  _baz_
                string  __

    the second one gets the higher score (9 vs 13).

    :param tree: the option to score.
    :return: the sum of ``len(children)`` over every subtree of ``tree``.
    """
    child_count = 0
    for subtree in tree.iter_subtrees():
        child_count += len(subtree.children)
    return child_count


class RemoveAmbiguities(Transformer_InPlace):
    """
    Resolves every ``_ambig`` node by keeping exactly one of its options.

    Each option is rated with the module-level ``score`` function and the
    highest-scoring one (i.e. the one with more nodes) replaces the
    ``_ambig`` node.

    If the tree contains too many ``_ambig`` nodes and becomes too large or
    too slow to process this way, the same selection can instead be written
    as a ForestVisitor. Look at the 'Custom SPPF Prioritizer' example.
    """

    def _ambig(self, options):
        # `options` holds the alternative derivations of one ambiguous span.
        best_option = max(options, key=score)
        return best_option


class TreeToJson(Transformer):
    """
    Turns a parse tree into plain Python values (dict, list, tuple, float,
    str, bool, None).

    This is the same Transformer as the json_parser example.
    """

    # Container rules map straight onto the built-in constructors, which
    # receive the list of already-transformed children.
    array = list
    pair = tuple
    object = dict

    # `number` has a single child, so it is inlined and passed to float().
    number = v_args(inline=True)(float)

    @v_args(inline=True)
    def string(self, s):
        """Strip the delimiter at each end of ``s`` and unescape ``\\"``."""
        # NOTE(review): this grammar quotes strings with '_' rather than '"',
        # so the \" unescape looks carried over from the json_parser
        # example -- confirm whether it is still the intended behaviour.
        unquoted = s[1:-1]
        return unquoted.replace('\\"', '"')

    def null(self, _):
        """Map the ``null`` literal to ``None``."""
        return None

    def true(self, _):
        """Map the ``true`` literal to ``True``."""
        return True

    def false(self, _):
        """Map the ``false`` literal to ``False``."""
        return False


# Earley with the dynamic_complete lexer; ambiguity="explicit" leaves the
# ambiguous readings in the tree so RemoveAmbiguities can choose between them.
parser = Lark(
    GRAMMAR,
    parser='earley',
    lexer='dynamic_complete',
    ambiguity="explicit",
)

# Sample inputs: JSON documents whose double quotes were replaced by
# underscores, ranging from trivial to a realistic error payload.
EXAMPLES = [
    r'{_array_:[1,2,3]}',

    r'{_abc_: _array must be of the following format [_1_, _2_, _3_]_}',

    r'{_foo_:{}, _bar_: [], _baz_: __}',

    r'{_error_:_invalid_client_, _error_description_:_AADSTS7000215: Invalid '
    r'client secret is provided.\r\nTrace ID: '
    r'a0a0aaaa-a0a0-0a00-000a-00a00aaa0a00\r\nCorrelation ID: '
    r'aa0aaa00-0aaa-0000-00a0-00000aaaa0aa\r\nTimestamp: 1997-10-10 00:00:00Z_, '
    r'_error_codes_:[7000215], _timestamp_:_1997-10-10 00:00:00Z_, '
    r'_trace_id_:_a0a0aaaa-a0a0-0a00-000a-00a00aaa0a00_, '
    r'_correlation_id_:_aa0aaa00-0aaa-0000-00a0-00000aaaa0aa_, '
    r'_error_uri_:_https://example.com_}',

]

# Parse each sample, collapse its ambiguities, convert it to Python objects
# and pretty-print the result under a separator line.
for example in EXAMPLES:
    tree = RemoveAmbiguities().transform(parser.parse(example))
    result = TreeToJson().transform(tree)
    print('-' * 100)
    pprint(result)

Loading…
Cancel
Save