| @@ -0,0 +1,103 @@ | |||||
| """ | |||||
| Using lexer dynamic_complete | |||||
| ============================ | |||||
| Demonstrates how to use ``lexer='dynamic_complete'`` and ``ambiguity='explicit'`` | |||||
| Sometimes you have data that is highly ambiguous or 'broken' in some sense. | |||||
| When using ``parser='earley'`` and ``lexer='dynamic_complete'``, Lark will be able | |||||
| parse just about anything as long as there is a valid way to generate it from | |||||
| the Grammar, including looking 'into' the Regexes. | |||||
| This examples shows how to parse a json input where are quotes have been | |||||
| replaced by underscores: ``{_foo_:{}, _bar_: [], _baz_: __}`` | |||||
| Notice that underscores might still appear inside strings, so a potentially | |||||
| valid reading of the above might in normal json be: | |||||
| ``{"foo_:{}, _bar": [], "baz": ""}`` | |||||
| """ | |||||
| from pprint import pprint | |||||
| from lark import Lark, Tree, Transformer, v_args | |||||
| from lark.visitors import Transformer_InPlace | |||||
| GRAMMAR = r""" | |||||
| %import common.SIGNED_NUMBER | |||||
| %import common.WS_INLINE | |||||
| %import common.NEWLINE | |||||
| %ignore WS_INLINE | |||||
| ?start: value | |||||
| ?value: object | |||||
| | array | |||||
| | string | |||||
| | SIGNED_NUMBER -> number | |||||
| | "true" -> true | |||||
| | "false" -> false | |||||
| | "null" -> null | |||||
| array : "[" [value ("," value)*] "]" | |||||
| object : "{" [pair ("," pair)*] "}" | |||||
| pair : string ":" value | |||||
| string: STRING | |||||
| STRING : ESCAPED_STRING | |||||
| ESCAPED_STRING: QUOTE_CHAR _STRING_ESC_INNER QUOTE_CHAR | |||||
| QUOTE_CHAR: "_" | |||||
| _STRING_INNER: /.*/ | |||||
| _STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/ | |||||
| """ | |||||
| def score(tree: Tree): | |||||
| return sum(len(t.children) for t in tree.iter_subtrees()) | |||||
| class RemoveAmbiguities(Transformer_InPlace): | |||||
| def _ambig(self, options): | |||||
| return max(options, key=score) | |||||
| class TreeToJson(Transformer): | |||||
| @v_args(inline=True) | |||||
| def string(self, s): | |||||
| return s[1:-1].replace('\\"', '"') | |||||
| array = list | |||||
| pair = tuple | |||||
| object = dict | |||||
| number = v_args(inline=True)(float) | |||||
| null = lambda self, _: None | |||||
| true = lambda self, _: True | |||||
| false = lambda self, _: False | |||||
| parser = Lark(GRAMMAR, parser='earley', ambiguity="explicit", lexer='dynamic_complete') | |||||
| EXAMPLES = [ | |||||
| r'{_array_:[1,2,3]}', | |||||
| r'{_abc_: _array must be of the following format [_1_, _2_, _3_]_}', | |||||
| r'{_foo_:{}, _bar_: [], _baz_: __}', | |||||
| r'{_error_:_invalid_client_, _error_description_:_AADSTS7000215: Invalid ' | |||||
| r'client secret is provided.\r\nTrace ID: ' | |||||
| r'a0a0aaaa-a0a0-0a00-000a-00a00aaa0a00\r\nCorrelation ID: ' | |||||
| r'aa0aaa00-0aaa-0000-00a0-00000aaaa0aa\r\nTimestamp: 1997-10-10 00:00:00Z_, ' | |||||
| r'_error_codes_:[7000215], _timestamp_:_1997-10-10 00:00:00Z_, ' | |||||
| r'_trace_id_:_a0a0aaaa-a0a0-0a00-000a-00a00aaa0a00_, ' | |||||
| r'_correlation_id_:_aa0aaa00-0aaa-0000-00a0-00000aaaa0aa_, ' | |||||
| r'_error_uri_:_https://example.com_}', | |||||
| ] | |||||
| for example in EXAMPLES: | |||||
| tree = parser.parse(example) | |||||
| tree = RemoveAmbiguities().transform(tree) | |||||
| result = TreeToJson().transform(tree) | |||||
| print('-' * 100) | |||||
| pprint(result) | |||||