This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

145 lines
3.9 KiB

  1. """
  2. Using lexer dynamic_complete
  3. ============================
  4. Demonstrates how to use ``lexer='dynamic_complete'`` and ``ambiguity='explicit'``
  5. Sometimes you have data that is highly ambiguous or 'broken' in some sense.
  6. When using ``parser='earley'`` and ``lexer='dynamic_complete'``, Lark will be able
  7. to parse just about anything as long as there is a valid way to generate it from
  8. the Grammar, including looking 'into' the Regexes.
  9. This example shows how to parse a json input where the quotes have been
  10. replaced by underscores: ``{_foo_:{}, _bar_: [], _baz_: __}``
  11. Notice that underscores might still appear inside strings, so a potentially
  12. valid reading of the above is:
  13. ``{"foo_:{}, _bar": [], "baz": ""}``
  14. """
  15. from pprint import pprint
  16. from lark import Lark, Tree, Transformer, v_args
  17. from lark.visitors import Transformer_InPlace
# Lark grammar for a JSON dialect whose string delimiter is an underscore
# (see QUOTE_CHAR) instead of a double quote.
#
# _STRING_INNER matches any run of characters, so the body of a string may
# itself contain underscores. The same input can therefore be tokenized into
# strings in more than one way, which is why the parser built from this
# grammar is asked for explicit ambiguities.
GRAMMAR = r"""
%import common.SIGNED_NUMBER
%import common.WS_INLINE
%import common.NEWLINE
%ignore WS_INLINE
?start: value
?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null
array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value
string: STRING
STRING : ESCAPED_STRING
ESCAPED_STRING: QUOTE_CHAR _STRING_ESC_INNER QUOTE_CHAR
QUOTE_CHAR: "_"
_STRING_INNER: /.*/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
"""
  41. def score(tree: Tree):
  42. """
  43. Scores an option by how many children (and grand-children, and
  44. grand-grand-children, ...) it has.
  45. This means that the option with fewer large terminals get's selected
  46. Between
  47. object
  48. pair
  49. string _foo_
  50. object
  51. pair
  52. string _bar_: [], _baz_
  53. string __
  54. and
  55. object
  56. pair
  57. string _foo_
  58. object
  59. pair
  60. string _bar_
  61. array
  62. pair
  63. string _baz_
  64. string __
  65. this will give the second a higher score. (9 vs 13)
  66. """
  67. return sum(len(t.children) for t in tree.iter_subtrees())
  68. class RemoveAmbiguities(Transformer_InPlace):
  69. """
  70. Selects an option to resolve an ambiguity using the score function above.
  71. Scores each option and selects the one with the higher score, e.g. the one
  72. with more nodes.
  73. If there is a performance problem with the Tree having to many _ambig and
  74. being slow and to large, this can instead be written as a ForestVisitor.
  75. Look at the 'Custom SPPF Prioritizer' example.
  76. """
  77. def _ambig(self, options):
  78. return max(options, key=score)
  79. class TreeToJson(Transformer):
  80. """
  81. This is the same Transformer as the json_parser example.
  82. """
  83. @v_args(inline=True)
  84. def string(self, s):
  85. return s[1:-1].replace('\\"', '"')
  86. array = list
  87. pair = tuple
  88. object = dict
  89. number = v_args(inline=True)(float)
  90. null = lambda self, _: None
  91. true = lambda self, _: True
  92. false = lambda self, _: False
  93. parser = Lark(GRAMMAR, parser='earley', ambiguity="explicit", lexer='dynamic_complete')
  94. EXAMPLES = [
  95. r'{_array_:[1,2,3]}',
  96. r'{_abc_: _array must be of the following format [_1_, _2_, _3_]_}',
  97. r'{_foo_:{}, _bar_: [], _baz_: __}',
  98. r'{_error_:_invalid_client_, _error_description_:_AADSTS7000215: Invalid '
  99. r'client secret is provided.\r\nTrace ID: '
  100. r'a0a0aaaa-a0a0-0a00-000a-00a00aaa0a00\r\nCorrelation ID: '
  101. r'aa0aaa00-0aaa-0000-00a0-00000aaaa0aa\r\nTimestamp: 1997-10-10 00:00:00Z_, '
  102. r'_error_codes_:[7000215], _timestamp_:_1997-10-10 00:00:00Z_, '
  103. r'_trace_id_:_a0a0aaaa-a0a0-0a00-000a-00a00aaa0a00_, '
  104. r'_correlation_id_:_aa0aaa00-0aaa-0000-00a0-00000aaaa0aa_, '
  105. r'_error_uri_:_https://example.com_}',
  106. ]
  107. for example in EXAMPLES:
  108. tree = parser.parse(example)
  109. tree = RemoveAmbiguities().transform(tree)
  110. result = TreeToJson().transform(tree)
  111. print('-' * 100)
  112. pprint(result)