From 24434ec5ff2d3ae93a43af6227b48e1682f77ccf Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Fri, 28 Aug 2020 17:26:01 -0700 Subject: [PATCH] Add documentation for working with the SPPF --- docs/classes.rst | 5 ++ docs/forest.rst | 65 ++++++++++++++++ docs/index.rst | 4 +- docs/parsers.md | 3 +- lark/lark.py | 4 +- lark/parsers/earley_forest.py | 143 ++++++++++++++++++++++++---------- 6 files changed, 180 insertions(+), 44 deletions(-) create mode 100644 docs/forest.rst diff --git a/docs/classes.rst b/docs/classes.rst index cf72189..9391e20 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -51,6 +51,11 @@ Transformer, Visitor & Interpreter See :doc:`visitors`. +ForestVisitor, ForestTransformer, & TreeForestTransformer +----------------------------------------------------------- + +See :doc:`forest`. + UnexpectedInput --------------- diff --git a/docs/forest.rst b/docs/forest.rst new file mode 100644 index 0000000..946db53 --- /dev/null +++ b/docs/forest.rst @@ -0,0 +1,65 @@ +Working with the SPPF +===================== + +When parsing with Earley, Lark provides the ``ambiguity='forest'`` option +to obtain the shared packed parse forest (SPPF) produced by the parser as +an alternative to it being automatically converted to a tree. + +Lark provides a few tools to facilitate working with the SPPF. Here are some +things to consider when deciding whether or not to use the SPPF. + +**Pros** + +- Efficient storage of highly ambiguous parses +- Precise handling of ambiguities +- Custom rule prioritizers +- Ability to handle infinite ambiguities +- Directly transform forest -> object instead of forest -> tree -> object + +**Cons** + +- More complex than working with a tree +- SPPF may contain nodes corresponding to rules generated internally +- Loss of Lark grammar features: + + - Rules starting with '_' are not inlined in the SPPF + - Rules starting with '?' are never inlined in the SPPF + - All tokens will appear in the SPPF + +SymbolNode +---------- + +.. 
autoclass:: lark.parsers.earley_forest.SymbolNode + :members: is_ambiguous, children + +PackedNode +---------- + +.. autoclass:: lark.parsers.earley_forest.PackedNode + :members: children + +ForestVisitor +------------- + +.. autoclass:: lark.parsers.earley_forest.ForestVisitor + :members: visit, visit_symbol_node_in, visit_symbol_node_out, + visit_packed_node_in, visit_packed_node_out, + visit_token_node, on_cycle, get_cycle_in_path + +ForestTransformer +----------------- + +.. autoclass:: lark.parsers.earley_forest.ForestTransformer + :members: transform, transform_symbol_node, transform_intermediate_node, + transform_packed_node, transform_token_node + +TreeForestTransformer +--------------------- + +.. autoclass:: lark.parsers.earley_forest.TreeForestTransformer + :members: __default__, __default_token__, __default_ambig__ + +handles_ambiguity +----------------- + +.. autodecorator:: lark.parsers.earley_forest.handles_ambiguity diff --git a/docs/index.rst b/docs/index.rst index f37d47a..9bef29d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -36,6 +36,7 @@ Welcome to Lark's documentation! tree_construction classes visitors + forest nearley @@ -96,6 +97,7 @@ Resources - :doc:`grammar` - :doc:`tree_construction` - :doc:`visitors` + - :doc:`forest` - :doc:`classes` - :doc:`nearley` - `Cheatsheet (PDF)`_ @@ -112,4 +114,4 @@ Resources .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf .. _Gitter: https://gitter.im/lark-parser/Lobby -.. _Forum (Google Groups): https://groups.google.com/forum/#!forum/lark-parser \ No newline at end of file +.. 
_Forum (Google Groups): https://groups.google.com/forum/#!forum/lark-parser diff --git a/docs/parsers.md b/docs/parsers.md index 7a05f93..85fd35e 100644 --- a/docs/parsers.md +++ b/docs/parsers.md @@ -23,8 +23,7 @@ Lark provides the following options to combat ambiguity: 2) Users may choose to receive the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs. -3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. Future versions of Lark intend to improve and simplify this interface. - +3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. **dynamic_complete** diff --git a/lark/lark.py b/lark/lark.py index ad60f04..8799610 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -73,8 +73,8 @@ class LarkOptions(Serialize): ambiguity Decides how to handle ambiguity in the parse. Only relevant if parser="earley" - - "resolve" - The parser will automatically choose the simplest derivation - (it chooses consistently: greedy for tokens, non-greedy for rules) + - "resolve": The parser will automatically choose the simplest derivation + (it chooses consistently: greedy for tokens, non-greedy for rules) - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). - "forest": The parser will return the root of the shared packed parse forest. diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index d8f1395..532dedf 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -36,6 +36,14 @@ class SymbolNode(ForestNode): with each Packed Node child representing a single derivation of a production. Hence a Symbol Node with a single child is unambiguous. 
+ + :ivar s: A Symbol, or a tuple of (rule, ptr) for an intermediate node. + :ivar start: The index of the start of the substring matched by this + symbol (inclusive). + :ivar end: The index of the end of the substring matched by this + symbol (exclusive). + :ivar is_intermediate: True if this node is an intermediate node. + :ivar priority: The priority of the node's symbol. """ __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash') def __init__(self, s, start, end): @@ -70,10 +78,13 @@ class SymbolNode(ForestNode): @property def is_ambiguous(self): + """Returns True if this node is ambiguous.""" return len(self.children) > 1 @property def children(self): + """Returns a list of this node's children sorted from greatest to + least priority.""" if not self.paths_loaded: self.load_paths() return sorted(self._children, key=attrgetter('sort_key')) @@ -102,6 +113,12 @@ class SymbolNode(ForestNode): class PackedNode(ForestNode): """ A Packed Node represents a single derivation in a symbol node. + + :ivar rule: The rule associated with this node. + :ivar parent: The parent of this node. + :ivar left: The left child of this node. ``None`` if one does not exist. + :ivar right: The right child of this node. ``None`` if one does not exist. + :ivar priority: The priority of this node. """ __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') def __init__(self, parent, s, rule, start, left, right): @@ -130,6 +147,7 @@ class PackedNode(ForestNode): @property def children(self): + """Returns a list of this node's children.""" return [x for x in [self.left, self.right] if x is not None] def __iter__(self): @@ -159,33 +177,66 @@ class ForestVisitor(object): """ An abstract base class for building forest visitors. - Use this as a base when you need to walk the forest. + This class performs a controllable depth-first walk of an SPPF. 
+ The visitor will not enter cycles and will backtrack if one is encountered. + Subclasses are notified of cycles through the ``on_cycle`` method. + + Behavior for visit events is defined by overriding the + ``visit*node*`` functions. + + The walk is controlled by the return values of the ``visit*node_in`` + methods. Returning a node(s) will schedule them to be visited. The visitor + will begin to backtrack if no nodes are returned. """ + def visit_token_node(self, node): + """Called when a ``Token`` is visited. ``Token`` nodes are always leaves.""" + pass + + def visit_symbol_node_in(self, node): + """Called when a symbol node is visited. Nodes that are returned + will be scheduled to be visited. If ``visit_intermediate_node_in`` + is not implemented, this function will be called for intermediate + nodes as well.""" + pass + + def visit_symbol_node_out(self, node): + """Called after all nodes returned from a corresponding ``visit_symbol_node_in`` + call have been visited. If ``visit_intermediate_node_out`` + is not implemented, this function will be called for intermediate + nodes as well.""" + pass + + def visit_packed_node_in(self, node): + """Called when a packed node is visited. Nodes that are returned + will be scheduled to be visited. """ + pass + + def visit_packed_node_out(self, node): + """Called after all nodes returned from a corresponding ``visit_packed_node_in`` + call have been visited.""" + pass + + def on_cycle(self, node, path): + """Called when a cycle is encountered. + + :param node: The node that causes a cycle. + :param path: The list of nodes being visited: nodes that have been + entered but not exited. The first element is the root in a forest + visit, and the last element is the node visited most recently. + ``path`` should be treated as read-only. 
+ """ + pass + def get_cycle_in_path(self, node, path): + """A utility function for use in ``on_cycle`` to obtain a slice of + ``path`` that only contains the nodes that make up the cycle.""" index = len(path) - 1 while id(path[index]) != id(node): index -= 1 return path[index:] - def visit_token_node(self, node): pass - def visit_symbol_node_in(self, node): pass - def visit_symbol_node_out(self, node): pass - def visit_packed_node_in(self, node): pass - def visit_packed_node_out(self, node): pass - - def on_cycle(self, node, path): - """Called when a cycle is encountered. `node` is the node that causes - the cycle. `path` the list of nodes being visited: nodes that have been - entered but not exited. The first element is the root in a forest - visit, and the last element is the node visited most recently. - `path` should be treated as read-only. The utility function - `get_cycle_in_path` may be used to obtain a slice of `path` that only - contains the nodes that make up the cycle.""" - pass - def visit(self, root): - # Visiting is a list of IDs of all symbol/intermediate nodes currently in # the stack. It serves two purposes: to detect when we 'recurse' in and out # of a symbol/intermediate so that we can process both up and down. Also, @@ -273,21 +324,19 @@ class ForestVisitor(object): continue class ForestTransformer(ForestVisitor): - """The base class for a bottom-up forest transformation. - Transformations are applied via inheritance and overriding of the - following methods: + """The base class for a bottom-up forest transformation. Most users will + want to use ``TreeForestTransformer`` instead as it has a friendlier + interface and covers most use cases. - transform_symbol_node - transform_intermediate_node - transform_packed_node - transform_token_node + Transformations are applied via inheritance and overriding of the + ``transform*node`` methods. - `transform_token_node` receives a Token as an argument. 
+ ``transform_token_node`` receives a ``Token`` as an argument. All other methods receive the node that is being transformed and a list of the results of the transformations of that node's children. The return value of these methods are the resulting transformations. - If `Discard` is raised in a transformation, no data from that node + If ``Discard`` is raised in a node's transformation, no data from that node will be passed to its parent's transformation. """ @@ -298,7 +347,7 @@ class ForestTransformer(ForestVisitor): self.node_stack = deque() def transform(self, root): - """Perform a transformation on a Forest.""" + """Perform a transformation on an SPPF.""" self.node_stack.append('result') self.data['result'] = [] self.visit(root) @@ -307,15 +356,19 @@ class ForestTransformer(ForestVisitor): return self.data['result'][0] def transform_symbol_node(self, node, data): + """Transform a symbol node.""" return node def transform_intermediate_node(self, node, data): + """Transform an intermediate node.""" return node def transform_packed_node(self, node, data): + """Transform a packed node.""" return node def transform_token_node(self, node): + """Transform a ``Token``.""" return node def visit_symbol_node_in(self, node): @@ -421,9 +474,9 @@ class ForestToParseTree(ForestTransformer): """Used by the earley parser when ambiguity equals 'resolve' or 'explicit'. Transforms an SPPF into an (ambiguous) parse tree. - tree_class: The Tree class to use for construction + tree_class: The tree class to use for construction callbacks: A dictionary of rules to functions that output a tree - prioritizer: A ForestVisitor that manipulates the priorities of + prioritizer: A ``ForestVisitor`` that manipulates the priorities of ForestNodes resolve_ambiguity: If True, ambiguities will be resolved based on priorities. 
Otherwise, `_ambig` nodes will be in the resulting @@ -523,13 +576,13 @@ class ForestToParseTree(ForestTransformer): return super(ForestToParseTree, self).visit_token_node(node) def handles_ambiguity(func): - """Decorator for methods of subclasses of TreeForestTransformer. + """Decorator for methods of subclasses of ``TreeForestTransformer``. Denotes that the method should receive a list of transformed derivations.""" func.handles_ambiguity = True return func class TreeForestTransformer(ForestToParseTree): - """A ForestTransformer with a tree-Transformer-like interface. + """A ``ForestTransformer`` with a tree ``Transformer``-like interface. By default, it will construct a tree. Methods provided via inheritance are called based on the rule/symbol @@ -538,16 +591,28 @@ class TreeForestTransformer(ForestToParseTree): Methods that act on rules will receive a list of the results of the transformations of the rule's children. By default, trees and tokens. - Methods that act on tokens will receive a Token. + Methods that act on tokens will receive a token. Alternatively, methods that act on rules may be annotated with - `handles_ambiguity`. In this case, the function will receive a list + ``handles_ambiguity``. In this case, the function will receive a list of all the transformations of all the derivations of the rule. By default, a list of trees where each tree.data is equal to the rule name or one of its aliases. Non-tree transformations are made possible by override of - `__default__`, `__default_token__`, and `__default_ambig__`. + ``__default__``, ``__default_token__``, and ``__default_ambig__``. + + .. note:: + + Tree shaping features such as inlined rules and token filtering are + not built into the transformation. Positions are also not + propagated. + + :param tree_class: The tree class to use for construction + :param prioritizer: A ``ForestVisitor`` that manipulates the priorities of + nodes in the SPPF. 
+ :param resolve_ambiguity: If True, ambiguities will be resolved based on + priorities. """ def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True): @@ -563,8 +628,8 @@ class TreeForestTransformer(ForestToParseTree): def __default_ambig__(self, name, data): """Default operation on ambiguous rule (for override). - Wraps data in an '_ambig_ node if it contains more than - one element.' + Wraps data in an '_ambig' node if it contains more than + one element. """ if len(data) > 1: return self.tree_class('_ambig', data) @@ -573,9 +638,9 @@ class TreeForestTransformer(ForestToParseTree): raise Discard() def __default_token__(self, node): - """Default operation on Token (for override). + """Default operation on ``Token`` (for override). - Returns node + Returns ``node``. """ return node