
Commit fe20a66

Get rid of Reshape again, now that we Vectorize join
1 parent 99e7eab

3 files changed: 138 additions, 31 deletions

pytensor/tensor/basic.py

Lines changed: 8 additions & 7 deletions
@@ -1681,21 +1681,22 @@ def do_constant_folding(self, fgraph, node):
             return False

         for client, idx in clients:
-            if isinstance(client.op, Output):
+            client_op = client.op
+            if isinstance(client_op, Output):
                 # If the output is a constant, it will have to be deepcopied
                 # each time the function is called. So we do not fold.
                 return False
-            # Allow alloc to be lifted out of Elemwise before constant folding it
-            elif isinstance(client.op, Elemwise):
-                return None
+            # Op's through which Alloc can be lifted
+            elif isinstance(client_op, Elemwise | DimShuffle | Alloc | Join):
+                return False
             # Same for Blockwise, unless it has no batch_dims
-            elif isinstance(client.op, Blockwise) and client.op.batch_ndim(client):
-                return None
+            elif isinstance(client_op, Blockwise) and client.op.batch_ndim(client):
+                return False
             elif (
                 # The following ops work inplace of their input id 0.
                 idx == 0
                 and isinstance(
-                    client.op,
+                    client_op,
                     pytensor.tensor.subtensor.IncSubtensor
                     | pytensor.tensor.subtensor.AdvancedIncSubtensor1
                     | pytensor.tensor.subtensor.AdvancedIncSubtensor
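
For context on the `return False` cases: returning False tells PyTensor not to constant-fold an Alloc whose clients are all ops through which the Alloc can later be lifted. A minimal sketch of the intent (my example, not part of the commit; the exact rewritten graph may differ across versions):

```python
import pytensor
import pytensor.tensor as pt

x = pt.vector("x")
# Alloc with only constant inputs; its sole client is an Elemwise (the add),
# one of the ops through which Alloc can now be lifted instead of folded.
zeros = pt.alloc(0.0, 1000)
out = x + zeros

fn = pytensor.function([x], out)
# Folding would bake a 1000-element constant array into the graph; skipping
# the fold lets later rewrites lift or remove the Alloc instead.
pytensor.dprint(fn)
```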

pytensor/tensor/rewriting/basic.py

Lines changed: 122 additions & 0 deletions
@@ -52,6 +52,7 @@
     TensorFromScalar,
     alloc,
     as_tensor_variable,
+    atleast_Nd,
     cast,
     extract_constant,
     fill,
@@ -1219,3 +1220,124 @@ def local_merge_alloc(fgraph, node):


 register_canonicalize(RemovalNodeRewriter(tensor_copy), name="remove_tensor_copy")
+
+
+@register_specialize
+@node_rewriter([DimShuffle])
+def local_dimshuffle_alloc(fgraph, node):
+    """
+    Lift DimShuffle through Alloc
+
+    dimshuffle{x, 0, 1}(alloc([3 4], 3, 2) => alloc([3 4], 1, 3, 2)
+    """
+    alloc_out = node.inputs[0]
+    alloc_node = alloc_out.owner
+    if not (alloc_node and isinstance(alloc_node.op, Alloc)):
+        return
+
+    ds_op = node.op
+    value, *alloc_shape = alloc_node.inputs
+
+    # Add implicit dimensions of value
+    value = atleast_Nd(value, n=len(alloc_shape))
+
+    # Dimshuffle value and alloc_shape
+    ds_value = value.dimshuffle(ds_op.new_order)
+    ds_alloc_shape = [alloc_shape[i] for i in ds_op.shuffle]
+    for dim in ds_op.augment:
+        ds_alloc_shape.insert(dim, 1)
+
+    return [alloc(ds_value, *ds_alloc_shape)]
+
+
+@register_specialize("shape_unsafe")
+@node_rewriter([Join])
+def local_join_of_alloc(fgraph, node):
+    """Rewrite a Join of Alloc nodes to an Alloc of the Join nodes."""
+    axis, *tensors = node.inputs
+
+    if len(tensors) < 2:
+        # Let other rewrite handle the useless Join
+        return
+
+    if not isinstance(axis, Constant):
+        return
+
+    core_tensors = []
+    alloc_shapes = []
+    for tensor in tensors:
+        if tensor.owner is None:
+            print(" > failed no owner")
+            return
+
+        # tensor = expand_dims_to_alloc(tensor)
+        if not isinstance(tensor.owner.op, Alloc):
+            return
+
+        value, *shape = tensor.owner.inputs
+        # Introduce explicit batch dims
+        value = atleast_Nd(value, n=len(shape))
+        core_tensors.append(value)
+        alloc_shapes.append(shape)
+
+    # Find which allocated dimensions can be lifted
+    # Axis can never be lifted
+    # Non-axis allocated dimensions can be lifted if they are all broadcastable
+    [out] = node.outputs
+    axis = axis.data
+
+    broadcasted_dims = list(
+        zip(
+            *(
+                [
+                    bef and not aft
+                    for bef, aft in zip(
+                        core_tensor.type.broadcastable,
+                        tensor.type.broadcastable,
+                        strict=True,
+                    )
+                ]
+                for core_tensor, tensor in zip(core_tensors, tensors, strict=True)
+            )
+        )
+    )
+
+    lifteable_alloc_dims = {
+        dim
+        for dim in range(out.type.ndim)
+        if dim != axis and all(broadcasted_dims[dim])
+    }
+
+    if not lifteable_alloc_dims:
+        return
+
+    # Lift the allocated dimensions
+    new_tensors = []
+    for core_tensor, alloc_shape in zip(core_tensors, alloc_shapes):
+        pre_join_shape = [
+            1 if i in lifteable_alloc_dims else alloc_dim
+            for i, alloc_dim in enumerate(alloc_shape)
+        ]
+        new_tensor = alloc(core_tensor, *pre_join_shape)
+        copy_stack_trace(tensor, new_tensor)
+        new_tensors.append(new_tensor)
+
+    new_join = node.op(axis, *new_tensors)
+    copy_stack_trace(node.outputs[0], new_join)
+
+    # Reintroduce the lifted dims
+    post_join_shape = []
+    for i, alloc_dims in enumerate(zip(*alloc_shapes)):
+        if i == axis:
+            # The alloc dim along the axis is the sum of all the pre-join alloc dims
+            post_join_shape.append(add(*alloc_dims))
+        else:
+            # Otherwise the shapes should all match. We prioritize constants if any
+            for best_alloc_dim in alloc_dims:
+                if isinstance(best_alloc_dim, Constant):
+                    break
+            post_join_shape.append(best_alloc_dim)
+
+    new_out = alloc(new_join, *post_join_shape)
+    copy_stack_trace(node.outputs[0], new_out)
+    return [new_out]
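
As a hedged illustration of what the two new rewrites aim to do (my sketch, not part of the commit; the exact output depends on which other specializations fire):

```python
import pytensor
import pytensor.tensor as pt
from pytensor.graph.rewriting.utils import rewrite_graph

v = pt.vector("v")
w = pt.vector("w")
n = pt.iscalar("n")

# local_dimshuffle_alloc: an expand_dims on top of an Alloc is absorbed into it,
# e.g. alloc(v, n, k).dimshuffle("x", 0, 1) -> alloc(v, 1, n, k)
expanded = pt.alloc(v, n, v.shape[0]).dimshuffle("x", 0, 1)
pytensor.dprint(rewrite_graph(expanded, include=("specialize",)))

# local_join_of_alloc: a Join of Allocs that only broadcast along a non-join
# axis becomes an Alloc of a Join of the core values, roughly
# alloc(join(1, v[None], w[None]), n, v.shape[0] + w.shape[0])
joined = pt.join(1, pt.alloc(v, n, v.shape[0]), pt.alloc(w, n, w.shape[0]))
pytensor.dprint(rewrite_graph(joined, include=("specialize",)))
```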

pytensor/tensor/rewriting/blockwise.py

Lines changed: 8 additions & 24 deletions
@@ -1,18 +1,15 @@
-from pytensor import Variable
 from pytensor.compile.mode import optdb
 from pytensor.graph import Constant, node_rewriter
 from pytensor.graph.replace import vectorize_node
 from pytensor.graph.rewriting.basic import copy_stack_trace, out2in
 from pytensor.tensor.basic import Alloc, ARange, alloc, shape_padleft
 from pytensor.tensor.blockwise import Blockwise
-from pytensor.tensor.elemwise import DimShuffle
 from pytensor.tensor.math import Dot
 from pytensor.tensor.rewriting.basic import (
     register_canonicalize,
     register_specialize,
     register_stabilize,
 )
-from pytensor.tensor.rewriting.uncanonicalize import local_dimshuffle_alloc
 from pytensor.tensor.shape import Reshape
 from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedSubtensor, Subtensor

@@ -71,7 +68,13 @@ def local_useless_unbatched_blockwise(fgraph, node):
 def local_eager_useless_unbatched_blockwise(fgraph, node):
     if isinstance(
         node.op.core_op,
-        Dot | Alloc | ARange | Subtensor | AdvancedSubtensor | AdvancedIncSubtensor,
+        Dot
+        | Alloc
+        | ARange
+        | Subtensor
+        | AdvancedSubtensor
+        | AdvancedIncSubtensor
+        | Reshape,
     ):
         # Many Dot-related rewrites (eg, all of BlasOpt) happen before specialize
         # These other Ops can't always be trivially vectorized at runtime,
@@ -90,18 +93,6 @@ def _squeeze_left(x, stop_at_dim: int | None = None):
     return x.squeeze(axis=tuple(range(squeeze_ndim)))


-def alloc_or_expand_dims_of_alloc(var: Variable) -> bool:
-    return var.owner and (
-        isinstance(var.owner.op, Alloc)
-        or (
-            isinstance(var.owner.op, DimShuffle)
-            and var.owner.inputs[0].owner
-            and isinstance(var.owner.inputs[0].owner.op, Alloc)
-        )
-    )
-
-
-@register_canonicalize("shape_unsafe")
 @register_specialize("shape_unsafe")
 @node_rewriter([Blockwise])
 def local_blockwise_alloc(fgraph, node):
@@ -119,20 +110,14 @@ def local_blockwise_alloc(fgraph, node):
     if not batch_ndim:
         return None

-    if not any(alloc_or_expand_dims_of_alloc(var) for var in node.inputs):
+    if not any(var.owner and isinstance(var.owner.op, Alloc) for var in node.inputs):
         return None

     new_inputs = []
     batch_shapes = []
     can_push_any_alloc = False
     for inp, inp_sig in zip(node.inputs, op.inputs_sig):
         if not all(inp.type.broadcastable[:batch_ndim]):
-            if inp.owner and isinstance(inp.owner.op, DimShuffle):
-                # Convert DimShuffle of Alloc to Alloc
-                new_inp = local_dimshuffle_alloc.transform(None, inp.owner)
-                if new_inp:
-                    [inp] = new_inp
-
             if inp.owner and isinstance(inp.owner.op, Alloc):
                 # Push batch dims from Alloc
                 value, *shape = inp.owner.inputs
@@ -217,7 +202,6 @@ def local_blockwise_alloc(fgraph, node):
     return new_outs


-@register_canonicalize
 @register_specialize
 @node_rewriter([Blockwise])
 def local_blockwise_reshape(fgraph, node):
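
For reference, a hedged sketch of why Reshape joins the core ops that are eagerly un-blockwised (my example, assuming current vectorize behavior, not verified against this commit): vectorizing a graph that reshapes can produce a Blockwise-wrapped Reshape, which the rewrites here turn back into a plain reshape on the batched input.

```python
import pytensor
import pytensor.tensor as pt
from pytensor.graph.replace import vectorize_graph

x = pt.vector("x", shape=(6,))
core_out = x.reshape((2, 3))

# Swap the core input for a batched one; the reshape may get wrapped in Blockwise
batch_x = pt.matrix("batch_x", shape=(None, 6))
batch_out = vectorize_graph(core_out, replace={x: batch_x})

fn = pytensor.function([batch_x], batch_out)
pytensor.dprint(fn)  # after rewrites, no Blockwise of Reshape should remain
```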

0 commit comments
