pytorch
diff --git a/‎backends/vulkan/runtime/graph/ComputeGraph.cpp
Lines changed: 16 additions & 5 deletions b/‎backends/vulkan/runtime/graph/ComputeGraph.cpp
Lines changed: 16 additions & 5 deletions
diff --git a/‎backends/vulkan/runtime/graph/ComputeGraph.h
Lines changed: 17 additions & 1 deletion b/‎backends/vulkan/runtime/graph/ComputeGraph.h
Lines changed: 17 additions & 1 deletion
diff --git a/‎backends/vulkan/runtime/graph/ops/PrepackNode.cpp
Lines changed: 2 additions & 2 deletions b/‎backends/vulkan/runtime/graph/ops/PrepackNode.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl
Lines changed: 21 additions & 29 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl
Lines changed: 21 additions & 29 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Lines changed: 9 additions & 7 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Lines changed: 9 additions & 7 deletions
@@ -132,16 +132,27 @@ ValueRef ComputeGraph::add_tensor(
       sizes, dtype, suggested_storage_type(), memory_layout, shared_object_idx);
 }
 
+ValueRef ComputeGraph::add_tensor_like( // adds a tensor with vref's sizes/dtype
+    const ValueRef vref, // value that is read below as a TensorRef
+    const api::StorageType storage_type, // forwarded to add_tensor()
+    const api::GPUMemoryLayout memory_layout) { // forwarded to add_tensor()
+  TensorRef& tref = get_val(vref).toTensorRef(); /* supplies sizes and dtype.
+   * NOTE(review): confirm add_tensor() cannot invalidate this reference. */
+  return add_tensor(tref.sizes, tref.dtype, storage_type, memory_layout);
+}
+
+ValueRef ComputeGraph::add_tensor_like( // adds a tensor with vref's sizes/dtype
+    const ValueRef vref, // value that is read below as a TensorRef
+    const api::GPUMemoryLayout memory_layout) { // forwarded to add_tensor()
+  TensorRef& tref = get_val(vref).toTensorRef(); // supplies sizes and dtype
+  return add_tensor(tref.sizes, tref.dtype, memory_layout); // no storage arg
+}
+
 ValueRef ComputeGraph::add_tensor( // overload without an explicit layout
     const std::vector<int64_t>& sizes,
     const api::ScalarType dtype,
     const int64_t shared_object_idx) {
   return add_tensor( // memory layout comes from suggested_memory_layout(sizes)
-      sizes,
-      dtype,
-      suggested_storage_type(),
-      suggested_memory_layout(sizes),
-      shared_object_idx);
+      sizes, dtype, suggested_memory_layout(sizes), shared_object_idx);
 }
 
 ValueRef ComputeGraph::add_tensorref(
 
@@ -191,9 +191,25 @@ class ComputeGraph final {
    */
   ValueRef add_tensor(
       const std::vector<int64_t>& sizes,
-      const api::ScalarType dtype = api::ScalarType::Float,
+      const api::ScalarType dtype,
       const int64_t shared_object_idx = -1);
 
+  /*
+   * Add a `vTensor` value to the graph that takes its sizes and dtype from the
+   * `TensorRef` stored at `vref`; `storage_type` and `memory_layout` are passed
+   * through to `add_tensor`. Returns the `ValueRef` that `add_tensor` returns.
+   */
+  ValueRef add_tensor_like(
+      const ValueRef vref,
+      const api::StorageType storage_type,
+      const api::GPUMemoryLayout memory_layout);
+
+  /*
+   * Add a `vTensor` value to the graph that takes its sizes and dtype from the
+   * `TensorRef` stored at `vref` and uses the given `memory_layout`. The
+   * suggested storage type will be used to construct the `vTensor`.
+   */
+  ValueRef add_tensor_like(
+      const ValueRef vref,
+      const api::GPUMemoryLayout memory_layout);
+
   /*
    * Add a `TensorRef` value to the graph with the specific properties. A
    * `TensorRef` is a reference to a `vTensor` whose data is stored in an
 
@@ -36,8 +36,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
   api::Context* const context = graph->context();
   api::PipelineBarrier pipeline_barrier{};
 
-  TensorRef tref = graph->get_val(tref_).toTensorRef();
-  vTensor packed = graph->get_val(packed_).toTensor();
+  TensorRef& tref = graph->get_val(tref_).toTensorRef();
+  vTensor& packed = graph->get_val(packed_).toTensor();
 
   size_t numel = api::utils::multiply_integers(tref.sizes);
   api::StorageBuffer staging(graph->context(), tref.dtype, numel);
 
@@ -33,8 +33,8 @@ layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
 original_sizes;
 
 // Corresponds to {8,12}, the padded {C,N} sizes, in the example below.
-layout(set = 0, binding = 4) uniform PRECISION restrict AlignedSizes {
-  ivec4 data;
+layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
+  ivec2 data;
 }
 padded_sizes;
 
@@ -94,39 +94,31 @@ void main() {
       base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data);
 
   // Re-map the normal CPU buffer indices to special indices, through a series
-  // of permutations: reshape is a no-op to the underlying indices, and permute
-  // is one of the hardest math problems I've ever solved.
-  //
+  // of mappings: reshape is a no-op to the underlying indices, pad is hard, and
+  // permute is one of the hardest math problems I've ever solved.
+  const int Np = padded_sizes.data.y;
+  const int Cp = padded_sizes.data.x;
+  const int N = original_sizes.data.w;
+  const int C = original_sizes.data.z;
+  const int H = original_sizes.data.y;
+  const int W = original_sizes.data.x;
+
   // Undo step 6 permute: (4,3,3,24) -> (3,4,3,24)
   // Undo step 4 permute: (12,3,2,12) -> (12,2,3,12)
   // Undo step 3 permute, part 1: (12,2,3h,3w,4) -> (12,2,3h,4,3w)
   // Undo step 3 permute, part 2: (12,2,3h,4,3w) -> (12,2,4,3h,3w)
-  const ivec4 p1 = SWAP_DIMS(
-      p0,
-      4,
-      (padded_sizes.data.w / 4),
-      (padded_sizes.data.y * padded_sizes.data.z * padded_sizes.data.x));
-  const ivec4 p2 = SWAP_DIMS(
-      p1,
-      padded_sizes.data.y,
-      (padded_sizes.data.z / 4),
-      (padded_sizes.data.x * 4));
-  const ivec4 p3 = SWAP_DIMS(p2, padded_sizes.data.x, 4, 1);
-  const ivec4 p4 = SWAP_DIMS(p3, padded_sizes.data.y, 4, padded_sizes.data.x);
+  const ivec4 p1 = SWAP_ADJ_DIMS(p0, 4, (Np / 4), (H * Cp * W));
+  const ivec4 p2 = SWAP_ADJ_DIMS(p1, H, (Cp / 4), (W * 4));
+  const ivec4 p3 = SWAP_ADJ_DIMS(p2, W, 4, 1);
+  const ivec4 p4 = SWAP_ADJ_DIMS(p3, H, 4, W);
 
-  // For values in the padded region, write zero instead of buffer data.
-  //
   // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
-  const ivec4 c = p4 %
-      (padded_sizes.data.z * padded_sizes.data.y * padded_sizes.data.x) /
-      (padded_sizes.data.y * padded_sizes.data.x);
-  const ivec4 n =
-      p4 / (padded_sizes.data.z * padded_sizes.data.y * padded_sizes.data.x);
-  const ivec4 p5 = p4 -
-      n * (padded_sizes.data.z - original_sizes.data.z) * padded_sizes.data.y *
-          padded_sizes.data.x;
-  const ivec4 mask = ivec4(greaterThanEqual(c, original_sizes.data.zzzz)) |
-      ivec4(greaterThanEqual(n, original_sizes.data.wwww));
+  // For values in the padded region, write zero instead of buffer data.
+  const ivec4 c = p4 % (Cp * H * W) / (H * W);
+  const ivec4 n = p4 / (Cp * H * W);
+  const ivec4 p5 = p4 - n * (Cp - C) * H * W;
+  const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
+      ivec4(greaterThanEqual(n, ivec4(N)));
 
   ${T[DTYPE]} val_x = mix(buffer_in.data[p5.x], 0, mask.x);
   ${T[DTYPE]} val_y = mix(buffer_in.data[p5.y], 0, mask.y);
 
@@ -46,10 +46,12 @@
 #define STRIDE_HEIGHT_PACKED(vec) (vec.x)
 
 // Given a buffer(1-D) index cur, compute a new index where the corresponding
-// tensor(N-D)'s x and y dimensions are swapped, and size is of the M-D plane of
-// dimensions lower than x and y.
-#define SWAP_DIMS(cur, x, y, size)                          \
-  cur +                                                     \
-      size*(                                                \
-          (1 - y) * ((cur % (x * y * size)) / (y * size)) + \
-          (x - 1) * ((cur % (y * size)) / size))
+// tensor(N-D)'s adjacent dimensions are swapped. x and y are the sizes of the
+// two swapped dimensions; plane is the product of the sizes after them. E.g.,
+// swapping dimensions 0,1 of shape {4,3,2,24} to get {3,4,2,24} uses x=4, y=3,
+// plane=2*24=48. Arguments expand unparenthesized; parenthesize compound ones.
+#define SWAP_ADJ_DIMS(cur, x, y, plane)                       \
+  cur +                                                       \
+      plane*(                                                 \
+          (1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
+          (x - 1) * ((cur % (y * plane)) / plane))