pytorch
diff --git a/‎backends/vulkan/runtime/api/containers/Tensor.h
Lines changed: 8 additions & 0 deletions b/‎backends/vulkan/runtime/api/containers/Tensor.h
Lines changed: 8 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ComputeGraph.h
Lines changed: 4 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ComputeGraph.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Lines changed: 5 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Lines changed: 5 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
Lines changed: 267 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
Lines changed: 267 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/softmax.h
Lines changed: 0 additions & 43 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/softmax.h
Lines changed: 0 additions & 43 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/softmax_channel.yaml renamed to ‎backends/vulkan/runtime/graph/ops/glsl/softmax.yaml
Lines changed: 4 additions & 4 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/softmax_channel.yaml renamed to ‎backends/vulkan/runtime/graph/ops/glsl/softmax.yaml
Lines changed: 4 additions & 4 deletions
@@ -395,6 +395,14 @@ class vTensor final {
     return packed_dim_;
   }
 
+  /*
+   * Returns the WHCN index of the dimension that is used to concatenate batches
+   * as an int32_t.
+   */
+  inline int32_t concat_dim() const {
+    return utils::safe_downcast<int32_t>(axis_map_.at(3));
+  }
+
   inline const std::vector<int64_t>& sizes() const {
     return sizes_;
   }
 
@@ -322,6 +322,10 @@ class ComputeGraph final {
     return values_.at(idx).toConstTensor().packed_dim();
   }
 
+  inline int32_t concat_dim_of(const ValueRef idx) const {
+    return values_.at(idx).toConstTensor().concat_dim();
+  }
+
   inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) {
     return values_.at(idx).toTensor().sizes_ubo();
   }
 
@@ -53,6 +53,11 @@
  */
 #define alignup4(x) ((x + 3) & -4)
 
+/*
+ * Fast modulo by 4 using bit masking
+ */
+#define mod4(x) (x & 3)
+
 /*
  * Find the packed dimension of a tensor given its strides. The packed dimension
  * is the "fastest moving" dimension which will have a stride of 1.
 
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define op1(X) ${OPERATOR1}
+
+#define op2(X, Y) ${OPERATOR2}
+
+${define_active_storage_type(STORAGE)}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)}
+
+${layout_declare_ubo(B, "ivec3", "tout_limits")}
+${layout_declare_ubo(B, "ivec4", "tin_sizes")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int packed_dim = 0;
+layout(constant_id = 4) const int reduce_dim = 0;
+layout(constant_id = 5) const int group_dim = 1;
+
+// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of
+// threads that will co-operate to compute one reduction output. There may be
+// multiple groups computing distinct reduction outputs within one work group.
+#define NWORKERS 4
+
+// Sets an upper limit on the total size of a work group based on how many
+// elements are allocated in the shared memory array below. Each thread in the
+// work group will write into its assigned element in the shared array.
+#define MAX_NTHREADS 16
+
+shared vec4 shared_vecs[MAX_NTHREADS];
+
+#include "indexing_utils.h"
+
+int tid_to_smi(const ivec2 tid) {
+  return tid.x + tid.y * NWORKERS;
+}
+
+/*
+ * The shaders below compute softmax for a tensor. Softmax is an interesting mix
+ * between a reduction operator and a unary elementwise operator, defined as
+ * exp(x) / (sum of exp(x)). The general flow of the computation is:
+ *
+ * First, find the maximum element along the reduction dim. The maximum element
+ * is used to preserve numerical stability, since division of exponents is
+ * translation invariant.
+ *
+ * Next, compute the sum of exp(x - max_element) along the reduction dim.
+ *
+ * Finally, for each element along the reduction dim, we compute the output as
+ * exp(x - max_element) / sum_of_exponents.
+ *
+ * The shaders below also utilize shared memory to have multiple threads help
+ * compute the max and sum reduction operations. A total of NGROUPS x NWORKERS
+ * threads are launched. Each group works on a unique reduction "row", and
+ * within a group NWORKERS threads co-operate to compute the max and sum of one
+ * "row". Each worker in the group is responsible for computing a partial output
+ * of the "row" and uploading it to shared memory; the overall reduction output
+ * can then be determined by aggregating the partial outputs stored in shared
+ * memory.
+ *
+ * As a caveat, this shader does not currently support cases where `batch` > 1
+ * and the reduce dim happens to also be the batch concatenation dim.  To support
+ * this, there will need to be additional logic to set the starting value of
+ * `scan_pos[reduce_dim]`. Since this is not expected to be a common use-case,
+ * supporting this case is left as an exercise for when it is required.
+ *
+ * As a final note, log softmax is supported with this shader as well since via
+ * the op1 and op2 macro definitions. See the corresponding YAML file for more
+ * details.
+ */
+
+/*
+ * Computes softmax where the reduction dim is orthogonal to the packed dim.
+ * This case is simpler because each element of a texel belongs to a separate
+ * reduction dim, meaning we don't have to perform reduction along a texel.
+ */
+void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
+  // shared memory index of this thread
+  const int smi = tid_to_smi(tid);
+  // used to iterate over all shared memory in the group
+  int group_i;
+
+  scan_pos[reduce_dim] = tid.x;
+  vec4 max_elements = load_texel(tin, scan_pos);
+  // This thread computes a partial maximum
+  for (int i = tid.x; i < tin_sizes[reduce_dim];
+       i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
+    max_elements = max(max_elements, load_texel(tin, scan_pos));
+  }
+  shared_vecs[smi] = max_elements;
+  barrier();
+  // Iterate over the partial maximums to obtain the overall maximum
+  group_i = tid.y * NWORKERS;
+  max_elements = shared_vecs[group_i++];
+  for (int i = 1; i < NWORKERS; ++i, group_i++) {
+    max_elements = max(max_elements, shared_vecs[group_i]);
+  }
+
+  scan_pos[reduce_dim] = tid.x;
+  vec4 denominators = vec4(0);
+  // Compute partial sum
+  for (int i = tid.x; i < tin_sizes[reduce_dim];
+       i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
+    denominators += exp(load_texel(tin, scan_pos) - max_elements);
+  }
+  shared_vecs[smi] = denominators;
+  barrier();
+  // Iterate over the partial sums to obtain the overall sum
+  group_i = tid.y * NWORKERS;
+  denominators = shared_vecs[group_i++];
+  for (int i = 1; i < NWORKERS; ++i, group_i++) {
+    denominators += shared_vecs[group_i];
+  }
+
+  // Determine if there are any padding elements in the final texel of the
+  // packed dimension
+  const int nspill = mod4(tin_sizes[packed_dim]);
+  // Detect if this thread is working on the final texels of the packed
+  // dimension, which may have padding elements
+  const bool is_last_texel =
+      scan_pos[packed_dim] == (tout_limits[packed_dim] - 1);
+
+  scan_pos[reduce_dim] = tid.x;
+  for (int i = tid.x; i < tin_sizes[reduce_dim];
+       i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
+    const vec4 numerators = op1(load_texel(tin, scan_pos) - max_elements);
+    vec4 outtex = op2(numerators, denominators);
+    // For the last texel in the packed dim, make sure that the padding elements
+    // are explicitly set to 0. Otherwise, they may influence computations later
+    // down the line.
+    if (is_last_texel && nspill > 0) {
+      [[unroll]] for (int i = nspill; i < 4; ++i) {
+        outtex[i] = 0;
+      }
+    }
+    write_texel(tout, scan_pos, outtex);
+  }
+}
+
+/*
+ * Compute softmax where the reduction dim is also the packed dim. This case is
+ * complex because the reduction needs to occur over the individual texels.
+ * Therefore, in this algorithm each element of the accumulator texels are
+ * themselves partial outputs. Special care has to be taken to ignore padding
+ * elements in texels (which occur when the size of the packed dim is not a
+ * multiple of 4) so that they do not influence the output of reduction.
+ */
+void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
+  // shared memory index of this thread
+  const int smi = tid_to_smi(tid);
+  // used to iterate over all shared memory in the group
+  int group_i;
+
+  const int nspill = mod4(tin_sizes[packed_dim]);
+  const int reduce_len = tin_sizes[packed_dim] - nspill;
+
+  scan_pos[reduce_dim] = tid.x;
+  vec4 max_elements = vec4(load_texel(tin, scan_pos).x);
+  for (int i = tid.x * 4; i < reduce_len;
+       i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
+    max_elements = max(max_elements, load_texel(tin, scan_pos));
+  }
+  // For the last texel in the dim, if there are padding elements then each
+  // element of the texel needs to be processed individually such that the
+  // padding elements are ignored
+  if (scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1 && nspill > 0) {
+    const vec4 intex = load_texel(tin, scan_pos);
+    for (int i = 0; i < nspill; ++i) {
+      max_elements.x = max(intex[i], max_elements.x);
+    }
+  }
+  shared_vecs[smi] = max_elements;
+  barrier();
+  // Iterate over the partial maximums to obtain the overall maximum
+  group_i = tid.y * NWORKERS;
+  max_elements = shared_vecs[group_i++];
+  for (int i = 1; i < NWORKERS; ++i, group_i++) {
+    max_elements = max(max_elements, shared_vecs[group_i]);
+  }
+  // Each element of the texel is itself a partial maximum; iterate over the
+  // texel to find the actual maximum
+  float max_element = max_elements.x;
+  [[unroll]] for (int i = 1; i < 4; ++i) {
+    max_element = max(max_elements[i], max_element);
+  }
+
+  scan_pos[reduce_dim] = tid.x;
+  vec4 denominators = vec4(0);
+  for (int i = tid.x * 4; i < reduce_len;
+       i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
+    denominators += exp(load_texel(tin, scan_pos) - max_element);
+  }
+  // For the last texel in the dim, if there are padding elements then each
+  // element of the texel needs to be processed individually such that the
+  // padding elements are ignored
+  if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) {
+    const vec4 intex = load_texel(tin, scan_pos);
+    for (int i = 0; i < nspill; ++i) {
+      denominators.x += exp(intex[i] - max_element);
+    }
+  }
+  shared_vecs[smi] = denominators;
+  barrier();
+  // Iterate over the partial sums to obtain the overall sum
+  group_i = tid.y * NWORKERS;
+  denominators = shared_vecs[group_i++];
+  for (int i = 1; i < NWORKERS; ++i, group_i++) {
+    denominators += shared_vecs[group_i];
+  }
+  // Reduce over the accumulated texel to find the overall sum
+  float denominator = 0;
+  [[unroll]] for (int i = 0; i < 4; ++i) {
+    denominator += denominators[i];
+  }
+
+  scan_pos[reduce_dim] = tid.x;
+  for (int i = tid.x * 4; i < reduce_len;
+       i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
+    const vec4 numerators = op1(load_texel(tin, scan_pos) - max_element);
+    write_texel(tout, scan_pos, op2(numerators, denominator));
+  }
+  // For the last texel in the dim, if there are padding elements then the
+  // padding elements need to be set to 0 explicitly, otherwise they may
+  // influence subsequent operations.
+  if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) {
+    const vec4 numerator = op1(load_texel(tin, scan_pos) - max_element);
+    vec4 outtex = op2(numerator, denominator);
+    [[unroll]] for (int i = nspill; i < 4; ++i) {
+      outtex[i] = 0;
+    }
+    write_texel(tout, scan_pos, outtex);
+  }
+}
+
+void main() {
+  ivec3 scan_pos = ivec3(gl_GlobalInvocationID);
+  scan_pos[reduce_dim] = 0;
+
+  const ivec2 tid = ivec2(
+      gl_LocalInvocationID[reduce_dim],
+      gl_LocalInvocationID[group_dim]);
+
+  if (any(greaterThanEqual(scan_pos, tout_limits))) {
+    return;
+  }
+
+  if (reduce_dim != packed_dim) {
+    softmax_nonpacked_dim(tid, scan_pos);
+  } else {
+    softmax_packed_dim(tid, scan_pos);
+  }
+}
@@ -4,18 +4,18 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-softmax_channel:
+softmax:
   parameter_names_with_default_values:
     OPERATOR1: exp(X)
     OPERATOR2: X / Y
-    NDIM: 3
     DTYPE: float
+    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: softmax_channel
-    - NAME: log_softmax_channel
+    - NAME: softmax
+    - NAME: log_softmax
       OPERATOR1: X
       OPERATOR2: X - log(Y)
Original file line number	Diff line number	Diff line change
`@@ -395,6 +395,14 @@ class vTensor final {`
`395`	`395`	`return packed_dim_;`
`396`	`396`	`}`
`397`	`397`
	`398`	`+ /*`
	`399`	`+ * Returns the WHCN index of the dimension that is used to concatenate batches`
	`400`	`+ * as an int32_t.`
	`401`	`+ */`
	`402`	`+ inline int32_t concat_dim() const {`
	`403`	`+ return utils::safe_downcast<int32_t>(axis_map_.at(3));`
	`404`	`+ }`
	`405`	`+`
`398`	`406`	`inline const std::vector<int64_t>& sizes() const {`
`399`	`407`	`return sizes_;`
`400`	`408`	`}`
Original file line number	Diff line number	Diff line change
`@@ -322,6 +322,10 @@ class ComputeGraph final {`
`322`	`322`	`return values_.at(idx).toConstTensor().packed_dim();`
`323`	`323`	`}`
`324`	`324`
	`325`	`+ inline int32_t concat_dim_of(const ValueRef idx) const {`
	`326`	`+ return values_.at(idx).toConstTensor().concat_dim();`
	`327`	`+ }`
	`328`	`+`
`325`	`329`	`inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) {`
`326`	`330`	`return values_.at(idx).toTensor().sizes_ubo();`
`327`	`331`	`}`