Implement SDPA + KV-Cache operator (#5799)

SS-JIA · facebook-github-bot · commit 6e871c3b617f · 2024-10-07T14:51:46.000-07:00
Summary: Pull Request resolved: #5799 ## Context As title, this diff adds an implementation for a fused SDPA + KV-Cache update operator which will be used in LLaMA models. Currently the SDPA portion of the operator is implemented via its constituent operators, but a future optimization opportunity would be to implement a single flash attention shader. ## Reference Implementation For future reference, a reference implementation of the SDPA + KV cache update mechanism is shown below. This reference implementation was originally used to check intermediate outputs but in the end I decided to compare against the `sdpa_with_kv_cache` operator in `extension/llm` for simplicity. ``` at::Tensor convert_boolean_attn_mask( const at::Tensor& attn_mask, caffe2::TypeMeta dtype) { // Convert boolean mask to additive mask; need to invert mask to indicate what // to mask *out*. if (attn_mask.dtype() == at::kBool) { return at::where( attn_mask.logical_not(), -std::numeric_limits<double>::infinity(), at::scalar_tensor( 0.0, at::TensorOptions().dtype(dtype).device(attn_mask.device()))); } // Otherwise, attn_mask represents an additive attention tensor return attn_mask; } at::Tensor construct_attention_mask( const at::Tensor& q, const at::Tensor& k_cache, const int start_pos) { const int max_seq_len = k_cache.size(1); const int seq_len = q.size(1); at::Tensor attn_mask_base = at::ones({max_seq_len, start_pos + seq_len}, q.options().dtype(at::kBool)) .tril(); at::Tensor attn_mask_sliced = at::slice(attn_mask_base, 0, start_pos, start_pos + seq_len); attn_mask_sliced = convert_boolean_attn_mask(attn_mask_sliced, q.dtype()); return attn_mask_sliced; } std::vector<at::Tensor> sdpa_reference_impl( const at::Tensor& q_projected, const at::Tensor& k_projected, const at::Tensor& v_projected, at::Tensor& key_cache, at::Tensor& value_cache, const int64_t start_pos, const int64_t seq_len, const c10::optional<at::Tensor> __attn_mask_ignored, const double dropout_p, const bool is_causal, const 
c10::optional<double> scale) { at::Tensor attn_mask = construct_attention_mask(q_projected, key_cache, start_pos); at::Tensor key_cache_updated = at::slice_scatter( key_cache, k_projected, 1, start_pos, start_pos + k_projected.size(1)); at::Tensor value_cache_updated = at::slice_scatter( value_cache, v_projected, 1, start_pos, start_pos + v_projected.size(1)); at::Tensor key_cache_sliced = at::slice(key_cache_updated, 1, 0, start_pos + q_projected.size(1)); at::Tensor value_cache_sliced = at::slice(value_cache_updated, 1, 0, start_pos + q_projected.size(1)); at::Tensor q_transposed = q_projected.transpose(1, 2); at::Tensor k_transposed = key_cache_sliced.transpose(1, 2); at::Tensor v_transposed = value_cache_sliced.transpose(1, 2); // Skip doing repeat_interleave; assume that num_attention_heads == // num_kv_heads float scale_factor = 1.0 / sqrt(q_transposed.size(-1)); at::Tensor k_transposed_2 = k_transposed.transpose(-2, -1); at::Tensor attn_weight_prescale = at::matmul(q_transposed, k_transposed_2); at::Tensor attn_weight = attn_weight_prescale * scale_factor + attn_mask; at::Tensor attn_weight_softmax = at::softmax(attn_weight, -1); at::Tensor out = at::matmul(attn_weight_softmax, v_transposed); return { out.transpose(1, 2), key_cache_sliced, value_cache_sliced, q_transposed, k_transposed, v_transposed, k_transposed_2, attn_weight_prescale, attn_weight, attn_weight_softmax, out, }; } ``` ghstack-source-id: 246640547 Reviewed By: kimishpatel Differential Revision: D63724114 fbshipit-source-id: c85afc2f8eade8e0ac6e348eabbe608e5a0efce6
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -254,7 +254,7 @@ class ComputeGraph final {
 #undef GET_AND_CHECK_VAL_AS_TYPE_FNS
 
   inline bool val_is_none(const ValueRef idx) {
-    return values_.at(idx).isNone();
+    return idx == kDummyValueRef ? true : values_.at(idx).isNone();
   }
 
   inline TypeTag get_val_type(const ValueRef idx) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl
@@ -0,0 +1,80 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type(STORAGE)}
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+${layout_declare_tensor(B, "w", "cache", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "projected", DTYPE, STORAGE)}
+$if STORAGE == "buffer":
+  ${layout_declare_ubo(B, "int", "projected_numel")}
+  ${layout_declare_ubo(B, "ivec4", "cache_strides")}
+  ${layout_declare_ubo(B, "int", "input_pos")}
+$else:
+  ${layout_declare_ubo(B, "ivec3", "projected_limits")}
+  ${layout_declare_ubo(B, "int", "input_pos")}
+
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * t_cache will have sizes of (max_batch_size, max_seq_len, n_heads, head_dim).
+ * t_projected will have sizes of (batch_size, seq_len, n_heads, head_dim).
+ *
+ * The cache update inserts the values of t_projected into t_cache at the index
+ * specified by input_pos at the seq_len dimension. It is equivalent to calling
+ *
+ * t_cache = t_cache.slice_scatter(
+ *     t_projected, dim=1, start=input_pos, end=input_pos+seq_len)
+ *
+ * Note that this shader is implemented assuming that max_batch_size is 1.
+ */
+
+#ifdef USING_BUFFER
+
+/***************************
+ ** Buffer Implementation **
+ ***************************/
+
+void main() {
+  const int src_bufi = int(gl_GlobalInvocationID.x);
+  if (src_bufi >= projected_numel) {
+    return;
+  }
+  // The destination is the source index moved forward by input_pos steps along
+  // the seq_len dimension; cache_strides holds the strides of the cache tensor.
+  const int dst_bufi = src_bufi + input_pos * cache_strides.z;
+  cache[dst_bufi] = projected[src_bufi];
+}
+
+#else
+
+/****************************
+ ** Texture Implementation **
+ ****************************/
+
+// Note that this shader assumes that the tensors are width packed, i.e.
+// packed_dim = 0
+void main() {
+  const ivec3 src_pos = ivec3(gl_GlobalInvocationID);
+
+  // Invocations at or beyond projected_limits have nothing to copy.
+  if (any(greaterThanEqual(src_pos, projected_limits))) {
+    return;
+  }
+
+  // The destination texel is the source position shifted by input_pos along
+  // the z axis of the texture; x and y are carried over unchanged.
+  const ivec3 dst_pos = src_pos + ivec3(0, 0, input_pos);
+
+  write_texel(cache, dst_pos, load_texel(projected, src_pos));
+}
+
+#endif // USING_BUFFER
diff --git a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+kv_cache_update:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    STORAGE:
+      - VALUE: buffer
+      - VALUE: texture3d
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: kv_cache_update
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type(STORAGE)}
+${define_required_extensions(DTYPE)}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "rw", "attn_weight", DTYPE, STORAGE)}
+
+$if STORAGE == "buffer":
+  ${layout_declare_ubo(B, "ivec4", "attn_weight_sizes")}
+  ${layout_declare_ubo(B, "ivec4", "attn_weight_strides")}
+$else:
+  ${layout_declare_ubo(B, "ivec3", "attn_weight_limits")}
+
+${layout_declare_ubo(B, "int", "input_pos")}
+${layout_declare_ubo(B, "float", "scale")}
+
+
+#include "indexing_utils.h"
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Bit pattern of IEEE-754 float -inf (sign 1, exponent all 1s, mantissa 0).
+// It must be reinterpreted, not numerically converted, to actually yield -inf.
+#define NEGATIVE_INF_BITS 0xFF800000u
+#define negative_infinity uintBitsToFloat(NEGATIVE_INF_BITS)
+
+#ifdef USING_BUFFER
+
+/*
+ * This implementation applies a scale and mask to the attention weight tensor
+ * of an SDPA block. The sizes of the attention weight tensor are
+ * (batch_size, n_heads, seq_len, input_pos + seq_len).
+ * Conceptually the weights represent the relationship between each token in the
+ * sequence with each token preceding it.
+ *
+ * The scale applied is 1.0 / sqrt(head_dim_length)
+ *
+ * The mask applied is a bit more complicated. Imagine you create a square
+ * matrix of size (input_pos + seq_len, input_pos + seq_len), and then set the
+ * section above the main diagonal of the matrix to -inf. Then, slice the matrix
+ * along the row dimension from input_pos to input_pos + seq_len. You end up
+ * with a partial mask with size (seq_len, input_pos + seq_len). This is the
+ * mask that is applied to the attention weight.
+ *
+ * In the shader, instead of generating the mask, the index of the element is
+ * inspected to determine if it would have been masked. Given an element at
+ * tensor index (n, c, h, w), it is masked if w > h + input_pos.
+ */
+
+/***************************
+ ** Buffer Implementation **
+ ***************************/
+
+void main() {
+  // One invocation per (x, y, z) position; the fourth index component is 0.
+  const ivec4 tidx = ivec4(ivec3(gl_GlobalInvocationID), 0);
+
+  if (any(greaterThanEqual(tidx, attn_weight_sizes))) {
+    return;
+  }
+
+  const int bufi = tidx_to_bufi(tidx, attn_weight_strides);
+
+  // An element is kept when x <= y + input_pos. Kept elements are scaled in
+  // place; every other element is overwritten with the mask value.
+  const bool is_masked = tidx.x > tidx.y + input_pos;
+  if (is_masked) {
+    attn_weight[bufi] = T(negative_infinity);
+  } else {
+    const T scale_conv = T(scale);
+    attn_weight[bufi] = attn_weight[bufi] * scale_conv;
+  }
+}
+
+#else
+
+/****************************
+ ** Texture Implementation **
+ ****************************/
+
+/*
+ * This implementation assumes that the attention weight is width packed, i.e.
+ * the packed dim of the attn_weight is 0.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, attn_weight_limits))) {
+    return;
+  }
+
+  // Lane i of this texel is compared at width index pos.x * 4 + i; any lane
+  // whose index exceeds pos.y + input_pos is replaced with the mask value.
+  const int base_w = pos.x * 4;
+  vec4 texel = imageLoad(attn_weight, pos) * scale;
+  [[unroll]] for (int lane = 0; lane < 4; ++lane) {
+    if (base_w + lane > pos.y + input_pos) {
+      texel[lane] = negative_infinity;
+    }
+  }
+  write_texel(attn_weight, pos, texel);
+}
+
+#endif // USING_BUFFER
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml
@@ -0,0 +1,13 @@
+sdpa_attn_weight_scale_and_mask:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    STORAGE:
+      - VALUE: buffer
+      - VALUE: texture3d
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: sdpa_attn_weight_scale_and_mask
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -48,16 +48,10 @@ void resize_matmul_node(
   const int out_rows = mat2_is_transposed ? utils::val_at(-2, mat2->sizes())
                                           : utils::val_at(-1, mat2->sizes());
 
-  std::vector<int64_t> new_out_sizes(3);
-  if (mat1->sizes().size() == 2) {
-    new_out_sizes.resize(2);
-    new_out_sizes.at(0) = out_cols;
-    new_out_sizes.at(1) = out_rows;
-  } else {
-    new_out_sizes.at(0) = mat1->sizes().at(0);
-    new_out_sizes.at(1) = out_cols;
-    new_out_sizes.at(2) = out_rows;
-  }
+  const int64_t out_dim = out->dim();
+  std::vector<int64_t> new_out_sizes(mat1->sizes());
+  new_out_sizes.at(out_dim - 1) = out_rows;
+  new_out_sizes.at(out_dim - 2) = out_cols;
 
   out->virtual_resize(new_out_sizes);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
diff --git a/backends/vulkan/test/op_tests/sdpa_test.cpp b/backends/vulkan/test/op_tests/sdpa_test.cpp
diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl

Original file line number	Diff line number	Diff line change
`@@ -254,7 +254,7 @@ class ComputeGraph final {`
`254`	`254`	`#undef GET_AND_CHECK_VAL_AS_TYPE_FNS`
`255`	`255`
`256`	`256`	`inline bool val_is_none(const ValueRef idx) {`
`257`		`- return values_.at(idx).isNone();`
	`257`	`+ return idx == kDummyValueRef ? true : values_.at(idx).isNone();`
`258`	`258`	`}`
`259`	`259`
`260`	`260`	`inline TypeTag get_val_type(const ValueRef idx) {`