Commit 37492bd

[executorch] Custom op for fast hadamard transform kernel
Pull Request resolved: #5291

Custom op support for Fast Hadamard Transform.

ghstack-source-id: 242138454
@exported-using-ghexport

Differential Revision: [D60530438](https://our.internmc.facebook.com/intern/diff/D60530438/)
1 parent 4c1d3de commit 37492bd

File tree

10 files changed: +316, -6 lines

extension/llm/custom_ops/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
   add_library(
     custom_ops_aot_lib SHARED
     ${_custom_ops__srcs} ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_fast_hadamard_transform_aten.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop.cpp
   )
   target_include_directories(
extension/llm/custom_ops/op_fast_hadamard_transform.cpp

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h>
#include <executorch/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h>
#include <executorch/kernels/optimized/utils/llvmMathExtras.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h> // For apply_over_dim.
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {
namespace native {

Tensor& fast_hadamard_transform_out(
    RuntimeContext& ctx,
    const Tensor& mat,
    Tensor& out) {
  ET_KERNEL_CHECK_MSG(
      ctx,
      resize_tensor(out, mat.sizes()) == Error::Ok,
      InvalidArgument,
      out,
      "Failed to resize output tensor.");

  ET_KERNEL_CHECK(
      ctx, mat.scalar_type() == out.scalar_type(), InvalidArgument, out);

  if (mat.dim() == 0) {
    return out;
  }

  ET_KERNEL_CHECK_MSG(
      ctx,
      mat.strides().back() == 1,
      InvalidArgument,
      out,
      "input matrix that isn't contiguous in the last dimension is not supported!");

  const auto last_dim_size = mat.sizes().back();
  const auto divisible_by_28 = last_dim_size % 28 == 0;
  auto power_of_two_size = divisible_by_28 ? last_dim_size / 28 : last_dim_size;
  ET_KERNEL_CHECK_MSG(
      ctx,
      (power_of_two_size & (power_of_two_size - 1)) == 0,
      InvalidArgument,
      out,
      "This implementation requires power-of-2 (or power-of-2 * 28) input size in the last dimension!");

  const auto log2_power_of_two_size = executorch::llvm::countTrailingZeros(
      static_cast<unsigned int>(power_of_two_size),
      executorch::llvm::ZeroBehavior::ZB_Undefined);

  ET_SWITCH_FLOATH_TYPES(mat.scalar_type(), ctx, __func__, CTYPE, [&] {
    const CTYPE* const mat_data = mat.const_data_ptr<CTYPE>();
    CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();

    std::memcpy(out_data, mat_data, mat.numel() * sizeof(CTYPE));

    if (divisible_by_28) {
      apply_over_dim(
          [log2_power_of_two_size, out_data](
              const size_t size, const size_t stride, const size_t base) {
            executorch::fast_hadamard_transform_28N(
                out_data + base, log2_power_of_two_size);
          },
          out,
          out.dim() - 1);
    } else {
      apply_over_dim(
          [log2_power_of_two_size, out_data](
              const size_t size, const size_t stride, const size_t base) {
            executorch::fast_hadamard_transform(
                out_data + base, log2_power_of_two_size);
          },
          out,
          out.dim() - 1);
    }
  });
  return out;
}
} // namespace native
} // namespace executor
} // namespace torch

EXECUTORCH_LIBRARY(
    llama,
    "fast_hadamard_transform.out",
    torch::executor::native::fast_hadamard_transform_out);
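
The only shape constraint the kernel enforces is on the last dimension: after optionally dividing out a factor of 28, the size must be a power of two. The 28N path covers sizes such as 14336 (28 * 512), one of the llama3 demo sizes the meta kernel later in this diff expects, while 128 takes the plain power-of-two path. A minimal Python sketch of that acceptance check, mirroring the kernel's logic (the helper name last_dim_supported is illustrative, not part of this commit):

def last_dim_supported(n: int) -> bool:
    # Mirror the kernel: strip an optional factor of 28, then require the
    # remainder to be a power of two (size 0 is also accepted, like the kernel).
    if n % 28 == 0:
        n //= 28
    return (n & (n - 1)) == 0

assert last_dim_supported(128)     # power of two
assert last_dim_supported(14336)   # 28 * 512
assert not last_dim_supported(96)  # neither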
extension/llm/custom_ops/op_fast_hadamard_transform.h

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch::executor::native {

// Compute the fast Walsh-Hadamard transform
// (https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform)
// of mat along the last dimension (which must be contiguous).
//
// mat.sizes().back() is currently required to be either a power of
// two, or 28 * a power of two.
Tensor& fast_hadamard_transform_out(
    RuntimeContext& ctx,
    const Tensor& mat,
    Tensor& out);
} // namespace torch::executor::native
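
As a point of reference, the normalized Walsh-Hadamard transform described by this header can be sketched in a few lines of Python/PyTorch. This is a hedged reference sketch, not part of the commit: it assumes a power-of-two last dimension and the same 1/sqrt(n) scaling used by the C++ test's reference further down; reference_fwht is an illustrative name.

import math
import torch

def reference_fwht(x: torch.Tensor) -> torch.Tensor:
    # Normalized fast Walsh-Hadamard transform over the last dimension.
    # Assumes x.shape[-1] is a power of two.
    n = x.shape[-1]
    out = x.clone().contiguous()
    h = 1
    while h < n:
        # View each row as blocks of 2*h elements, split into two halves of h.
        blocks = out.view(*x.shape[:-1], n // (2 * h), 2, h)
        a = blocks[..., 0, :].clone()
        b = blocks[..., 1, :].clone()
        blocks[..., 0, :] = a + b  # butterfly: sums into the first half
        blocks[..., 1, :] = a - b  # differences into the second half
        h *= 2
    return out / math.sqrt(n)

For a power-of-two last dimension this should agree with the custom op's output up to floating-point error once the AOT library registered below is loaded.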
extension/llm/custom_ops/op_fast_hadamard_transform_aten.cpp

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/extension/aten_util/make_aten_functor_from_et_functor.h>
#include <executorch/extension/llm/custom_ops/op_fast_hadamard_transform.h>

#include <torch/library.h>

namespace torch::executor::native {
namespace {
Tensor& fast_hadamard_transform_out_no_context(const Tensor& vec, Tensor& out) {
  exec_aten::RuntimeContext context;
  return fast_hadamard_transform_out(context, vec, out);
}
at::Tensor fast_hadamard_transform_aten(const at::Tensor& vec) {
  auto out = at::empty_like(vec);
  WRAP_TO_ATEN(fast_hadamard_transform_out_no_context, 1)
  (vec, out);
  return out;
}
} // namespace
} // namespace torch::executor::native

TORCH_LIBRARY_FRAGMENT(llama, m) {
  m.def("fast_hadamard_transform(Tensor mat) -> Tensor");
  m.def(
      "fast_hadamard_transform.out(Tensor mat, *, Tensor(a!) out) -> Tensor(a!)");
}

TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) {
  m.impl(
      "fast_hadamard_transform",
      torch::executor::native::fast_hadamard_transform_aten);
  m.impl(
      "fast_hadamard_transform.out",
      WRAP_TO_ATEN(
          torch::executor::native::fast_hadamard_transform_out_no_context, 1));
}
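
With these registrations, both overloads should be callable from a regular PyTorch process once the shared library is loaded (the Python wrapper further down does exactly that). A minimal, hedged usage sketch; the shapes and the allclose check are illustrative:

import torch

# Assumes libcustom_ops_aot_lib has already been loaded, e.g. by importing
# the sdpa_with_kv_cache module shown below.
x = torch.randn(4, 128)  # last dimension is a power of two
y = torch.ops.llama.fast_hadamard_transform(x)

y_out = torch.empty_like(x)
torch.ops.llama.fast_hadamard_transform.out(x, out=y_out)
assert torch.allclose(y, y_out)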

extension/llm/custom_ops/op_sdpa_aot.cpp

Lines changed: 3 additions & 3 deletions
@@ -16,7 +16,7 @@ namespace torch {
 namespace executor {
 
 namespace native {
-
+namespace {
 Tensor& sdpa_with_kv_cache_out_no_context(
     const Tensor& q_projected,
     const Tensor& k_projected,
@@ -81,12 +81,12 @@ at::Tensor sdpa_with_kv_cache_aten(
       output);
   return output;
 }
-
+} // namespace
 } // namespace native
 } // namespace executor
 } // namespace torch
 
-TORCH_LIBRARY(llama, m) {
+TORCH_LIBRARY_FRAGMENT(llama, m) {
   m.def(
       "sdpa_with_kv_cache(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, "
       "Tensor(b!) value_cache, SymInt start_pos, SymInt seq_len, Tensor? attn_mask=None, "

extension/llm/custom_ops/sdpa_with_kv_cache.py

Lines changed: 12 additions & 0 deletions
@@ -20,13 +20,17 @@
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
+    op2 = torch.ops.llama.fast_hadamard_transform.default
+    assert op2 is not None
 except:
     libs = list(Path(__file__).parent.resolve().glob("libcustom_ops_aot_lib.*"))
     assert len(libs) == 1, f"Expected 1 library but got {len(libs)}"
     logging.info(f"Loading custom ops library: {libs[0]}")
     torch.ops.load_library(libs[0])
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
+    op2 = torch.ops.llama.fast_hadamard_transform.default
+    assert op2 is not None
 
 custom_ops_lib = torch.library.Library("llama", "IMPL")
 
@@ -126,3 +130,11 @@ def sdpa_with_kv_cache_meta(
     )
 
     return torch.empty_like(query)
+
+
+@impl(custom_ops_lib, "fast_hadamard_transform", "Meta")
+def fast_hadamard_transform_meta(mat):
+    # assert(mat.strides[-1] == 1, "input matrix must be contiguous in the last dimension!")
+    # assert(mat.shape[-1] == 128 or mat.shape[-1] == 14336, "unexpected input size for llama3 demo!")
+    # assert(mat.is_contiguous(), "input matrix must be contiguous currently!")
+    return torch.empty_like(mat)
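
The Meta registration above is what allows the op to run under fake tensors during tracing and export. A hedged sketch of exercising it with torch.export, assuming the custom ops library has been loaded by the try/except block earlier in this file; the Rotate module and the 2x128 input are illustrative:

import torch

class Rotate(torch.nn.Module):
    def forward(self, x):
        # Eager mode dispatches to the AOT kernel; tracing uses the meta kernel.
        return torch.ops.llama.fast_hadamard_transform(x)

example = (torch.randn(2, 128),)
exported = torch.export.export(Rotate(), example)
print(exported.graph_module.graph)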
extension/llm/custom_ops/spinquant/test/op_fast_hadamard_transform_test.cpp

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/extension/llm/custom_ops/op_fast_hadamard_transform.h>
#include <executorch/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.h>
#include <executorch/kernels/test/TestUtil.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>

#include <gtest/gtest.h>

#include <cmath>
#include <random>

using exec_aten::Tensor;

namespace {
Tensor& fast_hadamard_transform_nocontext(const Tensor& vec, Tensor& out) {
  exec_aten::RuntimeContext context;
  return torch::executor::native::fast_hadamard_transform_out(
      context, vec, out);
}

void reference_fht_impl(float* buf, int n) {
  dumb_fht(buf, std::log2<int>(n));
  const auto root_n = std::sqrt(n);
  for (int ii = 0; ii < n; ++ii) {
    buf[ii] /= root_n;
  }
}
} // namespace

TEST(FastHadamardTransformTest, EmptyInput) {
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Byte> tfByte;
  auto vec = tfFloat.zeros({0});
  auto out = tfFloat.zeros({0});
  auto result = fast_hadamard_transform_nocontext(vec, out);
  EXPECT_EQ(result.numel(), 0);
}

TEST(FastHadamardTransformTest, SingleElementInput) {
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Byte> tfByte;
  auto vec = tfFloat.ones({1});
  auto out = tfFloat.zeros({1});
  auto result = fast_hadamard_transform_nocontext(vec, out);
  EXPECT_EQ(result.numel(), 1);
  // FHT of a single element is a no-op.
  EXPECT_EQ(result.const_data_ptr<float>()[0], 1);
}

TEST(FastHadamardTransformTest, FourKInput) {
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Byte> tfByte;
  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> dist;
  std::vector<float> data(4096);
  for (int ii = 0; ii < data.size(); ++ii) {
    data[ii] = dist(gen);
  }
  auto vec = tfFloat.make({4096}, data);
  auto out = tfFloat.zeros({4096});
  auto result = fast_hadamard_transform_nocontext(vec, out);

  std::vector<float> reference_result = data;
  reference_fht_impl(reference_result.data(), reference_result.size());

  const float* const result_data = result.const_data_ptr<float>();
  for (int ii = 0; ii < 4096; ++ii) {
    EXPECT_FLOAT_EQ(result_data[ii], reference_result[ii]);
  }
}

TEST(FastHadamardTransformTest, MultipleRows) {
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Byte> tfByte;
  std::random_device rd;
  std::mt19937 gen(rd());
  std::normal_distribution<float> dist;
  std::vector<float> data(8 * 8 * 8);
  for (int ii = 0; ii < data.size(); ++ii) {
    data[ii] = dist(gen);
  }
  auto mat = tfFloat.make({8, 8, 8}, data);
  auto out = tfFloat.zeros({8, 8, 8});

  auto result = fast_hadamard_transform_nocontext(mat, out);

  std::vector<float> reference_result = data;
  for (int ii = 0; ii < 8; ++ii) {
    for (int jj = 0; jj < 8; ++jj) {
      reference_fht_impl(&reference_result[ii * 64 + jj * 8], 8);
    }
  }

  const float* const result_data = result.const_data_ptr<float>();
  for (int ii = 0; ii < data.size(); ++ii) {
    EXPECT_FLOAT_EQ(result_data[ii], reference_result[ii]);
  }
}

extension/llm/custom_ops/spinquant/test/targets.bzl

Lines changed: 10 additions & 0 deletions
@@ -15,3 +15,13 @@ def define_common_targets():
             "//executorch/extension/llm/custom_ops/spinquant/third-party/FFHT:dumb_fht",
         ],
     )
+
+    runtime.cxx_test(
+        name = "op_fast_hadamard_transform_test",
+        srcs = ["op_fast_hadamard_transform_test.cpp"],
+        deps = [
+            "//executorch/extension/llm/custom_ops:custom_ops",
+            "//executorch/extension/llm/custom_ops/spinquant/third-party/FFHT:dumb_fht",
+            "//executorch/kernels/test:test_util",
+        ],
+    )

extension/llm/custom_ops/targets.bzl

Lines changed: 16 additions & 2 deletions
@@ -9,8 +9,16 @@ def define_common_targets():
     for mkl_dep in ["", "_mkl_noomp"]:
         runtime.cxx_library(
             name = "custom_ops" + mkl_dep,
-            srcs = ["op_sdpa.cpp", "op_fallback.cpp"],
-            exported_headers = ["op_sdpa.h", "op_fallback.h"],
+            srcs = [
+                "op_fallback.cpp",
+                "op_fast_hadamard_transform.cpp",
+                "op_sdpa.cpp",
+            ],
+            exported_headers = [
+                "op_fallback.h",
+                "op_fast_hadamard_transform.h",
+                "op_sdpa.h",
+            ],
             exported_deps = [
                 "//executorch/runtime/kernel:kernel_includes",
                 "//executorch/kernels/portable/cpu:scalar_utils",
@@ -20,6 +28,10 @@ def define_common_targets():
                 "//executorch/extension/parallel:thread_parallel",
                 "//executorch/extension/threadpool:threadpool",
             ],
+            deps = [
+                "//executorch/kernels/portable/cpu/util:reduce_util",
+                "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform",
+            ],
             compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"],
             visibility = [
                 "//executorch/...",
@@ -35,7 +47,9 @@ def define_common_targets():
             name = "custom_ops_aot_lib" + mkl_dep,
             srcs = [
                 "op_sdpa_aot.cpp",
+                "op_fast_hadamard_transform_aten.cpp",
             ],
+            compiler_flags = ["-Wno-global-constructors"],
             visibility = [
                 "//executorch/...",
                 "@EXECUTORCH_CLIENTS",

kernels/portable/cpu/util/targets.bzl

Lines changed: 5 additions & 1 deletion
@@ -249,5 +249,9 @@ def define_common_targets():
                 "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
             ],
             exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
-            visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."],
+            visibility = [
+                "//executorch/extension/llm/custom_ops/...",
+                "//executorch/kernels/portable/cpu/...",
+                "//executorch/kernels/quantized/...",
+            ],
         )
