FFHT enhancements to fast hadamard transform kernels

swolchok · swolchok · commit 08bad7593b83 · 2024-09-12T08:12:05.000-07:00
Pull Request resolved: #5290 Use FFHT to speed up Fast Hadamard Transform on CPU. fast_hadamard_test was delayed to here because it was a source for a reference implementation. ghstack-source-id: 242216121 @exported-using-ghexport Differential Revision: [D61029709](https://our.internmc.facebook.com/intern/diff/D61029709/)
diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h
@@ -14,6 +14,8 @@
 #include <cstdint>
 #include <memory>
 
+#include <executorch/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h>
+
 #include "fast_hadamard_transform_special.h"
 
 namespace executorch {
@@ -41,10 +43,22 @@ void normalize_after_fht(T* out, int log2_vec_size) {
   }
 }
 
+// Float-only fast path: runs fht_float over vec in place and then applies
+// normalize_after_fht. Does nothing when log2_vec_size is zero or negative.
+// NOTE(review): any status fht_float may report is not inspected here --
+// confirm against fht.h.
+inline void fast_hadamard_transform_ffht_impl(float* vec, int log2_vec_size) {
+  if (log2_vec_size > 0) {
+    fht_float(vec, log2_vec_size);
+    normalize_after_fht(vec, log2_vec_size);
+  }
+}
+
 template <typename T>
 void fast_hadamard_transform_unnormalized_simple_impl(
     T* vec,
     int log2_vec_size) {
+  // NOTE: If you're here because you're profiling a model and this is
+  // slow, consider updating FFHT to generate efficient assembly for
+  // your data type!
   if (log2_vec_size == 0) {
     return;
   }
@@ -77,7 +91,11 @@ void fast_hadamard_transform_simple_impl(T* vec, int log2_vec_size) {
 // of vec, which must be of length (1 << log2_vec_size).
 template <typename T>
 void fast_hadamard_transform(T* vec, int log2_vec_size) {
-  internal::fast_hadamard_transform_simple_impl(vec, log2_vec_size);
+  if constexpr (std::is_same_v<T, float>) {
+    internal::fast_hadamard_transform_ffht_impl(vec, log2_vec_size);
+  } else {
+    internal::fast_hadamard_transform_simple_impl(vec, log2_vec_size);
+  }
 }
 
 // Compute a quantized fast Walsh-Hadamard transform of vec, which
diff --git a/extension/llm/custom_ops/spinquant/targets.bzl b/extension/llm/custom_ops/spinquant/targets.bzl
@@ -15,5 +15,8 @@ def define_common_targets():
         srcs = [
             "fast_hadamard_transform.cpp",
         ],
+        exported_deps = [
+            "//executorch/extension/llm/custom_ops/spinquant/third-party/FFHT:fht",
+        ],
         visibility = ["@EXECUTORCH_CLIENTS"],
     )
diff --git a/extension/llm/custom_ops/spinquant/test/TARGETS b/extension/llm/custom_ops/spinquant/test/TARGETS
@@ -0,0 +1,5 @@
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test.cpp b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <executorch/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h>
+#include <executorch/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h>
+#include <executorch/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.h>
+
+namespace {
+// Reference transform used as the test oracle: runs dumb_fht over the n
+// floats in buf, then divides every element by sqrt(n). Only the base-2
+// logarithm of n (converted to int) is handed to dumb_fht, so n is
+// presumably always a power of two.
+void reference_fht_impl(float* buf, int n) {
+  const int log_n = static_cast<int>(std::log2(n));
+  dumb_fht(buf, log_n);
+
+  const auto divisor = std::sqrt(n);
+  for (int idx = 0; idx < n; ++idx) {
+    buf[idx] /= divisor;
+  }
+}
+
+// Second implementation of fast_hadamard_transform_28N that the tests
+// compare the header's version against. Benchmarking suggests this one
+// is slower, which is why it lives in the test while the strided
+// implementation is in the header.
+template <typename T>
+void fast_hadamard_transform_28N_with_transpose(T* vec, int log2_vec_size) {
+  constexpr int kRows = 28;
+  const int row_len = (1 << log2_vec_size);
+
+  // Step 1: transform each of the 28 contiguous rows independently.
+  for (int row = 0; row < kRows; ++row) {
+    executorch::fast_hadamard_transform(vec + row * row_len, log2_vec_size);
+  }
+
+  // Step 2: transpose into scratch storage so that the 28 values sharing a
+  // column index become contiguous.
+  std::unique_ptr<T[]> scratch = std::make_unique<T[]>(kRows * row_len);
+  for (int row = 0; row < kRows; ++row) {
+    for (int col = 0; col < row_len; ++col) {
+      scratch[col * kRows + row] = vec[row * row_len + col];
+    }
+  }
+
+  // Step 3: apply hadamard_mult_28 to every contiguous group of 28.
+  for (int col = 0; col < row_len; ++col) {
+    hadamard_mult_28(&scratch[col * kRows]);
+  }
+
+  // Step 4: transpose back into the caller's buffer.
+  for (int col = 0; col < row_len; ++col) {
+    for (int row = 0; row < kRows; ++row) {
+      vec[row * row_len + col] = scratch[col * kRows + row];
+    }
+  }
+}
+
+// Returns howMany floats drawn from a default-constructed
+// std::normal_distribution (mean 0, standard deviation 1).
+// The generator is seeded from std::random_device, so every call produces
+// different data.
+std::vector<float> randomFloats(int howMany) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<float> dist;
+  std::vector<float> data(howMany);
+  // Range-for sidesteps comparing a signed index against data.size().
+  for (float& sample : data) {
+    sample = dist(gen);
+  }
+  return data;
+}
+} // namespace
+
+TEST(FastHadamardTransformTest, SingleElement) {
+  // With log2_vec_size == 0 the vector has exactly one element, and the
+  // transform must leave it unchanged.
+  float value = 42;
+  executorch::fast_hadamard_transform(&value, 0);
+  EXPECT_EQ(value, 42);
+}
+
+TEST(FastHadamardTransformTest, LargerInput) {
+  // Checks fast_hadamard_transform against the reference implementation on
+  // 2^12 random floats. The element count is derived from the log2 size so
+  // the two cannot drift apart.
+  constexpr int kLog2Size = 12;
+  std::vector<float> data = randomFloats(1 << kLog2Size);
+
+  auto expected = data;
+  reference_fht_impl(expected.data(), static_cast<int>(expected.size()));
+
+  auto actual = data;
+  executorch::fast_hadamard_transform(actual.data(), kLog2Size);
+
+  // std::size_t index: avoids a signed/unsigned comparison with size().
+  for (std::size_t ii = 0; ii < expected.size(); ++ii) {
+    EXPECT_FLOAT_EQ(actual[ii], expected[ii]);
+  }
+}
+
+TEST(FastHadamardTransform28NTest, Basic) {
+  // Checks fast_hadamard_transform_28N against the transpose-based
+  // implementation defined in this file, on 28 rows of 2^10 random floats.
+  std::vector<float> data = randomFloats(1024 * 28);
+
+  auto expected = data;
+  fast_hadamard_transform_28N_with_transpose(expected.data(), 10);
+
+  auto actual = data;
+  executorch::fast_hadamard_transform_28N(actual.data(), 10);
+
+  // std::size_t index: avoids a signed/unsigned comparison with size().
+  for (std::size_t ii = 0; ii < actual.size(); ++ii) {
+    EXPECT_FLOAT_EQ(actual[ii], expected[ii]);
+  }
+}
+
+namespace {
+// Symmetric int16 quantization range used by the helpers in this file:
+// [-32767, 32767].
+constexpr int32_t qmin = -(1 << 15) + 1;
+constexpr int32_t qmax = -qmin;
+
+// Quantizes x to int16_t: divides by scale, rounds to the nearest integer
+// (ties away from zero, per std::lround) and clamps to [qmin, qmax].
+int16_t quantize(float x, float scale) {
+  float scaled = x / scale;
+  // XXX: Supposed to round ties to even, but this is just test code.
+  // std::lround must receive the float itself. Spelling the call as
+  // std::lround<int32_t>(scaled) picks the integer overload, which converts
+  // the argument to int32_t (truncating toward zero) before any rounding
+  // can take place.
+  const long rounded = std::lround(scaled);
+  const long lowest = qmin;
+  const long highest = qmax;
+  return static_cast<int16_t>(std::clamp(rounded, lowest, highest));
+}
+
+// Quantizes every element of data with the given scale, preserving order.
+// Each value goes through the scalar quantize(float, float) overload and is
+// then stored as T.
+template <typename T>
+std::vector<T> quantize(const std::vector<float>& data, float scale) {
+  std::vector<T> quantized;
+  quantized.reserve(data.size());
+  std::for_each(data.begin(), data.end(), [&quantized, scale](float value) {
+    quantized.push_back(quantize(value, scale));
+  });
+  return quantized;
+}
+
+// Derives a scale from the spread of data, (max - min) / (qmax - qmin), and
+// quantizes data with it. Returns {quantized values, scale}.
+// NOTE(review): data is dereferenced through minmax_element without an
+// emptiness check, and a constant input yields a scale of zero -- callers
+// presumably always pass varied, non-empty data.
+template <typename T>
+std::pair<std::vector<T>, float> quantize(const std::vector<float>& data) {
+  const auto extremes = std::minmax_element(data.begin(), data.end());
+  const float lowest = *extremes.first;
+  const float highest = *extremes.second;
+  const float scale = (highest - lowest) / (qmax - qmin);
+  return {quantize<T>(data, scale), scale};
+}
+
+// Maps a quantized value back to float by multiplying it with scale.
+template <typename T>
+float dequantize(T x, float scale) {
+  const float restored = x * scale;
+  return restored;
+}
+
+// Dequantizes every element of data with the given scale, preserving order.
+// Rejects T == float at compile time so already-dequantized data cannot be
+// passed by mistake.
+template <typename T>
+std::vector<float> dequantize(const std::vector<T>& data, float scale) {
+  static_assert(!std::is_same_v<T, float>);
+  std::vector<float> restored;
+  restored.reserve(data.size());
+  for (auto it = data.begin(); it != data.end(); ++it) {
+    const T quant = *it;
+    restored.push_back(dequantize(quant, scale));
+  }
+  return restored;
+}
+
+// Expects |a - b| <= atol + rtol * |b| and prints both operands on failure.
+// Every parameter is parenthesized so that compound argument expressions
+// keep their meaning after expansion. a and b are expanded more than once,
+// so pass expressions without side effects.
+#define EXPECT_CLOSE_IMPL(a, b, atol, rtol)                       \
+  EXPECT_LE(std::abs((a) - (b)), (atol) + (rtol) * std::abs(b))   \
+      << "a: " << (a) << ", b: " << (b)
+// EXPECT_CLOSE_IMPL with the tolerances used by the quantized tests.
+#define EXPECT_CLOSE(a, b) EXPECT_CLOSE_IMPL(a, b, 2e-4, 1e-4)
+
+// Checks fast_hadamard_transform_symmetric_quantized_s16 on 2^logN random
+// values. The expected result is built by dequantizing the quantized input,
+// running the float reference transform, and quantizing again with the same
+// scale; actual and expected are then compared after dequantization.
+void testQuantizedFastHadamardTransform(int logN) {
+  std::vector<float> data = randomFloats(1 << logN);
+
+  auto [qdata, scale] = quantize<int16_t>(data);
+
+  auto expected_unquant = dequantize(qdata, scale);
+  reference_fht_impl(
+      expected_unquant.data(), static_cast<int>(expected_unquant.size()));
+  auto expected = quantize<int16_t>(expected_unquant, scale);
+
+  auto actual = qdata;
+  executorch::fast_hadamard_transform_symmetric_quantized_s16(
+      actual.data(), logN);
+
+  // std::size_t index: avoids a signed/unsigned comparison with size().
+  for (std::size_t ii = 0; ii < expected.size(); ++ii) {
+    EXPECT_CLOSE(
+        dequantize(actual[ii], scale), dequantize(expected[ii], scale));
+  }
+}
+
+} // namespace
+
+// Even log2 size: runs the quantized-vs-reference comparison on 2^12 values.
+TEST(QuantizedFastHadamardTransformTest, Basic) {
+  testQuantizedFastHadamardTransform(12); // 4096
+}
+
+// Odd log2 size: runs the quantized-vs-reference comparison on 2^11 values.
+TEST(QuantizedFastHadamardTransformTest, OddLogN) {
+  testQuantizedFastHadamardTransform(11); // 2048
+}
+
+TEST(QuantizedFastHadamardTransform28NTest, Basic) {
+  // Checks fast_hadamard_transform_symmetric_quantized_s16_28N on 28 rows of
+  // 2^10 random values. The expected result comes from dequantizing the
+  // quantized input, running the float transpose-based implementation, and
+  // quantizing again with the same scale.
+  std::vector<float> data = randomFloats(1024 * 28);
+
+  auto [qdata, scale] = quantize<int16_t>(data);
+
+  auto expected_unquant = dequantize(qdata, scale);
+  fast_hadamard_transform_28N_with_transpose(expected_unquant.data(), 10);
+  auto expected = quantize<int16_t>(expected_unquant, scale);
+
+  auto actual = qdata;
+  executorch::fast_hadamard_transform_symmetric_quantized_s16_28N(
+      actual.data(), 10);
+
+  for (std::size_t ii = 0; ii < expected.size(); ++ii) {
+    // The raw quantized values are attached to the failure message instead
+    // of being written (and flushed) to stderr for every single element.
+    EXPECT_CLOSE(
+        dequantize(actual[ii], scale), dequantize(expected[ii], scale))
+        << ", element " << ii << ": actual: " << actual[ii]
+        << ", expected: " << expected[ii];
+  }
+}
diff --git a/extension/llm/custom_ops/spinquant/test/targets.bzl b/extension/llm/custom_ops/spinquant/test/targets.bzl
@@ -0,0 +1,17 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+    # gtest binary for the fast Hadamard transform. It depends on the library
+    # under test plus the plain-C dumb_fht target from the FFHT directory.
+    runtime.cxx_test(
+        name = "fast_hadamard_transform_test",
+        srcs = ["fast_hadamard_transform_test.cpp"],
+        headers = ["fast_hadamard_transform_special_unstrided_cpu.h"],
+        deps = [
+            "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform",
+            "//executorch/extension/llm/custom_ops/spinquant/third-party/FFHT:dumb_fht",
+        ],
+    )
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
@@ -3,7 +3,7 @@ CFLAGS = -O3 -march=native -std=c99 -pedantic -Wall -Wextra -Wshadow -Wpointer-a
 
 all: test_float test_double fast_copy.o fht.o
 
-OBJ := fast_copy.o fht.o
+OBJ := dumb_fht.o fast_copy.o fht.o
 
 %.o: %.c
 	$(CC) $< -o $@ -c $(CFLAGS)
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/TARGETS b/extension/llm/custom_ops/spinquant/third-party/FFHT/TARGETS
@@ -0,0 +1,5 @@
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.c
@@ -0,0 +1,17 @@
+#include "dumb_fht.h"
+
+/* In-place, unnormalized fast Hadamard transform over the (1 << log_n)
+ * floats in buf. Each pass combines elements `half` apart into their sum
+ * and difference, with `half` doubling from 1 up to n / 2. */
+void dumb_fht(float* buf, int log_n) {
+  const int n = 1 << log_n;
+  for (int half = 1; half < n; half <<= 1) {
+    const int span = half << 1;
+    for (int base = 0; base < n; base += span) {
+      float* lo = buf + base;
+      float* hi = lo + half;
+      for (int k = 0; k < half; ++k) {
+        const float sum = lo[k] + hi[k];
+        const float diff = lo[k] - hi[k];
+        lo[k] = sum;
+        hi[k] = diff;
+      }
+    }
+  }
+}
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.h
@@ -0,0 +1,14 @@
+#ifndef DUMB_FHT_H
+#define DUMB_FHT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Simple in-place fast Hadamard transform over the (1 << log_n) floats in
+ * buf. The result is not normalized; callers scale it themselves. */
+void dumb_fht(float* buf, int log_n);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* DUMB_FHT_H */
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/targets.bzl b/extension/llm/custom_ops/spinquant/third-party/FFHT/targets.bzl
@@ -0,0 +1,35 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+    # Plain-C transform built from dumb_fht.c / dumb_fht.h.
+    runtime.cxx_library(
+        name = "dumb_fht",
+        srcs = ["dumb_fht.c"],
+        exported_headers = ["dumb_fht.h"],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+    )
+
+    # FFHT kernels exposed through fht.h; the source file is picked per
+    # target CPU.
+    runtime.cxx_library(
+        name = "fht",
+        srcs = select({
+            # NOTE(review): no source is listed for other CPUs, so this target
+            # presumably ships only the header there -- confirm that nothing
+            # links against it on those platforms.
+            "DEFAULT": [],
+            "ovr_config//cpu:x86_64": ["fht_avx.c"],
+            "ovr_config//cpu:arm64": ["fht_neon.c"],
+            "ovr_config//cpu:arm32": ["fht_neon.c"],
+        }),
+        exported_headers = ["fht.h"],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+    )
+
+    # Binary built from test_float.c, linked against both libraries above.
+    runtime.cxx_binary(
+        name = "test_float",
+        srcs = ["test_float.c"],
+        deps = [
+            ":dumb_fht",
+            ":fht",
+        ],
+    )
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c

Original file line number	Diff line number	Diff line change
`@@ -15,5 +15,8 @@ def define_common_targets():`
`15`	`15`	`srcs = [`
`16`	`16`	`"fast_hadamard_transform.cpp",`
`17`	`17`	`],`
	`18`	`+ exported_deps = [`
	`19`	`+ "//executorch/extension/llm/custom_ops/spinquant/third-party/FFHT:fht",`
	`20`	`+ ],`
`18`	`21`	`visibility = ["@EXECUTORCH_CLIENTS"],`
`19`	`22`	`)`