Commit 5cab322

Implement _fft_r2c core ATen op (#8277)
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]

1 parent 832f855

File tree

12 files changed, +395 -0 lines changed


.gitmodules

Lines changed: 3 additions & 0 deletions

@@ -67,3 +67,6 @@
 [submodule "backends/cadence/utils/FACTO"]
 	path = backends/cadence/utils/FACTO
 	url = https://github.com/pytorch-labs/FACTO.git
+[submodule "third-party/pocketfft"]
+	path = third-party/pocketfft
+	url = https://github.com/mreineck/pocketfft
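
Note (not part of the commit): pocketfft is a header-only library vendored here as a plain git submodule, so existing checkouts presumably need a git submodule update --init third-party/pocketfft (or a full --init --recursive) before the new kernel's #include <pocketfft_hdronly.h> can resolve.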

kernels/aten/functions.yaml

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,8 @@
 
 - op: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out
 
+- op: _fft_r2c.out
+
 - op: _linalg_det.result
 
 - op: _linalg_svd.U
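
Note (not part of the commit): kernels/aten/functions.yaml lists the ops exposed by the ATen-mode kernel library, so this entry only declares that _fft_r2c.out is supported there; no kernel is bound in this file. For reference, the upstream ATen schema is roughly _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor, matching the arguments handled by opt_fft_r2c_out below.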

kernels/optimized/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -60,6 +60,7 @@ message("Generated files ${gen_command_sources}")
 
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
+target_include_directories(optimized_kernels PRIVATE "${EXECUTORCH_ROOT}/third-party/pocketfft")
 target_link_libraries(
   optimized_kernels PRIVATE executorch_core cpublas extension_threadpool
 )
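
Note (not part of the commit): this include path is what lets #include <pocketfft_hdronly.h> in op_fft_r2c.cpp resolve against the new submodule; since pocketfft is header-only, no additional link dependency appears to be needed.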

kernels/optimized/cpu/op_fft_r2c.cpp

Lines changed: 189 additions & 0 deletions (new file)

@@ -0,0 +1,189 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/runtime/core/span.h>
#include <executorch/runtime/kernel/kernel_includes.h>

#include <pocketfft_hdronly.h>

#include <optional>

namespace torch::executor::native {

// TODO: contents of this anonymous namespace are copy/pasted from
// PyTorch core (aten/src/ATen/native/mkl/SpectralOps.cpp). Small
// portions (the parts that don't depend on Tensor) could be reused;
// refactor to enable that once we can share headers from PyTorch
// core.
namespace {
pocketfft::stride_t stride_from_tensor(const Tensor& t) {
  pocketfft::stride_t stride(t.strides().begin(), t.strides().end());
  for (auto& s : stride) {
    s *= t.element_size();
  }
  return stride;
}

pocketfft::shape_t shape_from_tensor(const Tensor& t) {
  return pocketfft::shape_t(t.sizes().begin(), t.sizes().end());
}

// NOTE: The reinterpret_cast in tensor_cdata is UB, but it's what
// PyTorch core does and I'm not aware of a portable way to do this
// that doesn't rely on UB.
template <typename T>
inline std::complex<T>* tensor_cdata(Tensor& t) {
  return reinterpret_cast<std::complex<T>*>(
      t.data_ptr<executorch::runtime::etensor::complex<T>>());
}

template <typename T>
inline const std::complex<T>* tensor_cdata(const Tensor& t) {
  return reinterpret_cast<const std::complex<T>*>(
      t.const_data_ptr<executorch::runtime::etensor::complex<T>>());
}

// NOTE: in particular this is in ATen/native/SpectralOpsUtils.h and
// could be shared immediately.
enum class fft_norm_mode {
  none, // No normalization
  by_root_n, // Divide by sqrt(signal_size)
  by_n, // Divide by signal_size
};

// NOTE: slight fork from upstream PyTorch to use ET_KERNEL_CHECK;
// upstream with TORCH_CHECK will be fine to use once we have code
// sharing.
template <typename T>
std::optional<T>
compute_fct(KernelRuntimeContext& ctx, int64_t size, int64_t normalization) {
  constexpr auto one = static_cast<T>(1);
  switch (static_cast<fft_norm_mode>(normalization)) {
    case fft_norm_mode::none:
      return one;
    case fft_norm_mode::by_n:
      return one / static_cast<T>(size);
    case fft_norm_mode::by_root_n:
      return one / std::sqrt(static_cast<T>(size));
  }
  ET_KERNEL_CHECK_MSG(
      ctx,
      false,
      InvalidArgument,
      std::nullopt,
      "Unsupported normalization type: %" PRId64,
      normalization);
}

template <typename T>
std::optional<T> compute_fct(
    KernelRuntimeContext& ctx,
    const Tensor& t,
    IntArrayRef dim,
    int64_t normalization) {
  if (static_cast<fft_norm_mode>(normalization) == fft_norm_mode::none) {
    return static_cast<T>(1);
  }
  const auto& sizes = t.sizes();
  int64_t n = 1;
  for (auto idx : dim) {
    n *= sizes[idx];
  }
  return compute_fct<T>(ctx, n, normalization);
}

} // namespace

Tensor& opt_fft_r2c_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    IntArrayRef dim,
    int64_t normalization,
    bool onesided,
    Tensor& out) {
  auto in_sizes = in.sizes();
  ET_KERNEL_CHECK(ctx, in.dim() <= kTensorDimensionLimit, InvalidArgument, out);

  std::array<Tensor::SizesType, kTensorDimensionLimit> out_sizes_storage;
  executorch::runtime::Span<Tensor::SizesType> out_sizes(
      out_sizes_storage.data(), in_sizes.size());
  std::copy(in_sizes.begin(), in_sizes.end(), out_sizes.begin());
  ET_KERNEL_CHECK(ctx, !dim.empty(), InvalidArgument, out);

  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

  ET_KERNEL_CHECK_MSG(
      ctx,
      onesided,
      InvalidArgument,
      out,
      "onesided=False is not supported yet in _fft_r2c");

  ET_KERNEL_CHECK_MSG(
      ctx,
      out.scalar_type() == executorch::runtime::toComplexType(in.scalar_type()),
      InvalidArgument,
      out,
      "the output type for _fft_r2c must be the Complex type corresponding to the input type");

  for (auto d : dim) {
    ET_KERNEL_CHECK_MSG(
        ctx,
        d >= 0 && d < in.dim(),
        InvalidArgument,
        out,
        "dims must be in bounds (got %" PRId64 ")",
        d);
  }

  if (onesided) {
    out_sizes[dim.back()] = out_sizes[dim.back()] / 2 + 1;
  }
  ET_KERNEL_CHECK_MSG(
      ctx,
      resize_tensor(
          out,
          executorch::runtime::ArrayRef<Tensor::SizesType>(
              out_sizes.data(), out_sizes.size())) == Error::Ok,
      InvalidArgument,
      out,
      "Failed to resize output tensor (last dim %d).",
      out_sizes[dim.back()]);

  pocketfft::shape_t axes(dim.begin(), dim.end());
  auto in_shape = shape_from_tensor(in);
  // TODO: if arbitrary strides are a possibility, we need to validate
  // these, because pocketfft README says "Strides that lead to
  // multiple accesses of the same memory address are not allowed."
  auto in_stride = stride_from_tensor(in);
  auto out_stride = stride_from_tensor(out);
  // NOTE: as of this writing, upstream PyTorch only supports
  // float/double, so we follow suit.
  ET_SWITCH_FLOAT_TYPES(in.scalar_type(), ctx, "_fft_r2c.out", CTYPE_IN, [&] {
    auto fct = compute_fct<CTYPE_IN>(ctx, in, dim, normalization);
    if (!fct) {
      // Check failed, just bail out of the lambda.
      return;
    }
    pocketfft::r2c<CTYPE_IN>(
        in_shape,
        in_stride,
        out_stride,
        axes,
        true,
        in.const_data_ptr<CTYPE_IN>(),
        tensor_cdata<CTYPE_IN>(out),
        *fct);

    // TODO: fill with conjugate symmetry if not onesided; see
    // ATen/native/mkl/SpectralOps.cpp
  });
  return out;
}
} // namespace torch::executor::native
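
Note (not part of the commit): to make the pocketfft conventions relied on above concrete, here is a minimal standalone C++ sketch, independent of the ExecuTorch tensor types and assuming only the vendored pocketfft_hdronly.h. Shapes are element counts, strides are given in bytes (hence the element_size() multiplication in stride_from_tensor), and a onesided real-to-complex transform of length n yields n/2 + 1 complex bins, which is exactly the out_sizes[dim.back()] / 2 + 1 resize in the kernel. The fct argument plays the role of compute_fct's result; 1.0f corresponds to fft_norm_mode::none.

// Hypothetical illustration, not part of this commit.
#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

#include <pocketfft_hdronly.h>

int main() {
  const std::size_t n = 8;
  std::vector<float> in(n);
  for (std::size_t i = 0; i < n; ++i) {
    in[i] = static_cast<float>(i); // arbitrary real input signal
  }

  // Onesided output length along the transformed axis: n / 2 + 1.
  std::vector<std::complex<float>> out(n / 2 + 1);

  pocketfft::shape_t shape{n};                  // logical sizes, in elements
  pocketfft::stride_t in_stride{sizeof(float)}; // strides, in bytes
  pocketfft::stride_t out_stride{sizeof(std::complex<float>)};
  pocketfft::shape_t axes{0};                   // transform along dim 0

  // fct = 1.0f corresponds to fft_norm_mode::none above.
  pocketfft::r2c<float>(
      shape, in_stride, out_stride, axes, /*forward=*/true,
      in.data(), out.data(), /*fct=*/1.0f);

  for (const auto& c : out) {
    std::printf("%f %+fi\n", c.real(), c.imag());
  }
  return 0;
}

The kernel above issues the same call, just with the shape, byte strides, and axes derived from the ExecuTorch tensors via shape_from_tensor, stride_from_tensor, and the dim argument.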

kernels/optimized/cpu/targets.bzl

Lines changed: 4 additions & 0 deletions

@@ -25,6 +25,10 @@ _OPTIMIZED_ATEN_OPS = (
         ],
     ),
     op_target(name = "op_exp"),
+    op_target(
+        name = "op_fft_r2c",
+        deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"],
+    ),
     op_target(name = "op_sigmoid"),
     op_target(
         name = "op_gelu",

kernels/optimized/optimized-oss.yaml

Lines changed: 5 additions & 0 deletions

@@ -5,6 +5,11 @@
 # log_softmax, due to the OSS build not currently including sleef.
 # TODO (T183193812)
 
+- op: _fft_r2c.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_fft_r2c_out
+
 - op: add.out
   kernels:
     - arg_meta: null
kernels/optimized/optimized.yaml

Lines changed: 5 additions & 0 deletions

@@ -2,6 +2,11 @@
 #
 # This yaml file contains operators that have optimized kernels available.
 
+- op: _fft_r2c.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_fft_r2c_out
+
 - op: _log_softmax.out
   kernels:
     - arg_meta: null
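
Note (not part of the commit): these two registrations (optimized.yaml and optimized-oss.yaml) bind the _fft_r2c.out schema to torch::executor::opt_fft_r2c_out, the kernel defined in op_fft_r2c.cpp above; the OSS variant keeps its own list because, per its header comment, the OSS build omits some ops (e.g. log_softmax, pending sleef support).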

kernels/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -265,6 +265,7 @@ set(_optimized_kernels_test_sources
   "op_bmm_test.cpp"
   "op_div_test.cpp"
   "op_exp_test.cpp"
+  "op_fft_r2c_test.cpp"
   "op_gelu_test.cpp"
   "op_le_test.cpp"
   "op_log_softmax_test.cpp"
