 #pragma once
 
 #include <ATen/cpu/vec/vec.h>
+#include <ATen/cpu/vec/vec_n.h>
 #include <executorch/kernels/optimized/blas/CPUBlas.h>
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
@@ -362,22 +363,37 @@ using Tensor = ::executorch::aten::Tensor;
 template <typename T1, typename T2>
 inline void
 _exp_reduce_sum_fusion_kernel(T1* a, const int& size, T2* out, T1& val) {
-  auto vec_size = vec::Vectorized<T1>::size();
-  auto vec_max = vec::Vectorized<T1>(val);
+  // NOTE: we observed numerics issues with this function when
+  // deleting the old executorch::vec and replacing with at::vec
+  // here. The major known difference is that executorch::vec was 256
+  // bits wide vs 128 bits for at::vec (and the hardware). Preserving
+  // this function's execution width at 256 bits and avoiding
+  // vec_reduce_all below removed the issues.
+  constexpr auto vec_size = vec::Vectorized<T1>::size() * 2;
+  auto vec_max = vec::VectorizedN<T1, 2>(val);
   T1 tmp_sum = 0;
-  auto vec_tmp_sum = vec::Vectorized<T1>(tmp_sum);
+  auto vec_tmp_sum = vec::VectorizedN<T1, 2>(tmp_sum);
   for (int i = 0; i < vec_size * (size / vec_size); i += vec_size) {
-    auto tmp0 = vec::Vectorized<T1>::loadu(a + i);
+    auto tmp0 = vec::VectorizedN<T1, 2>::loadu(a + i);
     auto tmp1 = tmp0 - vec_max;
     // Replace with exp_u20 later
     // auto tmp2 = tmp1.exp_u20();
     auto tmp2 = tmp1.exp();
-    vec_tmp_sum += tmp2;
-    _store(out + i, tmp2);
+    vec_tmp_sum = vec_tmp_sum + tmp2;
+    tmp2.store(out + i);
   }
-  tmp_sum = vec::vec_reduce_all<T1>(
-      [](vec::Vectorized<T1>& x, vec::Vectorized<T1>& y) { return x + y; },
-      vec_tmp_sum);
+
+  __at_align__ T1 vec_tmp_sum_array[vec_size];
+  vec_tmp_sum.store(vec_tmp_sum_array);
+  for (const auto i : c10::irange(vec_size)) {
+    tmp_sum += vec_tmp_sum_array[i];
+  }
+  // See NOTE above; we should replace the scalar reduction above with
+  // this reduction (which uses vaddvq_f32 internally), but it changes
+  // numerics.
+  // tmp_sum = vec::vec_reduce_all<T1>(
+  //     [](vec::Vectorized<T1>& x, vec::Vectorized<T1>& y) { return x + y; },
+  //     vec_tmp_sum);
   for (int i = vec_size * (size / vec_size); i < size; i++) {
     auto tmp0 = a[i];
     auto tmp1 = tmp0 - val;