also enable for non-aarch64 on "Mostly sync BlasKernel.cpp with ATen ReducedPrecisionGemvFastPathKernel"

swolchok · swolchok · commit 27af7f668373 · 2025-05-15T14:16:17.000-07:00
The two files were similar but had diverged due to recent changes. Because we share PyTorch headers, we can keep them mostly the same; the remaining differences are namespace details, lintrunner-related changes, and a couple of EXECUTORCH NOTEs. Differential Revision: [D74702689](https://our.internmc.facebook.com/intern/diff/D74702689/) [ghstack-poisoned]
diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h
@@ -158,7 +158,6 @@ void gemm_transa_(
   }
 }
 
-#ifdef __aarch64__
 namespace internal {
 float bf16_dot_with_fp32_arith(const torch::executor::BFloat16* vec1, const torch::executor::BFloat16* vec2, int64_t len);
 } // namespace internal
@@ -204,7 +203,6 @@ inline void gemm_transa_<torch::executor::BFloat16, torch::executor::BFloat16>(
     }
   });
 }
-#endif
 
 // clang-format on
 

Original file line number	Diff line number	Diff line change
`@@ -158,7 +158,6 @@ void gemm_transa_(`
`158`	`158`	`}`
`159`	`159`	`}`
`160`	`160`
`161`		`-#ifdef __aarch64__`
`162`	`161`	`namespace internal {`
`163`	`162`	`float bf16_dot_with_fp32_arith(const torch::executor::BFloat16* vec1, const torch::executor::BFloat16* vec2, int64_t len);`
`164`	`163`	`} // namespace internal`
`@@ -204,7 +203,6 @@ inline void gemm_transa_<torch::executor::BFloat16, torch::executor::BFloat16>(`
`204`	`203`	`}`
`205`	`204`	`});`
`206`	`205`	`}`
`207`		`-#endif`
`208`	`206`
`209`	`207`	`// clang-format on`
`210`	`208`