Update on "Mostly sync BlasKernel.cpp with ATen ReducedPrecisionGemvFastPathKernel"

swolchok · swolchok · commit a3c268a7b9e0 · 2025-05-14T16:16:26.000-07:00
The two files were similar but had diverged due to recent changes. Now that we share PyTorch headers, we can keep them mostly the same; the remaining differences are namespace details and a couple of EXECUTORCH NOTEs. Differential Revision: [D74702689](https://our.internmc.facebook.com/intern/diff/D74702689/) [ghstack-poisoned]
diff --git a/kernels/optimized/blas/BlasKernel.cpp b/kernels/optimized/blas/BlasKernel.cpp
@@ -95,10 +95,13 @@ float reduce(vec::VectorizedN<float, kF32RegistersPerIteration>& x) {
   return reduce(x[0]);
 }
 
+// EXECUTORCH NOTE: removed the __ARM_FEATURE_BF16_VECTOR_ARITHMETIC gate
+// added in https://github.com/pytorch/pytorch/pull/152766, which I
+// raised concerns about on that PR.
+
 // We would have to write a separate SVE-specific path to use SVE
 // BFDOT. Deferring that for now to get the NEON/ASIMD BFDOT path
 // working.
-#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
 #if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
 // https://godbolt.org/z/z8P4Yncra
 #define COMPILER_SUPPORTS_BF16_TARGET 1
@@ -109,9 +112,6 @@ float reduce(vec::VectorizedN<float, kF32RegistersPerIteration>& x) {
 #else // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
 #define COMPILER_SUPPORTS_BF16_TARGET 0
 #endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
-#else // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
-#define COMPILER_SUPPORTS_BF16_TARGET 0
-#endif // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
 
 #if COMPILER_SUPPORTS_BF16_TARGET
 #define TARGET_ARM_BF16_ATTRIBUTE __attribute__((target("arch=armv8.2-a+bf16")))
diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl
@@ -2,10 +2,6 @@ load("@fbsource//tools/build_defs:default_platform_defs.bzl", "DEVSERVER_PLATFOR
 load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
 load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
-load(
-    "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
-    "get_compiler_optimization_flags",
-)
 
 # Because vec exists as a collection of header files, compile and preprocessor
 # flags applied to the vec target do not have any effect, since no compilation
@@ -200,7 +196,12 @@ def define_libs(is_fbcode=False):
             exported_headers = native.glob([
                 "blas/**/*.h",
             ]),
-            compiler_flags = ["-Wno-pass-failed"] + get_compiler_optimization_flags(),
+            compiler_flags = ["-Wno-pass-failed"] + select({
+                "ovr_config//runtime:fbcode": [],
+                # TODO: replace with get_compiler_optimization_flags from op_registration_util.bzl when that
+                # is re-enabled.
+                "DEFAULT": ["-Os"],
+            }),
             header_namespace = "executorch/kernels/optimized",
             visibility = [
                 "//executorch/...",