[ExecuTorch] Make ForcedUnroll usage in bf16 BlasKernel actually work for -Oz builds

swolchok · swolchok · commit d90b31f7612c · 2024-09-11T13:06:34.000-07:00
Pull Request resolved: #5247

Clang is very resistant to inlining under -Oz. For ForcedUnroll to actually unroll, we need to force-inline the lambda.

ghstack-source-id: 242067820
@exported-using-ghexport

Differential Revision: [D62154247](https://our.internmc.facebook.com/intern/diff/D62154247/)
diff --git a/kernels/optimized/blas/BlasKernel.cpp b/kernels/optimized/blas/BlasKernel.cpp
@@ -54,7 +54,7 @@ static_assert(kF32RegistersPerIteration == 1 << kF32RegistersPerIterationShift);
 static inline double reduce(float32x4_t x[kF32RegistersPerIteration]) {
   int offset = kF32RegistersPerIteration;
   utils::ForcedUnroll<kF32RegistersPerIterationShift>{}(
-      [&offset, &x](auto idx) {
+      [&offset, &x](auto idx) ET_INLINE_ATTRIBUTE {
         offset /= 2;
         for (int i = 0; i < offset; ++i) {
           x[i] = vaddq_f32(x[i], x[offset + i]);
@@ -115,7 +115,7 @@ float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) {
     const auto* vec1_ = vec1 + j;
     const auto* vec2_ = vec2 + j;
     utils::ForcedUnroll<kF32RegisterPairsPerIteration>{}(
-        [vec1_, vec2_, &sum](auto k) {
+        [vec1_, vec2_, &sum](auto k) ET_INLINE_ATTRIBUTE {
           dot_with_fp32_arith_main_inner_loop(vec1_, vec2_, sum, k);
         });
   }
diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h
@@ -57,6 +57,7 @@
 #define ET_NORETURN [[noreturn]]
 #define ET_NOINLINE __attribute__((noinline))
 #define ET_INLINE __attribute__((always_inline)) inline
+#define ET_INLINE_ATTRIBUTE __attribute__((always_inline))
 
 #if defined(__GNUC__)