Skip to content

Commit d90b31f

Browse files
committed
[ExecuTorch] Make ForcedUnroll usage in bf16 BlasKernel actually work for -Oz builds
Pull Request resolved: #5247. Clang is very resistant to inlining under -Oz, so the lambda passed to ForcedUnroll was not being inlined and the loop was not actually unrolled. For ForcedUnroll to actually unroll, we need to force-inline the lambda. ghstack-source-id: 242067820 @exported-using-ghexport Differential Revision: [D62154247](https://our.internmc.facebook.com/intern/diff/D62154247/)
1 parent 23dc48d commit d90b31f

File tree

2 files changed

+3
-2
lines changed

2 files changed

+3
-2
lines changed

kernels/optimized/blas/BlasKernel.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static_assert(kF32RegistersPerIteration == 1 << kF32RegistersPerIterationShift);
5454
static inline double reduce(float32x4_t x[kF32RegistersPerIteration]) {
5555
int offset = kF32RegistersPerIteration;
5656
utils::ForcedUnroll<kF32RegistersPerIterationShift>{}(
57-
[&offset, &x](auto idx) {
57+
[&offset, &x](auto idx) ET_INLINE_ATTRIBUTE {
5858
offset /= 2;
5959
for (int i = 0; i < offset; ++i) {
6060
x[i] = vaddq_f32(x[i], x[offset + i]);
@@ -115,7 +115,7 @@ float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) {
115115
const auto* vec1_ = vec1 + j;
116116
const auto* vec2_ = vec2 + j;
117117
utils::ForcedUnroll<kF32RegisterPairsPerIteration>{}(
118-
[vec1_, vec2_, &sum](auto k) {
118+
[vec1_, vec2_, &sum](auto k) ET_INLINE_ATTRIBUTE {
119119
dot_with_fp32_arith_main_inner_loop(vec1_, vec2_, sum, k);
120120
});
121121
}

runtime/platform/compiler.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#define ET_NORETURN [[noreturn]]
5858
#define ET_NOINLINE __attribute__((noinline))
5959
#define ET_INLINE __attribute__((always_inline)) inline
60+
#define ET_INLINE_ATTRIBUTE __attribute__((always_inline))
6061

6162
#if defined(__GNUC__)
6263

0 commit comments

Comments
 (0)