Skip to content

Commit a3c268a

Browse files
committed
Update on "Mostly sync BlasKernel.cpp with ATen ReducedPrecisionGemvFastPathKernel"
The two files were similar but had diverged due to recent changes. Since we now share PyTorch headers, we can keep them mostly the same; the remaining differences are namespace-related details and a couple of EXECUTORCH NOTEs. Differential Revision: [D74702689](https://our.internmc.facebook.com/intern/diff/D74702689/) [ghstack-poisoned]
1 parent 120c4ba commit a3c268a

File tree

2 files changed

+10
-9
lines changed

2 files changed

+10
-9
lines changed

kernels/optimized/blas/BlasKernel.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,13 @@ float reduce(vec::VectorizedN<float, kF32RegistersPerIteration>& x) {
9595
return reduce(x[0]);
9696
}
9797

98+
// EXECUTORCH NOTE: removed __ARM_FEATURE_BF16_VECTOR_ARITHMETIC gate
99+
// added in https://github.com/pytorch/pytorch/pull/152766, which I
100+
// complained about.
101+
98102
// We would have to write a separate SVE-specific path to use SVE
99103
// BFDOT. Deferring that for now to get the NEON/ASIMD BFDOT path
100104
// working.
101-
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
102105
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
103106
// https://godbolt.org/z/z8P4Yncra
104107
#define COMPILER_SUPPORTS_BF16_TARGET 1
@@ -109,9 +112,6 @@ float reduce(vec::VectorizedN<float, kF32RegistersPerIteration>& x) {
109112
#else // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
110113
#define COMPILER_SUPPORTS_BF16_TARGET 0
111114
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
112-
#else // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
113-
#define COMPILER_SUPPORTS_BF16_TARGET 0
114-
#endif // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
115115

116116
#if COMPILER_SUPPORTS_BF16_TARGET
117117
#define TARGET_ARM_BF16_ATTRIBUTE __attribute__((target("arch=armv8.2-a+bf16")))

kernels/optimized/lib_defs.bzl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@ load("@fbsource//tools/build_defs:default_platform_defs.bzl", "DEVSERVER_PLATFOR
22
load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
33
load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
44
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
5-
load(
6-
"@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
7-
"get_compiler_optimization_flags",
8-
)
95

106
# Because vec exists as a collection of header files, compile and preprocessor
117
# flags applied to the vec target do not have any effect, since no compilation
@@ -200,7 +196,12 @@ def define_libs(is_fbcode=False):
200196
exported_headers = native.glob([
201197
"blas/**/*.h",
202198
]),
203-
compiler_flags = ["-Wno-pass-failed"] + get_compiler_optimization_flags(),
199+
compiler_flags = ["-Wno-pass-failed"] + select({
200+
"ovr_config//runtime:fbcode": [],
201+
# TODO: replace with get_compiler_optimization_flags from op_registration_util.bzl when that
202+
# is re-enabled.
203+
"DEFAULT": ["-Os"],
204+
}),
204205
header_namespace = "executorch/kernels/optimized",
205206
visibility = [
206207
"//executorch/...",

0 commit comments

Comments
 (0)