-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[PowerPC] Fix use of FPSCR builtins in smmintrin.h #67299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang Changessmmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones. This fixes issue #64664. Migrated from https://reviews.llvm.org/D158066 Full diff: https://github.com/llvm/llvm-project/pull/67299.diff 11 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index 18a1186053481ed..a35488ed3dfa565 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -151,9 +151,11 @@ TARGET_BUILTIN(__builtin_ppc_extract_exp, "Uid", "", "power9-vector")
TARGET_BUILTIN(__builtin_ppc_extract_sig, "ULLid", "", "power9-vector")
BUILTIN(__builtin_ppc_mtfsb0, "vUIi", "")
BUILTIN(__builtin_ppc_mtfsb1, "vUIi", "")
+BUILTIN(__builtin_ppc_mffs, "d", "")
TARGET_BUILTIN(__builtin_ppc_mffsl, "d", "", "isa-v30-instructions")
BUILTIN(__builtin_ppc_mtfsf, "vUIiUi", "")
BUILTIN(__builtin_ppc_mtfsfi, "vUIiUIi", "")
+BUILTIN(__builtin_ppc_set_fpscr_rn, "di", "")
TARGET_BUILTIN(__builtin_ppc_insert_exp, "ddULLi", "", "power9-vector")
BUILTIN(__builtin_ppc_fmsub, "dddd", "")
BUILTIN(__builtin_ppc_fmsubs, "ffff", "")
diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index e0abc069032e1ce..34fe9df60cea473 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -264,6 +264,10 @@ static void defineXLCompatMacros(MacroBuilder &Builder) {
Builder.defineMacro("__builtin_minfe", "__builtin_ppc_minfe");
Builder.defineMacro("__builtin_minfl", "__builtin_ppc_minfl");
Builder.defineMacro("__builtin_minfs", "__builtin_ppc_minfs");
+ Builder.defineMacro("__builtin_mffs", "__builtin_ppc_mffs");
+ Builder.defineMacro("__builtin_mffsl", "__builtin_ppc_mffsl");
+ Builder.defineMacro("__builtin_mtfsf", "__builtin_ppc_mtfsf");
+ Builder.defineMacro("__builtin_set_fpscr_rn", "__builtin_ppc_set_fpscr_rn");
}
/// PPCTargetInfo::getTargetDefines - Return a set of the PowerPC-specific
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c175f274319b8c4..f64cb995fa1898a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17237,6 +17237,11 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
Value *Op1 = EmitScalarExpr(E->getArg(1));
return Builder.CreateFDiv(Op0, Op1, "swdiv");
}
+ case PPC::BI__builtin_ppc_set_fpscr_rn:
+ return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_setrnd),
+ {EmitScalarExpr(E->getArg(0))});
+ case PPC::BI__builtin_ppc_mffs:
+ return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_readflm));
}
}
diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h
index 349b395c4f00b92..19cdecb18d2b83f 100644
--- a/clang/lib/Headers/ppc_wrappers/smmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h
@@ -14,7 +14,7 @@
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
- makes explicit use of Intel intrinsics to powerp64/powerpc64le.
+ makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
@@ -68,10 +68,10 @@ extern __inline __m128d
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
- __fpscr_save.__fr = __builtin_mffs();
+ __fpscr_save.__fr = __builtin_ppc_mffs();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
- __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+ __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
@@ -83,10 +83,15 @@ extern __inline __m128d
switch (__rounding) {
case _MM_FROUND_TO_NEAREST_INT:
- __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+ __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+ __fpscr_save.__fr = __builtin_ppc_mffs();
+ __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
__attribute__((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
- __builtin_set_fpscr_rn(0b00);
+ __builtin_ppc_set_fpscr_rn(0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
@@ -102,7 +107,7 @@ extern __inline __m128d
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
- __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
+ __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
@@ -128,9 +133,14 @@ extern __inline __m128d
*/
__asm__("" : : "wa"(__r));
/* Restore enabled exceptions. */
- __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+ __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+ __fpscr_save.__fr = __builtin_ppc_mffs();
+ __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
__fpscr_save.__fpscr |= __enables_save.__fpscr;
- __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+ __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
}
return (__m128d)__r;
}
@@ -159,10 +169,10 @@ extern __inline __m128
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
- __fpscr_save.__fr = __builtin_mffs();
+ __fpscr_save.__fr = __builtin_ppc_mffs();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
- __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+ __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
@@ -174,10 +184,15 @@ extern __inline __m128
switch (__rounding) {
case _MM_FROUND_TO_NEAREST_INT:
- __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+ __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+ __fpscr_save.__fr = __builtin_ppc_mffs();
+ __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
__attribute__((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
- __builtin_set_fpscr_rn(0b00);
+ __builtin_ppc_set_fpscr_rn(0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
@@ -193,7 +208,7 @@ extern __inline __m128
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
- __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
+ __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
@@ -219,9 +234,14 @@ extern __inline __m128
*/
__asm__("" : : "wa"(__r));
/* Restore enabled exceptions. */
- __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+ __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+ __fpscr_save.__fr = __builtin_ppc_mffs();
+ __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
__fpscr_save.__fpscr |= __enables_save.__fpscr;
- __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+ __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
}
return (__m128)__r;
}
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc.c b/clang/test/CodeGen/PowerPC/builtins-ppc.c
index ccc91b6560845e2..c13edf44cdcbd2a 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc.c
@@ -1,5 +1,8 @@
// REQUIRES: powerpc-registered-target
-// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - \
+// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - \
+// RUN: -target-cpu pwr9 | FileCheck %s --check-prefixes=P9,CHECK
void test_eh_return_data_regno()
{
@@ -26,6 +29,9 @@ void test_builtin_ppc_setrnd() {
// CHECK: call double @llvm.ppc.setrnd(i32 %2)
res = __builtin_setrnd(x);
+
+ // CHECK: call double @llvm.ppc.setrnd(i32 %4)
+ res = __builtin_ppc_set_fpscr_rn(x);
}
void test_builtin_ppc_flm() {
@@ -33,7 +39,10 @@ void test_builtin_ppc_flm() {
// CHECK: call double @llvm.ppc.readflm()
res = __builtin_readflm();
- // CHECK: call double @llvm.ppc.setflm(double %1)
+ // CHECK: call double @llvm.ppc.readflm()
+ res = __builtin_ppc_mffs();
+
+ // CHECK: call double @llvm.ppc.setflm(double %2)
res = __builtin_setflm(res);
#ifdef _ARCH_PWR9
diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
index e2d26e611ac81c4..15d291496c20a4a 100644
--- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
@@ -8,6 +8,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
diff --git a/clang/test/CodeGen/PowerPC/ppc-mmintrin.c b/clang/test/CodeGen/PowerPC/ppc-mmintrin.c
index 4cb5b8540092f9b..1dc6292ae3244c3 100644
--- a/clang/test/CodeGen/PowerPC/ppc-mmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-mmintrin.c
@@ -9,6 +9,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n| FileCheck %s --check-prefixes=CHECK-P9,CHECK,CHECK-LE
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P8,CHECK,CHECK-BE
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
diff --git a/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c
index 39194427978ad42..6e152c549498d23 100644
--- a/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c
@@ -13,6 +13,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
#include <pmmintrin.h>
__m128d resd, md1, md2;
diff --git a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
index 220b65c1ce16495..7daef71a61c329e 100644
--- a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
@@ -15,6 +15,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
@@ -239,44 +244,48 @@ test_round() {
// CHECK-LABEL: @test_round
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
-// CHECK: call signext i32 @__builtin_mffs()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0)
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call double @llvm.ppc.setrnd(i32 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @vec_floor(float vector[4])
// CHECK: call <4 x float> @vec_ceil(float vector[4])
// CHECK: call <4 x float> @vec_trunc(float vector[4])
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
-// CHECK: call signext i32 @__builtin_mffs()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0)
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call double @llvm.ppc.setrnd(i32 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @vec_floor(double vector[2])
// CHECK: call <2 x double> @vec_ceil(double vector[2])
// CHECK: call <2 x double> @vec_trunc(double vector[2])
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
diff --git a/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
index 60633e34b56b9a1..40d3839dcf026f7 100644
--- a/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
@@ -13,6 +13,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
#include <tmmintrin.h>
__m64 res, m1, m2;
diff --git a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
index 238ce7c7ee574ef..ac90a5f8c530ba4 100644
--- a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
@@ -12,6 +12,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
#include <x86gprintrin.h>
unsigned short us;
|
ACK 5abcf8964394eb5124d606b00a94781147fd8f8d. I can confirm that this fixes SSE4.1 support in ncnn for POWER9 LE (and gives a nice 3x speedup when benchmarking ncnn, which indicates that the additional intrinsics are actually getting used). Testing was done by applying the patch to the Clang package from Fedora 38's LLVM snapshot Copr. I didn't attempt to test POWER8. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this makes sense to me overall.
Thank you for adding the tests for C++ as well!
I only had a couple of comments about this patch.
@@ -68,10 +68,10 @@ extern __inline __m128d | |||
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); | |||
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; | |||
#else | |||
__fpscr_save.__fr = __builtin_mffs(); | |||
__fpscr_save.__fr = __builtin_ppc_mffs(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't believe that this builtin needs to be renamed. On clang both __builtin_mffs
and __builtin_ppc_mffs
work.
Also, this is the same for the other 3 builtins.
When you update these names you will probably also have to update the ppc-smmintrin.c
test as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
__builtin_mffs
aliases to __builtin_ppc_mffs
through macro. But the compat macros do not always work. In the test cases using -ffreestanding
or targeting non-AIX non-Linux OSes, the macros will not be defined.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, that's fair.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we remove __fpscr_save.__fpscr &= 0x70007f0ffL;
? I suspect it may break some assumption of following code
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure that there are any assumptions broken here. For example, __builtin_ppc_set_fpscr_rn
only uses the last two bits for the rounding control and masks off the rest anyway. Also, __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
is using the mask 0b00000011
so it only uses the last 8 bits.
__fpscr_save.__fr = __builtin_ppc_mffs(); | ||
__fpscr_save.__fpscr &= 0x70007f0ffL; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On GCC the builtin __builtin_mffsl
also expands on pre-Power9 architectures and uses an expansion similar to this behind the scenes where mffs
is used instead. I feel like we should do the same thing in order to behave in the same way as they do and in order to avoid having the user write this kind of code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Implemented by 2c7688c
smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones. This fixes issue llvm#64664.
@@ -11595,6 +11595,50 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { | |||
llvm_unreachable("ERROR:Should return for all cases within swtich."); | |||
} | |||
|
|||
// Lower mffsl intrinsic with mffs in targets without ISA 3.0 | |||
static SDValue lowerMFFSL(SDValue Op, SelectionDAG &DAG, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we actually need this. The reason mffsl
exists is because it is a lightweight version of mffs
. In order to make it lightweight, the instruction only extracts some bits from the FPSCR.
So in order to match the semantics, we end up doing the heavy weight instruction, materializing a 64-bit constant, moving to a GPR, masking out the bits and then moving it back to an FPR. So a user's attempt to use the lightweight version ends up costing them more than the heavy weight version on older CPU's.
Can we not just reject it on older CPU's and force the user to use the heavy weight instruction?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reason mffsl exists is because it is a lightweight version of mffs.
Thanks, according to the 'lightweight' meaning, this sounds reasonable. I don't have strong preference to align with GCC behavior. We have builtins only for P9 which can't be or haven't been emulated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I personally prefer this solution. @stefanp-ibm What do you think?
LGTM
I understand your point about forcing users to just use
and just use:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With respect to the comment from Nemanja I'm okay with this not being 100% the same as GCC. I'm sure we will get questions in the future but really we should discourage the use of __builtin_mffsl
on pre-Power 9.
@@ -68,10 +68,10 @@ extern __inline __m128d | |||
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); | |||
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; | |||
#else | |||
__fpscr_save.__fr = __builtin_mffs(); | |||
__fpscr_save.__fr = __builtin_ppc_mffs(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, that's fair.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm going to approve this patch.
I don't think that removing the mask __fpscr_save.__fpscr &= 0x70007f0ffL;
in smmintrin.h
with have issues but I don't feel strongly enough about it to block this patch. If you are not convinced that it is safe you can leave the masks in.
LGTM.
@@ -68,10 +68,10 @@ extern __inline __m128d | |||
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); | |||
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; | |||
#else | |||
__fpscr_save.__fr = __builtin_mffs(); | |||
__fpscr_save.__fr = __builtin_ppc_mffs(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure that there are any assumptions broken here. For example, __builtin_ppc_set_fpscr_rn
only uses the last two bits for the rounding control and masks off the rest anyway. Also, __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
is using the mask 0b00000011
so it only uses the last 8 bits.
smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones.
smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones.
Fixes #64664
Migrated from https://reviews.llvm.org/D158066