[PowerPC] Fix use of FPSCR builtins in smmintrin.h #67299

ecnelises · 2023-09-25T09:07:35Z

smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones.

Fixes #64664

Migrated from https://reviews.llvm.org/D158066

llvmbot · 2023-09-25T09:08:42Z

@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-clang-codegen

@llvm/pr-subscribers-clang

Changes

smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones.

This fixes issue #64664.

Migrated from https://reviews.llvm.org/D158066

Full diff: https://github.com/llvm/llvm-project/pull/67299.diff

11 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsPPC.def (+2)
(modified) clang/lib/Basic/Targets/PPC.cpp (+4)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+5)
(modified) clang/lib/Headers/ppc_wrappers/smmintrin.h (+35-15)
(modified) clang/test/CodeGen/PowerPC/builtins-ppc.c (+11-2)
(modified) clang/test/CodeGen/PowerPC/ppc-emmintrin.c (+5)
(modified) clang/test/CodeGen/PowerPC/ppc-mmintrin.c (+5)
(modified) clang/test/CodeGen/PowerPC/ppc-pmmintrin.c (+3)
(modified) clang/test/CodeGen/PowerPC/ppc-smmintrin.c (+23-14)
(modified) clang/test/CodeGen/PowerPC/ppc-tmmintrin.c (+3)
(modified) clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c (+3)

diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index 18a1186053481ed..a35488ed3dfa565 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -151,9 +151,11 @@ TARGET_BUILTIN(__builtin_ppc_extract_exp, "Uid", "", "power9-vector")
 TARGET_BUILTIN(__builtin_ppc_extract_sig, "ULLid", "", "power9-vector")
 BUILTIN(__builtin_ppc_mtfsb0, "vUIi", "")
 BUILTIN(__builtin_ppc_mtfsb1, "vUIi", "")
+BUILTIN(__builtin_ppc_mffs, "d", "")
 TARGET_BUILTIN(__builtin_ppc_mffsl, "d", "", "isa-v30-instructions")
 BUILTIN(__builtin_ppc_mtfsf, "vUIiUi", "")
 BUILTIN(__builtin_ppc_mtfsfi, "vUIiUIi", "")
+BUILTIN(__builtin_ppc_set_fpscr_rn, "di", "")
 TARGET_BUILTIN(__builtin_ppc_insert_exp, "ddULLi", "", "power9-vector")
 BUILTIN(__builtin_ppc_fmsub, "dddd", "")
 BUILTIN(__builtin_ppc_fmsubs, "ffff", "")
diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index e0abc069032e1ce..34fe9df60cea473 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -264,6 +264,10 @@ static void defineXLCompatMacros(MacroBuilder &Builder) {
   Builder.defineMacro("__builtin_minfe", "__builtin_ppc_minfe");
   Builder.defineMacro("__builtin_minfl", "__builtin_ppc_minfl");
   Builder.defineMacro("__builtin_minfs", "__builtin_ppc_minfs");
+  Builder.defineMacro("__builtin_mffs", "__builtin_ppc_mffs");
+  Builder.defineMacro("__builtin_mffsl", "__builtin_ppc_mffsl");
+  Builder.defineMacro("__builtin_mtfsf", "__builtin_ppc_mtfsf");
+  Builder.defineMacro("__builtin_set_fpscr_rn", "__builtin_ppc_set_fpscr_rn");
 }
 
 /// PPCTargetInfo::getTargetDefines - Return a set of the PowerPC-specific
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c175f274319b8c4..f64cb995fa1898a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17237,6 +17237,11 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     Value *Op1 = EmitScalarExpr(E->getArg(1));
     return Builder.CreateFDiv(Op0, Op1, "swdiv");
   }
+  case PPC::BI__builtin_ppc_set_fpscr_rn:
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_setrnd),
+                              {EmitScalarExpr(E->getArg(0))});
+  case PPC::BI__builtin_ppc_mffs:
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_readflm));
   }
 }
 
diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h
index 349b395c4f00b92..19cdecb18d2b83f 100644
--- a/clang/lib/Headers/ppc_wrappers/smmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h
@@ -14,7 +14,7 @@
 
 #ifndef NO_WARN_X86_INTRINSICS
 /* This header is distributed to simplify porting x86_64 code that
-   makes explicit use of Intel intrinsics to powerp64/powerpc64le.
+   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
 
    It is the user's responsibility to determine if the results are
    acceptable and make additional changes as necessary.
@@ -68,10 +68,10 @@ extern __inline __m128d
     __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
 #else
-    __fpscr_save.__fr = __builtin_mffs();
+    __fpscr_save.__fr = __builtin_ppc_mffs();
     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
     __fpscr_save.__fpscr &= ~0xf8;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
 #endif
     /* Insert an artificial "read/write" reference to the variable
        read below, to ensure the compiler does not schedule
@@ -83,10 +83,15 @@ extern __inline __m128d
 
   switch (__rounding) {
   case _MM_FROUND_TO_NEAREST_INT:
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
     __attribute__((fallthrough));
   case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
-    __builtin_set_fpscr_rn(0b00);
+    __builtin_ppc_set_fpscr_rn(0b00);
     /* Insert an artificial "read/write" reference to the variable
        read below, to ensure the compiler does not schedule
        a read/use of the variable before the FPSCR is modified, above.
@@ -102,7 +107,7 @@ extern __inline __m128d
        This can be removed if and when GCC PR102783 is fixed.
      */
     __asm__("" : : "wa"(__r));
-    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
+    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
     break;
   case _MM_FROUND_TO_NEG_INF:
   case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
@@ -128,9 +133,14 @@ extern __inline __m128d
      */
     __asm__("" : : "wa"(__r));
     /* Restore enabled exceptions.  */
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
     __fpscr_save.__fpscr |= __enables_save.__fpscr;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
   }
   return (__m128d)__r;
 }
@@ -159,10 +169,10 @@ extern __inline __m128
     __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
 #else
-    __fpscr_save.__fr = __builtin_mffs();
+    __fpscr_save.__fr = __builtin_ppc_mffs();
     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
     __fpscr_save.__fpscr &= ~0xf8;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
 #endif
     /* Insert an artificial "read/write" reference to the variable
        read below, to ensure the compiler does not schedule
@@ -174,10 +184,15 @@ extern __inline __m128
 
   switch (__rounding) {
   case _MM_FROUND_TO_NEAREST_INT:
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
     __attribute__((fallthrough));
   case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
-    __builtin_set_fpscr_rn(0b00);
+    __builtin_ppc_set_fpscr_rn(0b00);
     /* Insert an artificial "read/write" reference to the variable
        read below, to ensure the compiler does not schedule
        a read/use of the variable before the FPSCR is modified, above.
@@ -193,7 +208,7 @@ extern __inline __m128
        This can be removed if and when GCC PR102783 is fixed.
      */
     __asm__("" : : "wa"(__r));
-    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
+    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
     break;
   case _MM_FROUND_TO_NEG_INF:
   case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
@@ -219,9 +234,14 @@ extern __inline __m128
      */
     __asm__("" : : "wa"(__r));
     /* Restore enabled exceptions.  */
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
     __fpscr_save.__fpscr |= __enables_save.__fpscr;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
   }
   return (__m128)__r;
 }
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc.c b/clang/test/CodeGen/PowerPC/builtins-ppc.c
index ccc91b6560845e2..c13edf44cdcbd2a 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc.c
@@ -1,5 +1,8 @@
 // REQUIRES: powerpc-registered-target
-// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - \
+// RUN:   | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - \
+// RUN:   -target-cpu pwr9 | FileCheck %s --check-prefixes=P9,CHECK
 
 void test_eh_return_data_regno()
 {
@@ -26,6 +29,9 @@ void test_builtin_ppc_setrnd() {
 
   // CHECK: call double @llvm.ppc.setrnd(i32 %2)
   res = __builtin_setrnd(x);
+
+  // CHECK: call double @llvm.ppc.setrnd(i32 %4)
+  res = __builtin_ppc_set_fpscr_rn(x);
 }
 
 void test_builtin_ppc_flm() {
@@ -33,7 +39,10 @@ void test_builtin_ppc_flm() {
   // CHECK: call double @llvm.ppc.readflm()
   res = __builtin_readflm();
 
-  // CHECK: call double @llvm.ppc.setflm(double %1)
+  // CHECK: call double @llvm.ppc.readflm()
+  res = __builtin_ppc_mffs();
+
+  // CHECK: call double @llvm.ppc.setflm(double %2)
   res = __builtin_setflm(res);
 
 #ifdef _ARCH_PWR9
diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
index e2d26e611ac81c4..15d291496c20a4a 100644
--- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
@@ -8,6 +8,11 @@
 // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10
 
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
 // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
diff --git a/clang/test/CodeGen/PowerPC/ppc-mmintrin.c b/clang/test/CodeGen/PowerPC/ppc-mmintrin.c
index 4cb5b8540092f9b..1dc6292ae3244c3 100644
--- a/clang/test/CodeGen/PowerPC/ppc-mmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-mmintrin.c
@@ -9,6 +9,11 @@
 // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n| FileCheck %s --check-prefixes=CHECK-P9,CHECK,CHECK-LE
 
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 // RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P8,CHECK,CHECK-BE
 // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
diff --git a/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c
index 39194427978ad42..6e152c549498d23 100644
--- a/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c
@@ -13,6 +13,9 @@
 // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
 
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 #include <pmmintrin.h>
 
 __m128d resd, md1, md2;
diff --git a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
index 220b65c1ce16495..7daef71a61c329e 100644
--- a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
@@ -15,6 +15,11 @@
 // RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
 
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
 // RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
@@ -239,44 +244,48 @@ test_round() {
 // CHECK-LABEL: @test_round
 
 // CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
-// CHECK: call signext i32 @__builtin_mffs()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
 // CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0)
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call double @llvm.ppc.setrnd(i32 0)
 // CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
 // CHECK: call <4 x float> @vec_rint(float vector[4])
 // CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}})
 // CHECK: call <4 x float> @vec_floor(float vector[4])
 // CHECK: call <4 x float> @vec_ceil(float vector[4])
 // CHECK: call <4 x float> @vec_trunc(float vector[4])
 // CHECK: call <4 x float> @vec_rint(float vector[4])
 // CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
 
 // CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
 // CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
 // CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
 
 // CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
-// CHECK: call signext i32 @__builtin_mffs()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
 // CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0)
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call double @llvm.ppc.setrnd(i32 0)
 // CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
 // CHECK: call <2 x double> @vec_rint(double vector[2])
 // CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}})
 // CHECK: call <2 x double> @vec_floor(double vector[2])
 // CHECK: call <2 x double> @vec_ceil(double vector[2])
 // CHECK: call <2 x double> @vec_trunc(double vector[2])
 // CHECK: call <2 x double> @vec_rint(double vector[2])
 // CHECK: call void asm sideeffect "", "^wa"
-// CHECK: call signext i32 @__builtin_mffsl()
-// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call double @llvm.ppc.readflm()
+// P10: call double @llvm.ppc.mffsl()
+// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
 
 // CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
 // CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
diff --git a/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
index 60633e34b56b9a1..40d3839dcf026f7 100644
--- a/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
@@ -13,6 +13,9 @@
 // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
 
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 #include <tmmintrin.h>
 
 __m64 res, m1, m2;
diff --git a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
index 238ce7c7ee574ef..ac90a5f8c530ba4 100644
--- a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
@@ -12,6 +12,9 @@
 // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
 
+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 #include <x86gprintrin.h>
 
 unsigned short us;

JeremyRand · 2023-10-02T17:41:18Z

ACK 5abcf8964394eb5124d606b00a94781147fd8f8d. I can confirm that this fixes SSE4.1 support in ncnn for POWER9 LE (and gives a nice 3x speedup when benchmarking ncnn, which indicates that the additional intrinsics are actually getting used). Testing was done by applying the patch to the Clang package from Fedora 38's LLVM snapshot Copr.

I didn't attempt to test POWER8.

stefanp-synopsys

I think this makes sense to me overall.
Thank you for adding the tests for C++ as well!

I only had a couple of comments about this patch.

stefanp-synopsys · 2023-10-06T17:14:07Z

clang/lib/Headers/ppc_wrappers/smmintrin.h

@@ -68,10 +68,10 @@ extern __inline __m128d
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
 #else
-    __fpscr_save.__fr = __builtin_mffs();
+    __fpscr_save.__fr = __builtin_ppc_mffs();


I don't believe that this builtin needs to be renamed. On clang both __builtin_mffs and __builtin_ppc_mffs work.

Also, this is the same for the other 3 builtins.

When you update these names you will probably also have to update the ppc-smmintrin.c test as well.

__builtin_mffs aliases to __builtin_ppc_mffs through macro. But the compat macros do not always work. In the test cases using -ffreestanding or targeting non-AIX non-Linux OSes, the macros will not be defined.

Okay, that's fair.

Should we remove __fpscr_save.__fpscr &= 0x70007f0ffL;? I suspect it may break some assumption of following code

I'm not sure that there are any assumptions broken here. For example, __builtin_ppc_set_fpscr_rn only uses the last two bits for the rounding control and masks off the rest anyway. Also, __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); is using the mask 0b00000011 so it only uses the last 8 bits.

stefanp-synopsys · 2023-10-06T17:49:14Z

clang/lib/Headers/ppc_wrappers/smmintrin.h

+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;


On GCC the builtin __builtin_mffsl also expands on pre-Power9 architectures and uses an expansion similar to this behind the scenes where mffs is used instead. I feel like we should do the same thing in order to behave in the same way as they do and in order to avoid having the user write this kind of code.

Implemented by 2c7688c

smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones. This fixes issue llvm#64664.

nemanjai · 2023-10-12T19:07:11Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

@@ -11595,6 +11595,50 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  llvm_unreachable("ERROR:Should return for all cases within swtich.");
 }

+// Lower mffsl intrinsic with mffs in targets without ISA 3.0
+static SDValue lowerMFFSL(SDValue Op, SelectionDAG &DAG,


I wonder if we actually need this. The reason mffsl exists is because it is a lightweight version of mffs. In order to make it lightweight, the instruction only extracts some bits from the FPSCR.
So in order to match the semantics, we end up doing the heavy weight instruction, materializing a 64-bit constant, moving to a GPR, masking out the bits and then moving it back to an FPR. So a user's attempt to use the lightweight version ends up costing them more than the heavy weight version on older CPU's.
Can we not just reject it on older CPU's and force the user to use the heavy weight instruction?

The reason mffsl exists is because it is a lightweight version of mffs.

Thanks, according to the 'lightweight' meaning, this sounds reasonable. I don't have strong preference to align with GCC behavior. We have builtins only for P9 which can't be or haven't been emulated.

This reverts commit 2c7688c.

nemanjai

I personally prefer this solution. @stefanp-ibm What do you think?

LGTM

stefanp-synopsys · 2023-10-20T14:15:16Z

I personally prefer this solution. @stefanp-ibm What do you think?

LGTM

I understand your point about forcing users to just use mffs when mffsl is not available. If that's the case do we also want to change this:

#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif

and just use:

#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
   // Since mffsl is not available on anything before Power 9 use the longer latency instruction.
    __fpscr_save.__fr = __builtin_ppc_mffs();
#endif

stefanp-synopsys

With respect to the comment from Nemanja I'm okay with this not being 100% the same as GCC. I'm sure we will get questions in the future but really we should discourage the use of __builtin_mffsl on pre-Power 9.

stefanp-synopsys · 2023-10-20T13:48:07Z

clang/lib/Headers/ppc_wrappers/smmintrin.h

@@ -68,10 +68,10 @@ extern __inline __m128d
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
 #else
-    __fpscr_save.__fr = __builtin_mffs();
+    __fpscr_save.__fr = __builtin_ppc_mffs();


Okay, that's fair.

stefanp-synopsys

I'm going to approve this patch.
I don't think that removing the mask __fpscr_save.__fpscr &= 0x70007f0ffL; in smmintrin.h with have issues but I don't feel strongly enough about it to block this patch. If you are not convinced that it is safe you can leave the masks in.

LGTM.

stefanp-synopsys · 2023-10-24T16:56:08Z

clang/lib/Headers/ppc_wrappers/smmintrin.h

@@ -68,10 +68,10 @@ extern __inline __m128d
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
 #else
-    __fpscr_save.__fr = __builtin_mffs();
+    __fpscr_save.__fr = __builtin_ppc_mffs();


I'm not sure that there are any assumptions broken here. For example, __builtin_ppc_set_fpscr_rn only uses the last two bits for the rounding control and masks off the rest anyway. Also, __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); is using the mask 0b00000011 so it only uses the last 8 bits.

smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones.

llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics clang:codegen IR generation bugs: mangling, exceptions, etc. labels Sep 25, 2023

RKSimon removed the backend:X86 label Sep 25, 2023

ecnelises requested review from lei137, chenzheng1030, nemanjai, stefanp-synopsys and tuliom September 28, 2023 09:40

stefanp-synopsys reviewed Oct 6, 2023

View reviewed changes

ecnelises added 2 commits October 9, 2023 15:32

[PowerPC] Fix use of FPSCR builtins in smmintrin.h

2d62858

smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones. This fixes issue llvm#64664.

Implement mffsl in pre-Power9 targets

2c7688c

ecnelises force-pushed the fix_64664 branch from 5abcf89 to 2c7688c Compare October 10, 2023 04:11

llvmbot added the backend:X86 label Oct 10, 2023

ecnelises added backend:PowerPC and removed backend:X86 labels Oct 10, 2023

nemanjai reviewed Oct 12, 2023

View reviewed changes

ecnelises added 2 commits October 16, 2023 17:15

Merge commit '7571f27ed768530f3fe9707d310c83d5a687ea16' into fix_64664

f80fbd7

Revert "Implement mffsl in pre-Power9 targets"

c50491e

This reverts commit 2c7688c.

llvmbot added the backend:X86 label Oct 16, 2023

nemanjai approved these changes Oct 19, 2023

View reviewed changes

stefanp-synopsys reviewed Oct 20, 2023

View reviewed changes

stefanp-synopsys approved these changes Oct 25, 2023

View reviewed changes

ecnelises merged commit de7c006 into llvm:main Oct 26, 2023

		__fpscr_save.__fr = __builtin_ppc_mffs();
		__fpscr_save.__fpscr &= 0x70007f0ffL;

[PowerPC] Fix use of FPSCR builtins in smmintrin.h #67299

[PowerPC] Fix use of FPSCR builtins in smmintrin.h #67299

Uh oh!

Conversation

ecnelises commented Sep 25, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Sep 25, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

JeremyRand commented Oct 2, 2023

Uh oh!

stefanp-synopsys left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

nemanjai left a comment

Choose a reason for hiding this comment

Uh oh!

stefanp-synopsys commented Oct 20, 2023

Uh oh!

stefanp-synopsys left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

stefanp-synopsys left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

ecnelises commented Sep 25, 2023 •

edited

Loading

llvmbot commented Sep 25, 2023 •

edited

Loading