Skip to content

[PowerPC] Fix use of FPSCR builtins in smmintrin.h #67299

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion clang/include/clang/Basic/BuiltinsPPC.def
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,11 @@ TARGET_BUILTIN(__builtin_ppc_extract_exp, "Uid", "", "power9-vector")
TARGET_BUILTIN(__builtin_ppc_extract_sig, "ULLid", "", "power9-vector")
BUILTIN(__builtin_ppc_mtfsb0, "vUIi", "")
BUILTIN(__builtin_ppc_mtfsb1, "vUIi", "")
TARGET_BUILTIN(__builtin_ppc_mffsl, "d", "", "isa-v30-instructions")
BUILTIN(__builtin_ppc_mffs, "d", "")
TARGET_BUILTIN(__builtin_ppc_mffsl, "d", "", "")
BUILTIN(__builtin_ppc_mtfsf, "vUIiUi", "")
BUILTIN(__builtin_ppc_mtfsfi, "vUIiUIi", "")
BUILTIN(__builtin_ppc_set_fpscr_rn, "di", "")
TARGET_BUILTIN(__builtin_ppc_insert_exp, "ddULLi", "", "power9-vector")
BUILTIN(__builtin_ppc_fmsub, "dddd", "")
BUILTIN(__builtin_ppc_fmsubs, "ffff", "")
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/Basic/Targets/PPC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,10 @@ static void defineXLCompatMacros(MacroBuilder &Builder) {
Builder.defineMacro("__builtin_minfe", "__builtin_ppc_minfe");
Builder.defineMacro("__builtin_minfl", "__builtin_ppc_minfl");
Builder.defineMacro("__builtin_minfs", "__builtin_ppc_minfs");
Builder.defineMacro("__builtin_mffs", "__builtin_ppc_mffs");
Builder.defineMacro("__builtin_mffsl", "__builtin_ppc_mffsl");
Builder.defineMacro("__builtin_mtfsf", "__builtin_ppc_mtfsf");
Builder.defineMacro("__builtin_set_fpscr_rn", "__builtin_ppc_set_fpscr_rn");
}

/// PPCTargetInfo::getTargetDefines - Return a set of the PowerPC-specific
Expand Down
5 changes: 5 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17258,6 +17258,11 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
Value *Op1 = EmitScalarExpr(E->getArg(1));
return Builder.CreateFDiv(Op0, Op1, "swdiv");
}
case PPC::BI__builtin_ppc_set_fpscr_rn:
return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_setrnd),
{EmitScalarExpr(E->getArg(0))});
case PPC::BI__builtin_ppc_mffs:
return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_readflm));
}
}

Expand Down
30 changes: 15 additions & 15 deletions clang/lib/Headers/ppc_wrappers/smmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerp64/powerpc64le.
makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Expand Down Expand Up @@ -68,10 +68,10 @@ extern __inline __m128d
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs();
__fpscr_save.__fr = __builtin_ppc_mffs();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe that this builtin needs to be renamed. On clang both __builtin_mffs and __builtin_ppc_mffs work.

Also, this is the same for the other 3 builtins.

When you update these names you will probably also have to update the ppc-smmintrin.c test as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__builtin_mffs is aliased to __builtin_ppc_mffs through a macro, but the compat macros are not always available. In test cases that use -ffreestanding or that target non-AIX, non-Linux OSes, the macros will not be defined.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, that's fair.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we remove `__fpscr_save.__fpscr &= 0x70007f0ffL;`? I suspect it may break some assumptions in the code that follows.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure that there are any assumptions broken here. For example, __builtin_ppc_set_fpscr_rn only uses the last two bits for the rounding control and masks off the rest anyway. Also, __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); is using the mask 0b00000011 so it only uses the last 8 bits.

__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
__builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
Expand All @@ -83,10 +83,10 @@ extern __inline __m128d

switch (__rounding) {
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl();
__fpscr_save.__fr = __builtin_ppc_mffsl();
__attribute__((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn(0b00);
__builtin_ppc_set_fpscr_rn(0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
Expand All @@ -102,7 +102,7 @@ extern __inline __m128d
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
__builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
Expand All @@ -128,9 +128,9 @@ extern __inline __m128d
*/
__asm__("" : : "wa"(__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl();
__fpscr_save.__fr = __builtin_ppc_mffsl();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
__builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
}
return (__m128d)__r;
}
Expand Down Expand Up @@ -159,10 +159,10 @@ extern __inline __m128
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs();
__fpscr_save.__fr = __builtin_ppc_mffs();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
__builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
Expand All @@ -174,10 +174,10 @@ extern __inline __m128

switch (__rounding) {
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl();
__fpscr_save.__fr = __builtin_ppc_mffsl();
__attribute__((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn(0b00);
__builtin_ppc_set_fpscr_rn(0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
Expand All @@ -193,7 +193,7 @@ extern __inline __m128
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
__builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
Expand All @@ -219,9 +219,9 @@ extern __inline __m128
*/
__asm__("" : : "wa"(__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl();
__fpscr_save.__fr = __builtin_ppc_mffsl();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
__builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
}
return (__m128)__r;
}
Expand Down
15 changes: 10 additions & 5 deletions clang/test/CodeGen/PowerPC/builtins-ppc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// REQUIRES: powerpc-registered-target
// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - \
// RUN: | FileCheck %s

void test_eh_return_data_regno()
{
Expand All @@ -26,20 +27,24 @@ void test_builtin_ppc_setrnd() {

// CHECK: call double @llvm.ppc.setrnd(i32 %2)
res = __builtin_setrnd(x);

// CHECK: call double @llvm.ppc.setrnd(i32 %4)
res = __builtin_ppc_set_fpscr_rn(x);
}

void test_builtin_ppc_flm() {
volatile double res;
// CHECK: call double @llvm.ppc.readflm()
res = __builtin_readflm();

// CHECK: call double @llvm.ppc.setflm(double %1)
// CHECK: call double @llvm.ppc.readflm()
res = __builtin_ppc_mffs();

// CHECK: call double @llvm.ppc.setflm(double %2)
res = __builtin_setflm(res);

#ifdef _ARCH_PWR9
// P9: call double @llvm.ppc.mffsl()
// CHECK: call double @llvm.ppc.mffsl()
res = __builtin_ppc_mffsl();
#endif
}

double test_builtin_unpack_ldbl(long double x) {
Expand Down
5 changes: 5 additions & 0 deletions clang/test/CodeGen/PowerPC/ppc-emmintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10

// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only

// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
Expand Down
5 changes: 5 additions & 0 deletions clang/test/CodeGen/PowerPC/ppc-mmintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n| FileCheck %s --check-prefixes=CHECK-P9,CHECK,CHECK-LE

// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only

// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P8,CHECK,CHECK-BE
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
Expand Down
3 changes: 3 additions & 0 deletions clang/test/CodeGen/PowerPC/ppc-pmmintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s

// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only

#include <pmmintrin.h>

__m128d resd, md1, md2;
Expand Down
33 changes: 19 additions & 14 deletions clang/test/CodeGen/PowerPC/ppc-smmintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10

// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only

// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
Expand Down Expand Up @@ -239,44 +244,44 @@ test_round() {
// CHECK-LABEL: @test_round

// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 @__builtin_mffs()
// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call double @llvm.ppc.readflm()
// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
// CHECK: call signext i32 @__builtin_mffsl()
// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0)
// CHECK: call double @llvm.ppc.mffsl()
// CHECK: call double @llvm.ppc.setrnd(i32 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @vec_floor(float vector[4])
// CHECK: call <4 x float> @vec_ceil(float vector[4])
// CHECK: call <4 x float> @vec_trunc(float vector[4])
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 @__builtin_mffsl()
// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call double @llvm.ppc.mffsl()
// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})

// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0

// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 @__builtin_mffs()
// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call double @llvm.ppc.readflm()
// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
// CHECK: call signext i32 @__builtin_mffsl()
// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0)
// CHECK: call double @llvm.ppc.mffsl()
// CHECK: call double @llvm.ppc.setrnd(i32 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @vec_floor(double vector[2])
// CHECK: call <2 x double> @vec_ceil(double vector[2])
// CHECK: call <2 x double> @vec_trunc(double vector[2])
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 @__builtin_mffsl()
// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call double @llvm.ppc.mffsl()
// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}})

// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
Expand Down
3 changes: 3 additions & 0 deletions clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE

// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only

#include <tmmintrin.h>

__m64 res, m1, m2;
Expand Down
3 changes: 3 additions & 0 deletions clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s

// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only

#include <x86gprintrin.h>

unsigned short us;
Expand Down
54 changes: 51 additions & 3 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,8 +646,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);

// To handle counter-based loop conditions.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
Expand Down Expand Up @@ -11595,6 +11595,50 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("ERROR:Should return for all cases within swtich.");
}

// Lower mffsl intrinsic with mffs in targets without ISA 3.0
static SDValue lowerMFFSL(SDValue Op, SelectionDAG &DAG,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we actually need this. The reason mffsl exists is that it is a lightweight version of mffs. To make it lightweight, the instruction extracts only some of the bits from the FPSCR.
So in order to match the semantics, we end up executing the heavyweight instruction, materializing a 64-bit constant, moving the value to a GPR, masking out the bits, and then moving it back to an FPR. A user's attempt to use the lightweight version therefore ends up costing them more than the heavyweight version on older CPUs.
Can we not just reject it on older CPUs and force the user to use the heavyweight instruction?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason mffsl exists is because it is a lightweight version of mffs.

Thanks. Given that mffsl is meant to be the lightweight form, this sounds reasonable. I don't have a strong preference for aligning with GCC's behavior. We already have builtins that are available only on P9 and that can't be, or haven't been, emulated.

const PPCSubtarget &Subtarget) {
// Operand 1 carries the intrinsic ID; this helper only handles ppc_mffsl.
assert(cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue() ==
Intrinsic::ppc_mffsl &&
"Should only be called on int_ppc_mffsl");
// On ISA 3.0 subtargets the node is handed back unchanged; no expansion is
// built here.
if (Subtarget.isISA3_0())
return Op;

SDLoc dl(Op);
// Operand 0 is used as the incoming chain. Emit a PPCISD::MFFS node that
// produces an f64 value plus an updated chain, and continue on that chain.
SDValue Chain = Op.getOperand(0);
SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
Chain = MFFS.getValue(1);

// PPC64 path: reinterpret the f64 as i64, AND it with the mask, and
// reinterpret the result as f64 again.
if (Subtarget.isPPC64()) {
SDValue Int = DAG.getNode(ISD::BITCAST, dl, MVT::i64, MFFS);
// Mask 29-31, 45-51 and 56-63 bits
SDValue Masked = DAG.getNode(ISD::AND, dl, MVT::i64, Int,
DAG.getConstant(0x70007f0ffULL, dl, MVT::i64));
SDValue Cast = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Masked);
// Return both the masked value and the chain produced by the MFFS node.
return DAG.getMergeValues({Cast, Chain}, dl);
}

// Non-PPC64 path: store the f64 to an 8-byte, 8-aligned stack object so it
// can be reloaded and masked as two i32 words.
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo PtrInfo;
int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
SDValue Base = DAG.getFrameIndex(SSFI, MVT::i32);
Chain = DAG.getStore(Chain, dl, MFFS, Base, PtrInfo);

// The word at offset 0 is treated as the high half below, which is why
// little-endian is rejected here.
assert(!Subtarget.isLittleEndian() && "32-bit little endian is unsupported!");
SDValue Offset4 = DAG.getNode(ISD::ADD, dl, MVT::i32, Base,
DAG.getConstant(4, dl, MVT::i32));
// The two loads are chained one after the other, following the store.
SDValue Hi = DAG.getLoad(MVT::i32, dl, Chain, Base, PtrInfo);
SDValue Lo = DAG.getLoad(MVT::i32, dl, Hi.getValue(1), Offset4, PtrInfo);
Chain = Lo.getValue(1);
// Same mask as the 64-bit path (0x70007f0ff), split into its upper word
// (0x7) and lower word (0x7f0ff).
Hi =
DAG.getNode(ISD::AND, dl, MVT::i32, Hi, DAG.getConstant(7, dl, MVT::i32));
Lo = DAG.getNode(ISD::AND, dl, MVT::i32, Lo,
DAG.getConstant(0x7f0ffULL, dl, MVT::i32));
// Write the masked words back to the same slot and reload them as one f64.
Chain = DAG.getStore(Chain, dl, Hi, Base, PtrInfo);
Chain = DAG.getStore(Chain, dl, Lo, Offset4, PtrInfo);
return DAG.getLoad(MVT::f64, dl, Chain, Base, PtrInfo);
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
Expand Down Expand Up @@ -11669,8 +11713,12 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerFP_ROUND(Op, DAG);
case ISD::ROTL: return LowerROTL(Op, DAG);

// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue() ==
Intrinsic::ppc_mffsl)
return lowerMFFSL(Op, DAG, Subtarget);
return SDValue();
}

case ISD::BITCAST: return LowerBITCAST(Op, DAG);

Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/PowerPC/PPCInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3188,7 +3188,6 @@ def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
(TCRETURNri CTRRC:$dst, imm:$imm)>;

def : Pat<(int_ppc_readflm), (MFFS)>;
def : Pat<(int_ppc_mffsl), (MFFSL)>;

// Hi and Lo for Darwin Global Addresses.
def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
Expand Down Expand Up @@ -4510,6 +4509,9 @@ def : Pat<(int_ppc_dcbfl xoaddr:$dst),
def : Pat<(int_ppc_dcbflp xoaddr:$dst),
(DCBF 3, xoaddr:$dst)>;

let Predicates = [IsISA3_0] in
def : Pat<(int_ppc_mffsl), (MFFSL)>;

let Predicates = [IsISA3_1] in {
def DCBFPS : PPCAsmPseudo<"dcbfps $dst", (ins memrr:$dst)>;
def DCBSTPS : PPCAsmPseudo<"dcbstps $dst", (ins memrr:$dst)>;
Expand Down
Loading