@@ -27096,21 +27096,37 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
              : AtomicExpansionKind::LLSC;
 }
 
+// Return true if the atomic operation expansion will lower to use a library
+// call, and is thus ineligible to use an LLSC expansion.
+static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
+                                   const AtomicRMWInst *RMW) {
+  if (!RMW->isFloatingPointOperation())
+    return false;
+  switch (RMW->getType()->getScalarType()->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::HalfTyID:
+  case Type::BFloatTyID:
+    // Will use soft float
+    return !Subtarget.hasFPARMv8();
+  default:
+    // fp128 will emit library calls.
+    return true;
+  }
+
+  llvm_unreachable("covered type switch");
+}
+
 // The "default" for integer RMW operations is to expand to an LL/SC loop.
 // However, with the LSE instructions (or outline-atomics mode, which provides
 // library routines in place of the LSE-instructions), we can directly emit many
 // operations instead.
-//
-// Floating-point operations are always emitted to a cmpxchg loop, because they
-// may trigger a trap which aborts an LLSC sequence.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
-  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  Type *Ty = AI->getType();
+  unsigned Size = Ty->getPrimitiveSizeInBits();
   assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
 
-  if (AI->isFloatingPointOperation())
-    return AtomicExpansionKind::CmpXChg;
-
   bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
                       (AI->getOperation() == AtomicRMWInst::Xchg ||
                        AI->getOperation() == AtomicRMWInst::Or ||
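(Illustrative only, not part of the patch.) In LLVM IR terms, these are the kinds of atomicrmw the new rmwOpMayLowerToLibcall predicate is meant to flag; the function names below are made up for the example. An fp128 operation always goes through a library call for the arithmetic, so it can never sit inside an LL/SC pair; half/float/double/bfloat only do so under soft float (no FPARMv8).

    ; fp128 fadd always becomes a library call, so LL/SC is off the table.
    define fp128 @example_rmw_f128(ptr %p, fp128 %v) {
      %old = atomicrmw fadd ptr %p, fp128 %v seq_cst
      ret fp128 %old
    }

    ; float fadd is only a library call under soft float; with FPARMv8 it may use LL/SC.
    define float @example_rmw_f32(ptr %p, float %v) {
      %old = atomicrmw fadd ptr %p, float %v seq_cst
      ret float %old
    }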
@@ -27120,7 +27136,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   // Nand is not supported in LSE.
   // Leave 128 bits to LLSC or CmpXChg.
-  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
+  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
+      !AI->isFloatingPointOperation()) {
     if (Subtarget->hasLSE())
       return AtomicExpansionKind::None;
     if (Subtarget->outlineAtomics()) {
@@ -27146,7 +27163,7 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
   // we have a single CAS instruction that can replace the loop.
   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
-      Subtarget->hasLSE())
+      Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
     return AtomicExpansionKind::CmpXChg;
 
   return AtomicExpansionKind::LLSC;
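(Sketch, not from the patch.) For the cases that still take the CmpXChg path (-O0, LSE available, or a would-be library call), AtomicExpandPass emits a compare-and-swap loop roughly along these lines; block and value names are assumed, and the FP math in the loop body is free to become a library call because it is not bracketed by LL/SC instructions:

    define fp128 @cas_loop_sketch(ptr %p, fp128 %v) {
    entry:
      %init = load fp128, ptr %p, align 16
      br label %loop
    loop:
      %old = phi fp128 [ %init, %entry ], [ %seen, %loop ]
      %sum = fadd fp128 %old, %v            ; may lower to a library call
      %old.i = bitcast fp128 %old to i128
      %sum.i = bitcast fp128 %sum to i128
      %pair = cmpxchg ptr %p, i128 %old.i, i128 %sum.i seq_cst seq_cst
      %seen.i = extractvalue { i128, i1 } %pair, 0
      %ok = extractvalue { i128, i1 } %pair, 1
      %seen = bitcast i128 %seen.i to fp128
      br i1 %ok, label %done, label %loop
    done:
      ret fp128 %old
    }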
@@ -27193,10 +27210,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
 
     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
-    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
-    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
-    return Builder.CreateOr(
-        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
+
+    auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
+    Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
+    Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
+
+    Value *Or = Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
+    return Builder.CreateBitCast(Or, ValueTy);
   }
 
   Type *Tys[] = { Addr->getType() };
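(Assumed-shape sketch, not emitted verbatim by the patch.) In IR terms, the rewritten 128-bit path reassembles the two ldxp/ldaxp halves as i128 and only then bitcasts to the requested value type, so an FP ValueTy such as fp128 no longer feeds the zext/shl/or directly:

    declare { i64, i64 } @llvm.aarch64.ldxp(ptr)

    define fp128 @ll_load_f128(ptr %addr) {
      %lohi = call { i64, i64 } @llvm.aarch64.ldxp(ptr %addr)
      %lo = extractvalue { i64, i64 } %lohi, 0
      %hi = extractvalue { i64, i64 } %lohi, 1
      %lo64 = zext i64 %lo to i128
      %hi64 = zext i64 %hi to i128
      %hi.shl = shl i128 %hi64, 64
      %val64 = or i128 %lo64, %hi.shl
      %val = bitcast i128 %val64 to fp128
      ret fp128 %val
    }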
@@ -27207,8 +27228,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
   const DataLayout &DL = M->getDataLayout();
   IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
   CallInst *CI = Builder.CreateCall(Ldxr, Addr);
-  CI->addParamAttr(
-      0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
+  CI->addParamAttr(0, Attribute::get(Builder.getContext(),
+                                     Attribute::ElementType, IntEltTy));
   Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
 
   return Builder.CreateBitCast(Trunc, ValueTy);
@@ -27234,9 +27255,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
     Function *Stxr = Intrinsic::getDeclaration(M, Int);
     Type *Int64Ty = Type::getInt64Ty(M->getContext());
+    Type *Int128Ty = Type::getInt128Ty(M->getContext());
+
+    Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
 
-    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
-    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+    Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
+    Value *Hi =
+        Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
     return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
   }
 
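(Likewise a sketch with assumed names.) The store-conditional side now mirrors the load: the value is bitcast to i128 first and then split into the two stxp/stlxp operands, instead of shifting and truncating the FP-typed value itself:

    declare i32 @llvm.aarch64.stxp(i64, i64, ptr)

    define i32 @sc_store_f128(ptr %addr, fp128 %val) {
      %cast = bitcast fp128 %val to i128
      %lo = trunc i128 %cast to i64
      %hi.shr = lshr i128 %cast, 64
      %hi = trunc i128 %hi.shr to i64
      %status = call i32 @llvm.aarch64.stxp(i64 %lo, i64 %hi, ptr %addr)
      ret i32 %status
    }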