Skip to content

SystemZ: Stop casting fp typed atomic loads in the IR #90768

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 43 additions & 6 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// the atomic operations in order to exploit SystemZ instructions.
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_LOAD, MVT::f128, Custom);

// Mark sign/zero extending atomic loads as legal, which will make
// DAGCombiner fold extensions into atomic loads if possible.
Expand Down Expand Up @@ -935,9 +936,6 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const

TargetLowering::AtomicExpansionKind
SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
// Lower fp128 the same way as i128.
if (LI->getType()->isFP128Ty())
return AtomicExpansionKind::CastToInteger;
return AtomicExpansionKind::None;
}

Expand Down Expand Up @@ -4550,7 +4548,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
assert(
(Node->getMemoryVT() == MVT::i128 || Node->getMemoryVT() == MVT::f128) &&
"Only custom lowering i128 or f128.");
// Use same code to handle both legal and non-legal i128 types.
SmallVector<SDValue, 2> Results;
LowerOperationWrapper(Node, Results, DAG);
Expand Down Expand Up @@ -6249,6 +6249,26 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
}
}

// Manually lower a bitcast to avoid introducing illegal types after type
// legalization.
static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
SDValue Chain, const SDLoc &SL) {
SDValue Hi =
DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::i64, Src);
SDValue Lo =
DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::i64, Src);

Hi = DAG.getBitcast(MVT::f64, Hi);
Lo = DAG.getBitcast(MVT::f64, Lo);

SDNode *Pair = DAG.getMachineNode(
SystemZ::REG_SEQUENCE, SL, MVT::f128,
{DAG.getTargetConstant(SystemZ::FP128BitRegClassID, SL, MVT::i32), Lo,
DAG.getTargetConstant(SystemZ::subreg_l64, SL, MVT::i32), Hi,
DAG.getTargetConstant(SystemZ::subreg_h64, SL, MVT::i32)});
return SDValue(Pair, 0);
}

// Lower operations with invalid operand or result types (currently used
// only for 128-bit integer types).
void
Expand All @@ -6263,8 +6283,25 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
DL, Tys, Ops, MVT::i128, MMO);
Results.push_back(lowerGR128ToI128(DAG, Res));
Results.push_back(Res.getValue(1));

EVT VT = N->getValueType(0);

if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
SDValue Lowered = lowerGR128ToI128(DAG, Res);
Results.push_back(DAG.getBitcast(VT, Lowered));
Results.push_back(Res.getValue(1));
} else {
// For the f128 case, after type legalization, we cannot produce a bitcast
// with an illegal type (i.e. i128), so manually lower it.
//
// FIXME: Really v2i64 should be legal, and should be used in place of
// unttyped. Then we could emit the bitcast which will potentially fold
// into the use.
SDValue Cast = expandBitCastI128ToF128(DAG, Res, Res.getValue(1), DL);
Results.push_back(Cast);
Results.push_back(Res.getValue(1));
}

break;
}
case ISD::ATOMIC_STORE: {
Expand Down
33 changes: 17 additions & 16 deletions llvm/test/CodeGen/SystemZ/atomic-load-08.ll
Original file line number Diff line number Diff line change
@@ -1,20 +1,27 @@
; Test long double atomic loads. These are emitted by the Clang FE as i128
; loads with a bitcast, and this test case gets converted into that form as
; well by the AtomicExpand pass.
; Test long double atomic loads.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s

; TODO: Is it worth testing softfp with vector?
; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s

; FIXME: Without vector support, v2i64 should be legal and we should
; introduce a simple bitcast, which could fold into the store use
; avoid the intermediate f registers.
define void @f1(ptr %ret, ptr %src) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; CHECK-NEXT: lpq %r0, 0(%r3)
; CHECK-NEXT: stg %r1, 8(%r2)
; CHECK-NEXT: stg %r0, 0(%r2)
; CHECK-NEXT: br %r14
; Z13-NEXT: lpq %r0, 0(%r3)
; Z13-NEXT: stg %r1, 8(%r2)
; Z13-NEXT: stg %r0, 0(%r2)
; Z13-NEXT: br %r14

; BASE: lpq %r0, 0(%r3)
; BASE-NEXT: ldgr %f0, %r0
; BASE-NEXT: ldgr %f2, %r1
; BASE-NEXT: std %f0, 0(%r2)
; BASE-NEXT: std %f2, 8(%r2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to short-circuit the LDGR/STD to just an STG here. We actually do this already for spills in foldMemoryOperandImpl, but not for regular stores. But this can be done separately.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also should go away with the legal v2i64 fix I posted

; BASE-NEXT: br %r14

; SOFTFP-LABEL: f1:
; SOFTFP: # %bb.0:
Expand All @@ -30,23 +37,17 @@ define void @f1(ptr %ret, ptr %src) {
define void @f1_fpuse(ptr %ret, ptr %src) {
; CHECK-LABEL: f1_fpuse:
; CHECK: # %bb.0:
; BASE-NEXT: aghi %r15, -176
; BASE-NEXT: .cfi_def_cfa_offset 336

; CHECK-NEXT: lpq %r0, 0(%r3)

; BASE-NEXT: stg %r1, 168(%r15)
; BASE-NEXT: stg %r0, 160(%r15)
; BASE-NEXT: ld %f0, 160(%r15)
; BASE-NEXT: ld %f2, 168(%r15)
; BASE-NEXT: ldgr %f0, %r0
; BASE-NEXT: ldgr %f2, %r1

; Z13-NEXT: vlvgp %v0, %r0, %r1
; Z13-NEXT: vrepg %v2, %v0, 1

; CHECK-NEXT: axbr %f0, %f0
; CHECK-NEXT: std %f0, 0(%r2)
; CHECK-NEXT: std %f2, 8(%r2)
; BASE-NEXT: aghi %r15, 176
; CHECK-NEXT: br %r14


Expand Down