Skip to content

[SystemZ] Simplify f128 atomic load/store #90977

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 157 additions & 118 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1551,6 +1551,8 @@ static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
std::tie(Lo, Hi) = DAG.SplitScalar(In, DL, MVT::i64, MVT::i64);
}

// FIXME: If v2i64 were a legal type, we could use it instead of
// Untyped here. This might enable improved folding.
SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
MVT::Untyped, Hi, Lo);
return SDValue(Pair, 0);
Expand Down Expand Up @@ -6247,14 +6249,18 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
}
}

// Manually lower a bitcast to avoid introducing illegal types after type
// legalization.
static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
SDValue Chain, const SDLoc &SL) {
SDValue Hi =
DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::i64, Src);
SDValue Lo =
DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::i64, Src);
const SDLoc &SL) {
// If i128 is legal, just use a normal bitcast.
if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
return DAG.getBitcast(MVT::f128, Src);

// Otherwise, f128 must live in FP128, so do a partwise move.
assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128) ==
&SystemZ::FP128BitRegClass);

SDValue Hi, Lo;
std::tie(Lo, Hi) = DAG.SplitScalar(Src, SL, MVT::i64, MVT::i64);

Hi = DAG.getBitcast(MVT::f64, Hi);
Lo = DAG.getBitcast(MVT::f64, Lo);
Expand All @@ -6267,24 +6273,24 @@ static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
return SDValue(Pair, 0);
}

static std::pair<SDValue, SDValue>
expandBitCastF128ToI128Parts(SelectionDAG &DAG, SDValue Src, const SDLoc &SL) {
static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
const SDLoc &SL) {
// If i128 is legal, just use a normal bitcast.
if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
return DAG.getBitcast(MVT::i128, Src);

// Otherwise, f128 must live in FP128, so do a partwise move.
assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128) ==
&SystemZ::FP128BitRegClass);

SDValue LoFP =
DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::f64, Src);
SDValue HiFP =
DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::f64, Src);
SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i64, LoFP);
SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i64, HiFP);

return {Hi, Lo};
}

static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
const SDLoc &SL) {

auto [Hi, Lo] = expandBitCastF128ToI128Parts(DAG, Src, SL);
SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, SL, MVT::Untyped, Hi, Lo);
return SDValue(Pair, 0);
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i128, Lo, Hi);
}

// Lower operations with invalid operand or result types (currently used
Expand All @@ -6302,38 +6308,20 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
DL, Tys, Ops, MVT::i128, MMO);

EVT VT = N->getValueType(0);

if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
SDValue Lowered = lowerGR128ToI128(DAG, Res);
Results.push_back(DAG.getBitcast(VT, Lowered));
Results.push_back(Res.getValue(1));
} else {
// For the f128 case, after type legalization, we cannot produce a bitcast
// with an illegal type (i.e. i128), so manually lower it.
//
// FIXME: Really v2i64 should be legal, and should be used in place of
// unttyped. Then we could emit the bitcast which will potentially fold
// into the use.
SDValue Cast = expandBitCastI128ToF128(DAG, Res, Res.getValue(1), DL);
Results.push_back(Cast);
Results.push_back(Res.getValue(1));
}

SDValue Lowered = lowerGR128ToI128(DAG, Res);
if (N->getValueType(0) == MVT::f128)
Lowered = expandBitCastI128ToF128(DAG, Lowered, DL);
Results.push_back(Lowered);
Results.push_back(Res.getValue(1));
break;
}
case ISD::ATOMIC_STORE: {
SDLoc DL(N);
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Val = N->getOperand(1);
EVT VT = Val.getValueType();

if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
Val = DAG.getBitcast(MVT::i128, Val);
Val = lowerI128ToGR128(DAG, Val);
} else {
if (Val.getValueType() == MVT::f128)
Val = expandBitCastF128ToI128(DAG, Val, DL);
}
Val = lowerI128ToGR128(DAG, Val);

SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2)};
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
Expand Down Expand Up @@ -6370,21 +6358,7 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 &&
!useSoftFloat()) {
SDLoc DL(N);
SDValue Lo, Hi;
if (getRepRegClassFor(MVT::f128) == &SystemZ::VR128BitRegClass) {
SDValue VecBC = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Src);
Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
DAG.getConstant(1, DL, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
DAG.getConstant(0, DL, MVT::i32));
} else {
// FIXME: Assert should be moved into expandBitCastF128ToI128Parts
assert(getRepRegClassFor(MVT::f128) == &SystemZ::FP128BitRegClass &&
"Unrecognized register class for f128.");
std::tie(Hi, Lo) = expandBitCastF128ToI128Parts(DAG, Src, DL);
}

Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi));
Results.push_back(expandBitCastF128ToI128(DAG, Src, DL));
}
break;
}
Expand Down Expand Up @@ -6829,72 +6803,118 @@ SDValue SystemZTargetLowering::combineMERGE(
return SDValue();
}

static bool isI128MovedToParts(LoadSDNode *LD, SDNode *&LoPart,
SDNode *&HiPart) {
LoPart = HiPart = nullptr;

// Scan through all users.
for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unbounded users scans are often a bad idea

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I've changed the logic to ensure the loop exits early in case of more than two users.

UI != UIEnd; ++UI) {
// Skip the uses of the chain.
if (UI.getUse().getResNo() != 0)
continue;

// Verify every user is a TRUNCATE to i64 of the low or high half.
SDNode *User = *UI;
bool IsLoPart = true;
if (User->getOpcode() == ISD::SRL &&
User->getOperand(1).getOpcode() == ISD::Constant &&
User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
User = *User->use_begin();
IsLoPart = false;
}
if (User->getOpcode() != ISD::TRUNCATE || User->getValueType(0) != MVT::i64)
return false;

if (IsLoPart) {
if (LoPart)
return false;
LoPart = User;
} else {
if (HiPart)
return false;
HiPart = User;
}
}
return true;
}

// Check whether the users of the f128 load LD only extract its two i64
// halves via EXTRACT_SUBREG machine nodes.  On success, LoPart and HiPart
// receive the extracting nodes (either may remain null if that half is
// never used).
static bool isF128MovedToParts(LoadSDNode *LD, SDNode *&LoPart,
                               SDNode *&HiPart) {
  LoPart = HiPart = nullptr;

  // Walk every use of the load.
  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
       UI != UIEnd; ++UI) {
    // Uses of the chain result are irrelevant here.
    if (UI.getUse().getResNo() != 0)
      continue;

    // Each value user must be a single-use EXTRACT_SUBREG of one half.
    SDNode *Extract = *UI;
    if (!Extract->hasOneUse() || !Extract->isMachineOpcode() ||
        Extract->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return false;

    // Classify by subregister index, rejecting duplicate extractions of
    // the same half and any unexpected index.
    uint64_t SubRegIdx = Extract->getConstantOperandVal(1);
    if (SubRegIdx == SystemZ::subreg_l64) {
      if (LoPart)
        return false;
      LoPart = Extract;
    } else if (SubRegIdx == SystemZ::subreg_h64) {
      if (HiPart)
        return false;
      HiPart = Extract;
    } else {
      return false;
    }
  }
  return true;
}

SDValue SystemZTargetLowering::combineLOAD(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
EVT LdVT = N->getValueType(0);
SDLoc DL(N);

// Replace an i128 load that is used solely to move its value into GPRs
// Replace a 128-bit load that is used solely to move its value into GPRs
// by separate loads of both halves.
if (LdVT == MVT::i128) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (!LD->isSimple() || !ISD::isNormalLoad(LD))
return SDValue();

// Scan through all users.
SmallVector<std::pair<SDNode *, int>, 2> Users;
int UsedElements = 0;
for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
UI != UIEnd; ++UI) {
// Skip the uses of the chain.
if (UI.getUse().getResNo() != 0)
continue;

// Verify every user is a TRUNCATE to i64 of the low or high half ...
SDNode *User = *UI;
int Index = 1;
if (User->getOpcode() == ISD::SRL &&
User->getOperand(1).getOpcode() == ISD::Constant &&
User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
User = *User->use_begin();
Index = 0;
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->isSimple() && ISD::isNormalLoad(LD)) {
SDNode *LoPart, *HiPart;
if ((LdVT == MVT::i128 && isI128MovedToParts(LD, LoPart, HiPart)) ||
(LdVT == MVT::f128 && isF128MovedToParts(LD, LoPart, HiPart))) {
// Rewrite each extraction as an independent load.
SmallVector<SDValue, 2> ArgChains;
if (HiPart) {
SDValue EltLoad = DAG.getLoad(
HiPart->getValueType(0), DL, LD->getChain(), LD->getBasePtr(),
LD->getPointerInfo(), LD->getOriginalAlign(),
LD->getMemOperand()->getFlags(), LD->getAAInfo());

DCI.CombineTo(HiPart, EltLoad, true);
ArgChains.push_back(EltLoad.getValue(1));
}
if (LoPart) {
SDValue EltLoad = DAG.getLoad(
LoPart->getValueType(0), DL, LD->getChain(),
DAG.getObjectPtrOffset(DL, LD->getBasePtr(), TypeSize::getFixed(8)),
LD->getPointerInfo().getWithOffset(8), LD->getOriginalAlign(),
LD->getMemOperand()->getFlags(), LD->getAAInfo());

DCI.CombineTo(LoPart, EltLoad, true);
ArgChains.push_back(EltLoad.getValue(1));
}
if (User->getOpcode() != ISD::TRUNCATE ||
User->getValueType(0) != MVT::i64)
return SDValue();

// ... and no half is extracted twice.
if (UsedElements & (1 << Index))
return SDValue();

UsedElements |= 1 << Index;
Users.push_back(std::make_pair(User, Index));
}

// Rewrite each extraction as an independent load.
SmallVector<SDValue, 2> ArgChains;
for (auto UserAndIndex : Users) {
SDNode *User = UserAndIndex.first;
unsigned Offset = User->getValueType(0).getStoreSize() * UserAndIndex.second;
SDValue Ptr =
DAG.getMemBasePlusOffset(LD->getBasePtr(), TypeSize::getFixed(Offset), DL);
SDValue EltLoad =
DAG.getLoad(User->getValueType(0), DL, LD->getChain(), Ptr,
LD->getPointerInfo().getWithOffset(Offset),
LD->getOriginalAlign(), LD->getMemOperand()->getFlags(),
LD->getAAInfo());

DCI.CombineTo(User, EltLoad, true);
ArgChains.push_back(EltLoad.getValue(1));
// Collect all chains via TokenFactor.
SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, ArgChains);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
DCI.AddToWorklist(Chain.getNode());
return SDValue(N, 0);
}

// Collect all chains via TokenFactor.
SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
ArgChains);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
DCI.AddToWorklist(Chain.getNode());
return SDValue(N, 0);
}

if (LdVT.isVector() || LdVT.isInteger())
Expand Down Expand Up @@ -6974,7 +6994,8 @@ static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) {
return true;
}

static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
static bool isI128MovedFromParts(SDValue Val, SDValue &LoPart,
SDValue &HiPart) {
if (Val.getOpcode() != ISD::OR || !Val.getNode()->hasOneUse())
return false;

Expand All @@ -7001,6 +7022,23 @@ static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
return true;
}

// Match a single-use REG_SEQUENCE that assembles an FP128 value from two
// 64-bit halves.  On success, LoPart and HiPart receive the values placed
// into the low and high subregisters, respectively.
static bool isF128MovedFromParts(SDValue Val, SDValue &LoPart,
                                 SDValue &HiPart) {
  // Only a REG_SEQUENCE machine node with exactly one user qualifies.
  if (!Val.getNode()->hasOneUse() || !Val.isMachineOpcode() ||
      Val.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
    return false;

  // Expected operand layout:
  //   0: register class, 1: lo value, 2: subreg_l64, 3: hi value, 4: subreg_h64
  if (Val->getNumOperands() != 5)
    return false;
  bool ClassOK = Val->getOperand(0)->getAsZExtVal() == SystemZ::FP128BitRegClassID;
  bool LoIdxOK = Val->getOperand(2)->getAsZExtVal() == SystemZ::subreg_l64;
  bool HiIdxOK = Val->getOperand(4)->getAsZExtVal() == SystemZ::subreg_h64;
  if (!ClassOK || !LoIdxOK || !HiIdxOK)
    return false;

  LoPart = Val->getOperand(1);
  HiPart = Val->getOperand(3);
  return true;
}

SDValue SystemZTargetLowering::combineSTORE(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
Expand Down Expand Up @@ -7070,10 +7108,11 @@ SDValue SystemZTargetLowering::combineSTORE(
Ops, MemVT, SN->getMemOperand());
}

// Transform a store of an i128 moved from GPRs into two separate stores.
if (MemVT == MVT::i128 && SN->isSimple() && ISD::isNormalStore(SN)) {
// Transform a store of a 128-bit value moved from parts into two stores.
if (SN->isSimple() && ISD::isNormalStore(SN)) {
SDValue LoPart, HiPart;
if (isMovedFromParts(Op1, LoPart, HiPart)) {
if ((MemVT == MVT::i128 && isI128MovedFromParts(Op1, LoPart, HiPart)) ||
(MemVT == MVT::f128 && isF128MovedFromParts(Op1, LoPart, HiPart))) {
SDLoc DL(SN);
SDValue Chain0 =
DAG.getStore(SN->getChain(), DL, HiPart, SN->getBasePtr(),
Expand Down
21 changes: 5 additions & 16 deletions llvm/test/CodeGen/SystemZ/atomic-load-08.ll
Original file line number Diff line number Diff line change
@@ -1,27 +1,16 @@
; Test long double atomic loads.
; Test long double atomic loads - via i128.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
; TODO: Is it worth testing softfp with vector?
; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s

; FIXME: Without vector support, v2i64 should be legal and we should
; introduce a simple bitcast, which could fold into the store use
; avoid the intermediate f registers.
define void @f1(ptr %ret, ptr %src) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; Z13-NEXT: lpq %r0, 0(%r3)
; Z13-NEXT: stg %r1, 8(%r2)
; Z13-NEXT: stg %r0, 0(%r2)
; Z13-NEXT: br %r14

; BASE: lpq %r0, 0(%r3)
; BASE-NEXT: ldgr %f0, %r0
; BASE-NEXT: ldgr %f2, %r1
; BASE-NEXT: std %f0, 0(%r2)
; BASE-NEXT: std %f2, 8(%r2)
; BASE-NEXT: br %r14
; CHECK-NEXT: lpq %r0, 0(%r3)
; CHECK-NEXT: stg %r1, 8(%r2)
; CHECK-NEXT: stg %r0, 0(%r2)
; CHECK-NEXT: br %r14

; SOFTFP-LABEL: f1:
; SOFTFP: # %bb.0:
Expand Down
Loading