Skip to content

Commit a73e591

Browse files
authored
[PowerPC] custom lower v1024i1 load/store (#126969)
Support moving PPC dense math register values to and from storage with LLVM IR load/store.
1 parent 4a477ee commit a73e591

File tree

3 files changed

+182
-2
lines changed

3 files changed

+182
-2
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 133 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1355,10 +1355,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
13551355
setOperationAction(ISD::STORE, MVT::v256i1, Custom);
13561356
}
13571357
if (Subtarget.hasMMA()) {
1358-
if (Subtarget.isISAFuture())
1358+
if (Subtarget.isISAFuture()) {
13591359
addRegisterClass(MVT::v512i1, &PPC::WACCRCRegClass);
1360-
else
1360+
addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass);
1361+
setOperationAction(ISD::LOAD, MVT::v1024i1, Custom);
1362+
setOperationAction(ISD::STORE, MVT::v1024i1, Custom);
1363+
} else {
13611364
addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
1365+
}
13621366
setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
13631367
setOperationAction(ISD::STORE, MVT::v512i1, Custom);
13641368
setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
@@ -11758,6 +11762,64 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
1175811762
return Op;
1175911763
}
1176011764

11765+
/// Custom-lower a load of a v1024i1 value into four 256-bit paired-vector
/// loads whose results are assembled into a single v1024i1 value.
///
/// \param Op  The load being lowered; its node is cast to LoadSDNode and its
///            result type must be MVT::v1024i1.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns A merge-values node carrying {assembled v1024i1 value, TokenFactor
///          of the chains produced by the four partial loads}.
SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
11766+
SelectionDAG &DAG) const {
11767+
SDLoc dl(Op);
11768+
LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
11769+
SDValue LoadChain = LN->getChain();
11770+
SDValue BasePtr = LN->getBasePtr();
11771+
EVT VT = Op.getValueType();
11772+
11773+
// Type v1024i1 is used for Dense Math dmr registers.
11774+
assert(VT == MVT::v1024i1 && "Unsupported type.");
11775+
assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
11776+
"Dense Math support required.");
11777+
assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
11778+
11779+
// Loads collects the v256i1 results and LoadChains the matching output
// chains, one entry per partial load.
SmallVector<SDValue, 4> Loads;
11780+
SmallVector<SDValue, 4> LoadChains;
11781+
SDValue IntrinID = DAG.getConstant(Intrinsic::ppc_vsx_lxvp, dl, MVT::i32);
11782+
// Operand list reused for every partial load. Only the pointer slot
// (index 2) is rewritten inside the loop; the chain slot keeps the original
// load's input chain, so the partial loads are not chained to one another.
SDValue LoadOps[] = {LoadChain, IntrinID, BasePtr};
11783+
MachineMemOperand *MMO = LN->getMemOperand();
11784+
// 1024 / 256 == 4 for the only type accepted by the assert above.
unsigned NumVecs = VT.getSizeInBits() / 256;
11785+
for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
11786+
// Derive a memory operand for this piece from the original one, using
// offset Idx * 32 and size 32.
MachineMemOperand *NewMMO =
11787+
DAG.getMachineFunction().getMachineMemOperand(MMO, Idx * 32, 32);
11788+
if (Idx > 0) {
11789+
// BasePtr is advanced cumulatively, so successive pieces are addressed at
// +32, +64 and +96 from the original base pointer.
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
11790+
DAG.getConstant(32, dl, BasePtr.getValueType()));
11791+
LoadOps[2] = BasePtr;
11792+
}
11793+
// Emit the ppc_vsx_lxvp intrinsic as a chained memory-intrinsic node that
// yields a v256i1 value (result 0) and an output chain (result 1).
SDValue Ld = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
11794+
DAG.getVTList(MVT::v256i1, MVT::Other),
11795+
LoadOps, MVT::v256i1, NewMMO);
11796+
LoadChains.push_back(Ld.getValue(1));
11797+
Loads.push_back(Ld);
11798+
}
11799+
11800+
// On little-endian subtargets the pieces are reversed, so Loads[0] is the
// piece loaded from the highest offset and Loads[3] the one from offset 0.
if (Subtarget.isLittleEndian()) {
11801+
std::reverse(Loads.begin(), Loads.end());
11802+
std::reverse(LoadChains.begin(), LoadChains.end());
11803+
}
11804+
11805+
// Join the output chains of all partial loads into one chain result.
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
11806+
// Combine Loads[0] and Loads[1] into a v512i1 value with DMXXINSTFDMR512;
// this value is placed in the sub_wacc_lo slot of the REG_SEQUENCE below.
SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1, Loads[0],
11807+
Loads[1]),
11808+
0);
11809+
SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
11810+
// Combine Loads[2] and Loads[3] with DMXXINSTFDMR512_HI for the sub_wacc_hi
// slot.
SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1,
11811+
Loads[2], Loads[3]),
11812+
0);
11813+
SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
11814+
SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
11815+
// REG_SEQUENCE operands: register class ID, then (value, subregister index)
// pairs for the two 512-bit halves.
const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
11816+
SDValue Value =
11817+
SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);
11818+
11819+
// Return both the assembled value and the merged chain.
SDValue RetOps[] = {Value, TF};
11820+
return DAG.getMergeValues(RetOps, dl);
11821+
}
11822+
1176111823
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
1176211824
SelectionDAG &DAG) const {
1176311825
SDLoc dl(Op);
@@ -11766,6 +11828,9 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
1176611828
SDValue BasePtr = LN->getBasePtr();
1176711829
EVT VT = Op.getValueType();
1176811830

11831+
if (VT == MVT::v1024i1)
11832+
return LowerDMFVectorLoad(Op, DAG);
11833+
1176911834
if (VT != MVT::v256i1 && VT != MVT::v512i1)
1177011835
return Op;
1177111836

@@ -11803,6 +11868,69 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
1180311868
return DAG.getMergeValues(RetOps, dl);
1180411869
}
1180511870

11871+
/// Custom-lower a store of a v1024i1 value by splitting the value into four
/// v256i1 pieces and storing each with a 256-bit paired-vector store.
///
/// \param Op  The store being lowered; its node is cast to StoreSDNode and the
///            stored value's type must be MVT::v1024i1.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns A TokenFactor joining the four partial store nodes.
SDValue PPCTargetLowering::LowerDMFVectorStore(SDValue Op,
11872+
SelectionDAG &DAG) const {
11873+
11874+
SDLoc dl(Op);
11875+
StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
11876+
SDValue StoreChain = SN->getChain();
11877+
SDValue BasePtr = SN->getBasePtr();
11878+
// Values collects the v256i1 pieces to store; Stores collects the emitted
// store nodes.
SmallVector<SDValue, 4> Values;
11879+
SmallVector<SDValue, 4> Stores;
11880+
EVT VT = SN->getValue().getValueType();
11881+
11882+
// Type v1024i1 is used for Dense Math dmr registers.
11883+
assert(VT == MVT::v1024i1 && "Unsupported type.");
11884+
assert((Subtarget.hasMMA() && Subtarget.isISAFuture()) &&
11885+
"Dense Math support required.");
11886+
assert(Subtarget.pairedVectorMemops() && "Vector pair support required.");
11887+
11888+
// Extract the sub_wacc_lo v512i1 half from operand 1 of the store node.
// NOTE(review): operand 1 is presumably the same value as SN->getValue();
// confirm against the StoreSDNode definition.
SDValue Lo(
11889+
DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
11890+
Op.getOperand(1),
11891+
DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32)),
11892+
0);
11893+
// Extract the sub_wacc_hi v512i1 half from the same operand.
SDValue Hi(
11894+
DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
11895+
Op.getOperand(1),
11896+
DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32)),
11897+
0);
11898+
// Each extract node below produces two v256i1 results.
EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
11899+
MachineSDNode *ExtNode =
11900+
DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, Lo);
11901+
Values.push_back(SDValue(ExtNode, 0));
11902+
Values.push_back(SDValue(ExtNode, 1));
11903+
ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, ReturnTypes, Hi);
11904+
Values.push_back(SDValue(ExtNode, 0));
11905+
Values.push_back(SDValue(ExtNode, 1));
11906+
11907+
// Values is now {Lo result 0, Lo result 1, Hi result 0, Hi result 1}; on
// little-endian subtargets that order is reversed before storing.
if (Subtarget.isLittleEndian())
11908+
std::reverse(Values.begin(), Values.end());
11909+
11910+
// The store intrinsic nodes produce only a chain result.
SDVTList Tys = DAG.getVTList(MVT::Other);
11911+
// Operand list reused for every partial store: {chain, intrinsic ID, value,
// pointer}. The value slot (index 2) is overwritten on every iteration, so
// Values[0] here only serves as the initial filler; the pointer slot
// (index 3) is updated for Idx > 0. The chain slot keeps the original
// store's input chain, so the partial stores are not chained to one another.
SmallVector<SDValue, 4> Ops{
11912+
StoreChain, DAG.getConstant(Intrinsic::ppc_vsx_stxvp, dl, MVT::i32),
11913+
Values[0], BasePtr};
11914+
MachineMemOperand *MMO = SN->getMemOperand();
11915+
// 1024 / 256 == 4 for the only type accepted by the assert above.
unsigned NumVecs = VT.getSizeInBits() / 256;
11916+
for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
11917+
// Derive a memory operand for this piece from the original one, using
// offset Idx * 32 and size 32.
MachineMemOperand *NewMMO =
11918+
DAG.getMachineFunction().getMachineMemOperand(MMO, Idx * 32, 32);
11919+
if (Idx > 0) {
11920+
// BasePtr is advanced cumulatively, so successive pieces are addressed at
// +32, +64 and +96 from the original base pointer.
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
11921+
DAG.getConstant(32, dl, BasePtr.getValueType()));
11922+
Ops[3] = BasePtr;
11923+
}
11924+
Ops[2] = Values[Idx];
11925+
// Emit the ppc_vsx_stxvp intrinsic as a void memory-intrinsic node for
// this piece.
SDValue St = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops,
11926+
MVT::v256i1, NewMMO);
11927+
Stores.push_back(St);
11928+
}
11929+
11930+
// Join all partial stores into the single chain returned to the caller.
SDValue TF = DAG.getTokenFactor(dl, Stores);
11931+
return TF;
11932+
}
11933+
1180611934
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
1180711935
SelectionDAG &DAG) const {
1180811936
SDLoc dl(Op);
@@ -11813,6 +11941,9 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
1181311941
SDValue Value2 = SN->getValue();
1181411942
EVT StoreVT = Value.getValueType();
1181511943

11944+
if (StoreVT == MVT::v1024i1)
11945+
return LowerDMFVectorStore(Op, DAG);
11946+
1181611947
if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
1181711948
return Op;
1181811949

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,6 +1344,8 @@ namespace llvm {
13441344

13451345
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
13461346
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
1347+
SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
1348+
SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;
13471349

13481350
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
13491351
CallingConv::ID CallConv, bool isVarArg,

llvm/test/CodeGen/PowerPC/v1024ls.ll

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
3+
; RUN: -mcpu=future -ppc-asm-full-reg-names \
4+
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
5+
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
6+
; RUN: -mcpu=future -ppc-asm-full-reg-names \
7+
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
8+
9+
; Loads a <1024 x i1> value from %vqp and stores it unchanged to %resp, so the
; checks below cover both the load and the store lowering of that type.
define void @v1024ls(ptr nocapture readonly %vqp, ptr nocapture %resp) {
10+
; CHECK-LABEL: v1024ls:
11+
; CHECK: # %bb.0: # %entry
12+
; CHECK-NEXT: lxvp vsp34, 0(r3)
13+
; CHECK-NEXT: lxvp vsp36, 32(r3)
14+
; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
15+
; CHECK-NEXT: lxvp vsp34, 64(r3)
16+
; CHECK-NEXT: lxvp vsp36, 96(r3)
17+
; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
18+
; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
19+
; CHECK-NEXT: stxvp vsp34, 96(r4)
20+
; CHECK-NEXT: stxvp vsp36, 64(r4)
21+
; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
22+
; CHECK-NEXT: stxvp vsp34, 32(r4)
23+
; CHECK-NEXT: stxvp vsp36, 0(r4)
24+
; CHECK-NEXT: blr
25+
;
26+
; CHECK-BE-LABEL: v1024ls:
27+
; CHECK-BE: # %bb.0: # %entry
28+
; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
29+
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
30+
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
31+
; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
32+
; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
33+
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
34+
; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
35+
; CHECK-BE-NEXT: stxvp vsp36, 96(r4)
36+
; CHECK-BE-NEXT: stxvp vsp34, 64(r4)
37+
; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
38+
; CHECK-BE-NEXT: stxvp vsp36, 32(r4)
39+
; CHECK-BE-NEXT: stxvp vsp34, 0(r4)
40+
; CHECK-BE-NEXT: blr
41+
entry:
42+
%0 = load <1024 x i1>, ptr %vqp, align 64
43+
store <1024 x i1> %0, ptr %resp, align 64
44+
ret void
45+
}
46+
47+
declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()

0 commit comments

Comments
 (0)