[PowerPC] custom lower v1024i1 load/store #126969

RolandF77 · 2025-02-12T20:46:09Z

Support moving PPC dense math register values to and from storage with LLVM IR load/store.

llvmbot · 2025-02-12T20:46:44Z

@llvm/pr-subscribers-backend-powerpc

Author: None (RolandF77)

Changes

Support moving PPC dense math register values to and from storage with LLVM IR load/store.

Full diff: https://github.com/llvm/llvm-project/pull/126969.diff

4 Files Affected:

(modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+75-11)
(modified) llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll (+4-4)
(modified) llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll (+53-53)
(added) llvm/test/CodeGen/PowerPC/v1024ls.ll (+65)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index bdc1ac7c7da58..300fa716297bd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1363,6 +1363,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::STORE, MVT::v512i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
   }
+  if (Subtarget.isISAFuture()) {
+    setOperationAction(ISD::LOAD, MVT::v1024i1, Custom);
+    setOperationAction(ISD::STORE, MVT::v1024i1, Custom);
+    addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass);
+  }
 
   if (Subtarget.has64BitSupport())
     setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
@@ -11766,9 +11771,13 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
   SDValue BasePtr = LN->getBasePtr();
   EVT VT = Op.getValueType();
 
-  if (VT != MVT::v256i1 && VT != MVT::v512i1)
+  if (VT != MVT::v256i1 && VT != MVT::v512i1 && VT != MVT::v1024i1)
     return Op;
 
+  // Used for dense math registers.
+  assert((VT != MVT::v1024i1 || Subtarget.isISAFuture()) &&
+         "Type unsupported for this processor");
+
   // Type v256i1 is used for pairs and v512i1 is used for accumulators.
   // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
   // 2 or 4 vsx registers.
@@ -11796,9 +11805,36 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
     std::reverse(LoadChains.begin(), LoadChains.end());
   }
   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
-  SDValue Value =
-      DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
-                  dl, VT, Loads);
+  SDValue Value;
+  if (VT == MVT::v1024i1) {
+    SmallVector<SDValue, 4> Pairs;
+    SDValue Vsx0Idx = DAG.getTargetConstant(PPC::sub_vsx0, dl, MVT::i32);
+    SDValue Vsx1Idx = DAG.getTargetConstant(PPC::sub_vsx1, dl, MVT::i32);
+    SDValue VSRpRC = DAG.getTargetConstant(PPC::VSRpRCRegClassID, dl, MVT::i32);
+    NumVecs >>= 1;
+    for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+      const SDValue Ops[] = {VSRpRC, Loads[Idx * 2], Vsx0Idx,
+                             Loads[Idx * 2 + 1], Vsx1Idx};
+      Pairs.push_back(SDValue(
+          DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v256i1, Ops), 0));
+    }
+    SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1,
+                                  Pairs[0], Pairs[1]),
+               0);
+    SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
+    SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1,
+                                  Pairs[2], Pairs[3]),
+               0);
+    SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
+    SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+    const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
+    Value = SDValue(
+        DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);
+  } else {
+    Value =
+        DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
+                    dl, VT, Loads);
+  }
   SDValue RetOps[] = {Value, TF};
   return DAG.getMergeValues(RetOps, dl);
 }
@@ -11810,12 +11846,17 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
   SDValue StoreChain = SN->getChain();
   SDValue BasePtr = SN->getBasePtr();
   SDValue Value = SN->getValue();
-  SDValue Value2 = SN->getValue();
   EVT StoreVT = Value.getValueType();
+  SmallVector<SDValue, 4> ValueVec;
 
-  if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
+  if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1 &&
+      StoreVT != MVT::v1024i1)
     return Op;
 
+  // Used for dense math registers.
+  assert((StoreVT != MVT::v1024i1 || Subtarget.isISAFuture()) &&
+         "Type unsupported for this processor");
+
   // Type v256i1 is used for pairs and v512i1 is used for accumulators.
   // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
   // underlying registers individually.
@@ -11832,20 +11873,43 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
       MachineSDNode *ExtNode = DAG.getMachineNode(
           PPC::DMXXEXTFDMR512, dl, ReturnTypes, Op.getOperand(1));
 
-      Value = SDValue(ExtNode, 0);
-      Value2 = SDValue(ExtNode, 1);
+      ValueVec.push_back(SDValue(ExtNode, 0));
+      ValueVec.push_back(SDValue(ExtNode, 1));
     } else
       Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
     NumVecs = 4;
+
+  } else if (StoreVT == MVT::v1024i1) {
+    SDValue Lo(DAG.getMachineNode(
+                   TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
+                   Op.getOperand(1),
+                   DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32)),
+               0);
+    SDValue Hi(DAG.getMachineNode(
+                   TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
+                   Op.getOperand(1),
+                   DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32)),
+               0);
+    EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
+    MachineSDNode *ExtNode =
+        DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, Lo);
+    ValueVec.push_back(SDValue(ExtNode, 0));
+    ValueVec.push_back(SDValue(ExtNode, 1));
+    ExtNode = DAG.getMachineNode(PPC::DMXXEXTFDMR512_HI, dl, ReturnTypes, Hi);
+    ValueVec.push_back(SDValue(ExtNode, 0));
+    ValueVec.push_back(SDValue(ExtNode, 1));
+    NumVecs = 8;
   }
   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
     unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
     SDValue Elt;
     if (Subtarget.isISAFuture()) {
       VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2);
-      Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
-                        Idx > 1 ? Value2 : Value,
-                        DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
+      unsigned Pairx =
+          Subtarget.isLittleEndian() ? (NumVecs - Idx - 1) / 2 : Idx / 2;
+      Elt = DAG.getNode(
+          PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, ValueVec[Pairx],
+          DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
     } else
       Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
                         DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
index 5ca8c7b02cab4..c8ead89f96d66 100644
--- a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
@@ -46,10 +46,10 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
 ; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp34, vsp36, 0
 ; CHECK-NEXT:    xvf16ger2pp wacc0, v28, v30
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r30)
-; CHECK-NEXT:    stxv v5, 32(r30)
-; CHECK-NEXT:    stxv v2, 16(r30)
-; CHECK-NEXT:    stxv v3, 0(r30)
+; CHECK-NEXT:    stxv v2, 48(r30)
+; CHECK-NEXT:    stxv v3, 32(r30)
+; CHECK-NEXT:    stxv v4, 16(r30)
+; CHECK-NEXT:    stxv v5, 0(r30)
 ; CHECK-NEXT:    lxv v31, 144(r1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv v30, 128(r1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv v29, 112(r1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll
index 158ec7a3427c8..b3e4392b8d0e3 100644
--- a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll
@@ -31,10 +31,10 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) {
 ; CHECK-NEXT:    vmr v3, v2
 ; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp34, vsp34, 0
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r3)
-; CHECK-NEXT:    stxv v5, 32(r3)
-; CHECK-NEXT:    stxv v2, 16(r3)
-; CHECK-NEXT:    stxv v3, 0(r3)
+; CHECK-NEXT:    stxv v2, 48(r3)
+; CHECK-NEXT:    stxv v3, 32(r3)
+; CHECK-NEXT:    stxv v4, 16(r3)
+; CHECK-NEXT:    stxv v5, 0(r3)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: ass_acc:
@@ -55,7 +55,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) {
 ; CHECK-O0-NEXT:    vmr v3, v4
 ; CHECK-O0-NEXT:    vmr v2, v4
 ; CHECK-O0-NEXT:    dmxxinstfdmr512 wacc0, vsp34, vsp34, 0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    stxv vs0, 48(r3)
 ; CHECK-O0-NEXT:    xxlor vs0, v5, v5
@@ -121,10 +121,10 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-NEXT:    lxv v4, 48(r3)
 ; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r7)
-; CHECK-NEXT:    stxv v5, 32(r7)
-; CHECK-NEXT:    stxv v2, 16(r7)
-; CHECK-NEXT:    stxv v3, 0(r7)
+; CHECK-NEXT:    stxv v2, 48(r7)
+; CHECK-NEXT:    stxv v3, 32(r7)
+; CHECK-NEXT:    stxv v4, 16(r7)
+; CHECK-NEXT:    stxv v5, 0(r7)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: ld_st_xxmtacc:
@@ -154,7 +154,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-O0-NEXT:    lxv vs0, 48(r3)
 ; CHECK-O0-NEXT:    xxlor v2, vs0, vs0
 ; CHECK-O0-NEXT:    dmxxinstfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    stxv vs0, 48(r7)
 ; CHECK-O0-NEXT:    xxlor vs0, v5, v5
@@ -236,10 +236,10 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp32, vsp36, 0
 ; CHECK-NEXT:    xvi4ger8pp wacc0, v2, v2
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r7)
-; CHECK-NEXT:    stxv v5, 32(r7)
-; CHECK-NEXT:    stxv v2, 16(r7)
-; CHECK-NEXT:    stxv v3, 0(r7)
+; CHECK-NEXT:    stxv v2, 48(r7)
+; CHECK-NEXT:    stxv v3, 32(r7)
+; CHECK-NEXT:    stxv v4, 16(r7)
+; CHECK-NEXT:    stxv v5, 0(r7)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: ld_op_st_xxmtacc:
@@ -271,7 +271,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-O0-NEXT:    xxlor v4, vs0, vs0
 ; CHECK-O0-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp32, 0
 ; CHECK-O0-NEXT:    xvi4ger8pp wacc0, v2, v2
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    stxv vs0, 48(r7)
 ; CHECK-O0-NEXT:    xxlor vs0, v5, v5
@@ -356,14 +356,14 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-NEXT:    lxv v4, 48(r3)
 ; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r3)
-; CHECK-NEXT:    stxv v5, 32(r3)
-; CHECK-NEXT:    stxv v2, 16(r3)
-; CHECK-NEXT:    stxv v3, 0(r3)
-; CHECK-NEXT:    stxv v4, 48(r7)
-; CHECK-NEXT:    stxv v5, 32(r7)
-; CHECK-NEXT:    stxv v2, 16(r7)
-; CHECK-NEXT:    stxv v3, 0(r7)
+; CHECK-NEXT:    stxv v2, 48(r3)
+; CHECK-NEXT:    stxv v3, 32(r3)
+; CHECK-NEXT:    stxv v4, 16(r3)
+; CHECK-NEXT:    stxv v5, 0(r3)
+; CHECK-NEXT:    stxv v2, 48(r7)
+; CHECK-NEXT:    stxv v3, 32(r7)
+; CHECK-NEXT:    stxv v4, 16(r7)
+; CHECK-NEXT:    stxv v5, 0(r7)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: ld_st_xxmfacc:
@@ -397,7 +397,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-O0-NEXT:    lxv vs0, 48(r3)
 ; CHECK-O0-NEXT:    xxlor v2, vs0, vs0
 ; CHECK-O0-NEXT:    dmxxinstfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs3, v4, v4
 ; CHECK-O0-NEXT:    stxv vs3, 48(r3)
 ; CHECK-O0-NEXT:    xxlor vs2, v5, v5
@@ -496,10 +496,10 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp32, vsp36, 0
 ; CHECK-NEXT:    xvi4ger8pp wacc0, v2, v2
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r7)
-; CHECK-NEXT:    stxv v5, 32(r7)
-; CHECK-NEXT:    stxv v2, 16(r7)
-; CHECK-NEXT:    stxv v3, 0(r7)
+; CHECK-NEXT:    stxv v2, 48(r7)
+; CHECK-NEXT:    stxv v3, 32(r7)
+; CHECK-NEXT:    stxv v4, 16(r7)
+; CHECK-NEXT:    stxv v5, 0(r7)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: ld_op_st_xxmfacc:
@@ -531,7 +531,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
 ; CHECK-O0-NEXT:    xxlor v4, vs0, vs0
 ; CHECK-O0-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp32, 0
 ; CHECK-O0-NEXT:    xvi4ger8pp wacc0, v2, v2
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    stxv vs0, 48(r7)
 ; CHECK-O0-NEXT:    xxlor vs0, v5, v5
@@ -621,10 +621,10 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2)
 ; CHECK-NEXT:    xvf64gerpp wacc0, vsp34, v5
 ; CHECK-NEXT:    xvf64gerpp wacc0, vsp36, v4
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r3)
-; CHECK-NEXT:    stxv v5, 32(r3)
-; CHECK-NEXT:    stxv v2, 16(r3)
-; CHECK-NEXT:    stxv v3, 0(r3)
+; CHECK-NEXT:    stxv v2, 48(r3)
+; CHECK-NEXT:    stxv v3, 32(r3)
+; CHECK-NEXT:    stxv v4, 16(r3)
+; CHECK-NEXT:    stxv v5, 0(r3)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: cmplx_xxmacc:
@@ -673,7 +673,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2)
 ; CHECK-O0-NEXT:    xvf64gerpp wacc0, vsp32, vs0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    xvf64gerpp wacc0, vsp34, vs0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    stxv vs0, 48(r3)
 ; CHECK-O0-NEXT:    xxlor vs0, v5, v5
@@ -783,10 +783,10 @@ define void @int_xxsetaccz(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xxsetaccz wacc0
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r3)
-; CHECK-NEXT:    stxv v5, 32(r3)
-; CHECK-NEXT:    stxv v2, 16(r3)
-; CHECK-NEXT:    stxv v3, 0(r3)
+; CHECK-NEXT:    stxv v2, 48(r3)
+; CHECK-NEXT:    stxv v3, 32(r3)
+; CHECK-NEXT:    stxv v4, 16(r3)
+; CHECK-NEXT:    stxv v5, 0(r3)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: int_xxsetaccz:
@@ -802,7 +802,7 @@ define void @int_xxsetaccz(ptr %ptr) {
 ; CHECK-O0-LABEL: int_xxsetaccz:
 ; CHECK-O0:       # %bb.0: # %entry
 ; CHECK-O0-NEXT:    xxsetaccz wacc0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    stxv vs0, 48(r3)
 ; CHECK-O0-NEXT:    xxlor vs0, v5, v5
@@ -946,14 +946,14 @@ define void @testcse(ptr %res, <16 x i8> %vc) {
 ; CHECK-NEXT:    xxsetaccz wacc0
 ; CHECK-NEXT:    xvf32gerpp wacc0, v2, v2
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r3)
-; CHECK-NEXT:    stxv v5, 32(r3)
-; CHECK-NEXT:    stxv v2, 16(r3)
-; CHECK-NEXT:    stxv v3, 0(r3)
-; CHECK-NEXT:    stxv v4, 112(r3)
-; CHECK-NEXT:    stxv v5, 96(r3)
-; CHECK-NEXT:    stxv v2, 80(r3)
-; CHECK-NEXT:    stxv v3, 64(r3)
+; CHECK-NEXT:    stxv v2, 48(r3)
+; CHECK-NEXT:    stxv v3, 32(r3)
+; CHECK-NEXT:    stxv v4, 16(r3)
+; CHECK-NEXT:    stxv v5, 0(r3)
+; CHECK-NEXT:    stxv v2, 112(r3)
+; CHECK-NEXT:    stxv v3, 96(r3)
+; CHECK-NEXT:    stxv v4, 80(r3)
+; CHECK-NEXT:    stxv v5, 64(r3)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: testcse:
@@ -975,7 +975,7 @@ define void @testcse(ptr %res, <16 x i8> %vc) {
 ; CHECK-O0:       # %bb.0: # %entry
 ; CHECK-O0-NEXT:    xxsetaccz wacc0
 ; CHECK-O0-NEXT:    xvf32gerpp wacc0, v2, v2
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs3, v4, v4
 ; CHECK-O0-NEXT:    stxv vs3, 48(r3)
 ; CHECK-O0-NEXT:    xxlor vs2, v5, v5
@@ -1065,10 +1065,10 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p
 ; CHECK-NEXT:    plxvp vsp36, 8(r4), 0
 ; CHECK-NEXT:    pmxvf64gernn wacc0, vsp36, v2, 0, 0
 ; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
-; CHECK-NEXT:    stxv v4, 48(r7)
-; CHECK-NEXT:    stxv v5, 32(r7)
-; CHECK-NEXT:    stxv v2, 16(r7)
-; CHECK-NEXT:    stxv v3, 0(r7)
+; CHECK-NEXT:    stxv v2, 48(r7)
+; CHECK-NEXT:    stxv v3, 32(r7)
+; CHECK-NEXT:    stxv v4, 16(r7)
+; CHECK-NEXT:    stxv v5, 0(r7)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test_ldst_1:
@@ -1104,7 +1104,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p
 ; CHECK-O0-NEXT:    plxvp vsp34, 8(r4), 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    pmxvf64gernn wacc0, vsp34, vs0, 0, 0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    stxv vs0, 48(r7)
 ; CHECK-O0-NEXT:    xxlor vs0, v5, v5
diff --git a/llvm/test/CodeGen/PowerPC/v1024ls.ll b/llvm/test/CodeGen/PowerPC/v1024ls.ll
new file mode 100644
index 0000000000000..97668009cb0d7
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/v1024ls.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -disable-auto-paired-vec-st=false \
+; RUN:   -mcpu=future -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -disable-auto-paired-vec-st=false \
+; RUN:   -mcpu=future -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+
+define void @v1024ls(ptr nocapture readonly %vqp, ptr nocapture %resp)  {
+; CHECK-LABEL: v1024ls:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv v3, 0(r3)
+; CHECK-NEXT:    lxv v5, 32(r3)
+; CHECK-NEXT:    lxv v2, 16(r3)
+; CHECK-NEXT:    lxv v4, 48(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxv v3, 64(r3)
+; CHECK-NEXT:    lxv v5, 96(r3)
+; CHECK-NEXT:    lxv v2, 80(r3)
+; CHECK-NEXT:    lxv v4, 112(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT:    stxv v2, 112(r4)
+; CHECK-NEXT:    stxv v3, 96(r4)
+; CHECK-NEXT:    stxv v4, 80(r4)
+; CHECK-NEXT:    stxv v5, 64(r4)
+; CHECK-NEXT:    dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT:    stxv v2, 48(r4)
+; CHECK-NEXT:    stxv v3, 32(r4)
+; CHECK-NEXT:    stxv v4, 16(r4)
+; CHECK-NEXT:    stxv v5, 0(r4)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: v1024ls:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v3, 112(r3)
+; CHECK-BE-NEXT:    lxv v5, 80(r3)
+; CHECK-BE-NEXT:    lxv v2, 96(r3)
+; CHECK-BE-NEXT:    lxv v4, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxv v3, 48(r3)
+; CHECK-BE-NEXT:    lxv v5, 16(r3)
+; CHECK-BE-NEXT:    lxv v2, 32(r3)
+; CHECK-BE-NEXT:    lxv v4, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT:    stxv v5, 112(r4)
+; CHECK-BE-NEXT:    stxv v4, 96(r4)
+; CHECK-BE-NEXT:    stxv v3, 80(r4)
+; CHECK-BE-NEXT:    stxv v2, 64(r4)
+; CHECK-BE-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT:    stxv v5, 48(r4)
+; CHECK-BE-NEXT:    stxv v4, 32(r4)
+; CHECK-BE-NEXT:    stxv v3, 16(r4)
+; CHECK-BE-NEXT:    stxv v2, 0(r4)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = load <1024 x i1>, ptr %vqp, align 64
+  store <1024 x i1> %0, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()

lei137 · 2025-02-14T15:43:20Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

+  if (Subtarget.isISAFuture()) {
+    setOperationAction(ISD::LOAD, MVT::v1024i1, Custom);
+    setOperationAction(ISD::STORE, MVT::v1024i1, Custom);
+    addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass);
+  }


Should this be within the if (Subtarget.hasMMA()) block on line 1357? I am basing this on the fact that we need hasMMA() for this support.

According to PPC.cpp future CPU should include all of the features of Power11, and Power11 includes all the same features as Power10 where mma feature is set to true for Power10, so I think isISAFuture implies hasMMA.

All the checks of isISAFuture should, I think, be interpreted as placeholders. At some point a concrete target processor needs to be added, and at that point all the checks for future should be updated to check specific features rather than just future - "future + 1" would also have those features, but a check for future would fail.
Also, for that reason, I think the single condition is more correct, since we should be asking about one feature, not about a feature plus a CPU name.

Yes, Power11 implies hasMMA. I was thinking more of cases where users manually turn features on or off for a specific CPU. This patch uses lxvp/stxvp, so if a user explicitly turns off mma or paired-vector-memops, we shouldn't be generating this code.

I see that you have the guards in the custom lowering functions though.

Yes, the checks are inconsistent. I could make them consistent. The final switching code is TBD because the target features are TBD.

For paired support not being there, I think hitting the assert is better. It would be a case of manually forcing it off since ISA level supports it.

lei137 · 2025-02-14T15:58:49Z

llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll

@@ -1104,7 +1104,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p
 ; CHECK-O0-NEXT:    plxvp vsp34, 8(r4), 0
 ; CHECK-O0-NEXT:    xxlor vs0, v4, v4
 ; CHECK-O0-NEXT:    pmxvf64gernn wacc0, vsp34, vs0, 0, 0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0


I'm a bit concerned that even though the extracted values are placed into different vsp registers, there are no changes in where they are stored. In the changes above, there are equivalent updates to the store instructions when the extracted values are placed into different register pairs.

The changes to the 512 bit path have been removed.

lei137 · 2025-02-14T21:15:32Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

+  // Used for dense math registers.
+  assert((VT != MVT::v1024i1 || Subtarget.isISAFuture()) &&
+         "Type unsupported for this processor");
+
  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
  // 2 or 4 vsx registers.


Maybe this comment needs to be updated, since the code now also generates loads for v1024i1 types?

Code removed.

lei137 · 2025-02-14T21:43:28Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

+    SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+    const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
+    Value = SDValue(
+        DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);


Any specific reason why we want to generate the sequence here instead of in the td file similar to PPCISD::ACC_BUILD and PPCISD::PAIR_BUILD?

I was wondering if the sequence is too complicated to be written in the .td file.

For one, it was suggested to us that ACC_BUILD and PAIR_BUILD should be removed. Also, each of those opcodes has only a single use, so they add complexity without providing much value. And yes, the sequence is complicated, and there is precedent: 512-bit dense math is already done in lowering.

maryammo · 2025-02-20T16:17:52Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

+    SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1,
+                                  Pairs[0], Pairs[1]),
+               0);
+    SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);


nit: LoSub/HiSub are target constants, similar to Vsx0Idx/Vsx1Idx; would it make more sense to rename them to LoIdx/HiIdx, just to be consistent?

The vsx stuff was removed when the 1024 bit code was split off so there is no conflict in naming now. Sub to me seems the more accurate name.

maryammo · 2025-02-20T16:20:18Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

+  if (Subtarget.isISAFuture()) {
+    setOperationAction(ISD::LOAD, MVT::v1024i1, Custom);
+    setOperationAction(ISD::STORE, MVT::v1024i1, Custom);
+    addRegisterClass(MVT::v1024i1, &PPC::DMRRCRegClass);
+  }


According to PPC.cpp future CPU should include all of the features of Power11, and Power11 includes all the same features as Power10 where mma feature is set to true for Power10, so I think isISAFuture implies hasMMA.

maryammo · 2025-02-20T16:30:33Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

+    SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+    const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
+    Value = SDValue(
+        DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);


I was wondering if the sequence is too complicated to be written in the .td file.

maryammo · 2025-02-20T19:40:31Z

llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll

@@ -55,7 +55,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) {
 ; CHECK-O0-NEXT:    vmr v3, v4
 ; CHECK-O0-NEXT:    vmr v2, v4
 ; CHECK-O0-NEXT:    dmxxinstfdmr512 wacc0, vsp34, vsp34, 0
-; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-O0-NEXT:    dmxxextfdmr512 wacc0, vsp36, vsp34, 0


I am not sure how the extracted values can be placed into the vector pairs in a different order while the subsequent uses of those VRs stay the same.

The 512 bit changes are removed.

maryammo · 2025-02-20T21:32:00Z

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

-                        Idx > 1 ? Value2 : Value,
-                        DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));
+      unsigned Pairx =
+          Subtarget.isLittleEndian() ? (NumVecs - Idx - 1) / 2 : Idx / 2;


It seems that before this PR, when extracting Elt for v512i1, the choice between Value and Value2 did not depend on endianness; it was:

Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Idx > 1 ? Value2 : Value, DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout())));

Now, however, it does.

Code removed.

maryammo

LGTM.

lei137

LGTM thx

Support moving PPC dense math register values to and from storage with LLVM IR load/store.

RolandF77 self-assigned this Feb 12, 2025

llvmbot added the backend:PowerPC label Feb 12, 2025

RolandF77 requested review from maryammo, mandlebug and lei137 February 12, 2025 20:46

custom lower v1024i1 load/store

b3b8416

lei137 reviewed Feb 14, 2025

View reviewed changes

maryammo reviewed Feb 20, 2025

View reviewed changes

separate 1024 code

f4ca19e

maryammo approved these changes Feb 22, 2025

View reviewed changes

update check

f927aba

lei137 approved these changes Feb 27, 2025

View reviewed changes

RolandF77 merged commit a73e591 into llvm:main Feb 28, 2025
11 checks passed

cheezeburglar pushed a commit to cheezeburglar/llvm-project that referenced this pull request Feb 28, 2025

[PowerPC] custom lower v1024i1 load/store (llvm#126969)

6c4f9a6

Support moving PPC dense math register values to and from storage with LLVM IR load/store.

[PowerPC] custom lower v1024i1 load/store #126969

[PowerPC] custom lower v1024i1 load/store #126969

Uh oh!

Conversation

RolandF77 commented Feb 12, 2025

Uh oh!

llvmbot commented Feb 12, 2025

Uh oh!

lei137 Feb 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

lei137 Feb 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

maryammo left a comment

Choose a reason for hiding this comment

Uh oh!

lei137 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

lei137 Feb 14, 2025 •

edited

Loading

lei137 Feb 14, 2025 •

edited

Loading