[AMDGPU] IGLP: Fixes for VMEM load detection and unsigned int handling #135090

Open: wants to merge 2 commits into base: main
29 changes: 18 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -80,6 +80,10 @@ enum class SchedGroupMask {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
Contributor:

I don't understand this condition. isVMEM should cover it. There should also be no instructions that are both FLAT and DS.

Contributor (Author):

I just copied the condition from the original code into a separate little utility function so that I can reuse it in the places where the condition had not been updated manually. The TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)) condition was introduced in https://reviews.llvm.org/D128158, but it was not applied consistently afterwards. Now it is used consistently.
But I can check what happens if it is not used at all...

Contributor (Author):

If I only use TII->isVMEM(MI) in all places I currently use handleAsVMEMInstr, there are some failing tests:

  LLVM :: CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
  LLVM :: CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir
  LLVM :: CodeGen/AMDGPU/sched-barrier-pre-RA.mir
  LLVM :: CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
  LLVM :: CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir

The failures involve, for example, GLOBAL_STORE_DWORDX4_SADDR %14:vgpr_32, %135:vreg_128_align2, %143.sub2_sub3:sgpr_128, 0, 0, implicit $exec :: (store (s128) into %ir.gep2, align 128, addrspace 1), which is FLAT but not DS.

Contributor:

Yes, it is a FLAT-encoded instruction and not a DS-encoded instruction; both of those checks are trivially true. But it should be covered by isVMEM.

Contributor (Author):

Apparently not:

(gdb) p MI.dump()
  GLOBAL_STORE_DWORDX4_SADDR %14:vgpr_32, %135:vreg_128_align2, %143.sub2_sub3:sgpr_128, 0, 0, implicit $exec :: (store (s128) into %ir.gep2, align 128, addrspace 1)
$1 = void
(gdb) p TII->isVMEM(MI)
$2 = false
(gdb) p TII->isFLAT(MI)
$3 = true
(gdb) p TII->isDS(MI)
$4 = false
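
(For context, here is a rough paraphrase of the relevant SIInstrInfo predicates, not the verbatim LLVM source; the exact definitions may differ, but this is approximately why the instruction slips through:)

  // Sketch only: isVMEM has historically checked the buffer/image encodings,
  // so a FLAT-encoded global access is not classified as VMEM by it.
  bool SIInstrInfo::isVMEM(const MachineInstr &MI) const {
    return isMUBUF(MI) || isMTBUF(MI) || isImage(MI); // FLAT is not included
  }
  // GLOBAL_STORE_DWORDX4_SADDR uses the FLAT (global) encoding, so
  // isFLAT(MI) is true, isDS(MI) is false, and isVMEM(MI) is false.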

Contributor (Author):

(let me look into why this isn't the case although it should be)

Contributor (Author):

Opened #137148

Contributor (Author):

(Created #137148 and #137170)

}

class SchedGroup;

// InstructionRule class is used to enact a filter which determines whether or
@@ -1891,7 +1895,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
}

assert(Cache->size());
assert(!MFMAsFound || Cache->size());
auto *DAG = SyncPipe[0].DAG;
for (auto &Elt : *Cache) {
if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
@@ -1994,7 +1998,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}

if (NumBits < 128) {
assert(TII->isVMEM(*MI) && MI->mayLoad());
assert(handleAsVMEMInstr(*MI, TII) && MI->mayLoad());
if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
MRI, MI->getOperand(0))) <=
128)
@@ -2079,6 +2083,9 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
static void resetDSWCounters() {
DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
}
Comment on lines 2083 to +2088
Contributor:

Using global variables here is completely wrong; these need to be pass members.

Contributor (Author):

I know, but since the code seems to be experimental, I thought I might be expected not to change it too much?

Contributor:

I'd split fixing this into a separate patch from the rest, and fix the static variables

Contributor (Author):

But if I may, I'm happy to adapt that as well

Contributor (Author):

Alright, will do

Contributor (Author):

So you're saying that we should make DSWCount, DSWWithPermCount and DSWWithSharedVMEMCount static variables of MFMASmallGemmSingleWaveOpt::applyIGLPStrategy() because they are only used in that function?

However, this wouldn't work for these class-level static members, for example:

class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
// The count of TRANS SUs involved in the interleaved pipeline
static unsigned TransPipeCount;
// The count of MFMA SUs involved in the interleaved pipeline
static unsigned MFMAPipeCount;
// The count of Add SUs involved in the interleaved pipeline
static unsigned AddPipeCount;
// The number of transitive MFMA successors for each TRANS SU
static unsigned MFMAEnablement;
// The number of transitive TRANS predecessors for each MFMA SU
static unsigned ExpRequirement;
// The count of independent "chains" of MFMA instructions in the pipeline
static unsigned MFMAChains;
// The length of each independent "chain" of MFMA instructions
static unsigned MFMAChainLength;
// Whether or not the pipeline has V_CVT instructions
static bool HasCvt;
// Whether or not there are instructions between the TRANS instruction and
// V_CVT
static bool HasChainBetweenCvt;
// The first occuring DS_READ which feeds an MFMA chain
static std::optional<unsigned> FirstPipeDSR;
// The MFMAPipe SUs with no MFMA predecessors

Because, e.g., TransPipeCount is used in MFMAExpInterleaveOpt::applyIGLPStrategy() as well as in MFMAExpInterleaveOpt::analyzeDAG(). MFMAExpInterleaveOpt hasn't bothered me or the fuzzer (yet?), and maybe there is no problem there, but we might as well fix it while we're at it... (?)

Contributor:

They should not be static anywhere. These values are not really static; they belong to the current compile context. They need to be non-static, per-instance variables that are thread safe and not shared between unrelated compiles in the same process.

Contributor (Author):

Just to be sure: That would imply that values from previous stages (AMDGPU::SchedulingPhase) should not be cached and that I should remove this behavior? So that, for example, DSWWithSharedVMEMCount is recalculated on every call to MFMASmallGemmSingleWaveOpt::applyIGLPStrategy()?

Contributor:

I don't know; maybe it should be cached within a single function. But certainly not across functions or compilations.

Contributor (Author):

Created #137549
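
(For illustration only: a minimal sketch of the direction discussed above, assuming the counters become ordinary non-static members of the strategy object instead of file-scope statics. The placement is hypothetical, and any caching of values across scheduling phases would have to be reworked separately:)

  class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
    // Per-instance state: each compile (and each thread) gets its own copy,
    // so nothing leaks between unrelated compilations in the same process.
    unsigned DSWCount = 0;
    unsigned DSWWithPermCount = 0;
    unsigned DSWWithSharedVMEMCount = 0;
    // ...
  };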


bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
@@ -2138,7 +2145,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(

for (auto &Succ : Pred.getSUnit()->Succs) {
auto *MI = Succ.getSUnit()->getInstr();
if (!TII->isVMEM(*MI) || !MI->mayLoad())
if (!handleAsVMEMInstr(*MI, TII) || !MI->mayLoad())
continue;

if (MissedAny || !VMEMLookup.size()) {
@@ -2200,7 +2207,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

// Interleave MFMA with DS_READ prefetch
for (unsigned I = 0; I < DSRCount - 4; ++I) {
for (unsigned I = 4; I < DSRCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2213,7 +2220,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2a: Loop carried dependency with V_PERM
// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
// depend on. Interleave MFMA to keep XDL unit busy throughout.
for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
@@ -2250,7 +2257,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2b: Loop carried dependency without V_PERM
// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
// Interleave MFMA to keep XDL unit busy throughout.
for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2426,17 +2433,15 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;

else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
(TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
handleAsVMEMInstr(MI, TII))
Result = true;

else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
MI.mayLoad() &&
(TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
MI.mayLoad() && handleAsVMEMInstr(MI, TII))
Result = true;

else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
MI.mayStore() &&
(TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
MI.mayStore() && handleAsVMEMInstr(MI, TII))
Result = true;

else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
@@ -2703,5 +2708,7 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
/// for a given region.
std::unique_ptr<ScheduleDAGMutation>
llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
if (Phase == AMDGPU::SchedulingPhase::Initial)
resetDSWCounters();
return std::make_unique<IGroupLPDAGMutation>(Phase);
}
96 changes: 96 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -321,6 +321,102 @@ entry:
ret void
}

; Check fixes for:
; - detection of VMEM_READS which are FLAT loads.
; - unsigned int underflows in MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
; - resetting global static DSWCounters for new runs.
; (reduced fuzzer-generated test case)
Comment on lines +325 to +328
Contributor:

Not sure how it checks for all these things

Contributor (Author):

Admittedly, maybe not for every unsigned int underflow I detected. (I found a lot of them while reducing the initial fuzzer test case, but I wasn't sure whether it made sense to construct a specific test for every issue I encountered...)
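
(For reference, the unsigned-underflow pattern that the loop-bound changes in AMDGPUIGroupLP.cpp address, shown schematically:)

  // Before: if DSWWithPermCount > DSWCount, the unsigned subtraction wraps
  // around and the loop runs close to UINT_MAX times instead of zero times.
  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { /* ... */ }

  // After: identical iteration count when DSWCount >= DSWWithPermCount, and
  // the loop simply does not execute when DSWCount is smaller.
  for (unsigned I = DSWWithPermCount; I < DSWCount; I++) { /* ... */ }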

define amdgpu_kernel void @test_iglp_opt_flat_load(ptr %ptr1, ptr %ptr2, ptr addrspace(3) %ptr3, ptr addrspace(3) %ptr4) {
; GCN-LABEL: test_iglp_opt_flat_load:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s8, 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v6, s2
; GCN-NEXT: v_mov_b32_e32 v7, s3
; GCN-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
; GCN-NEXT: flat_load_ubyte v8, v[6:7]
; GCN-NEXT: ; kill: killed $vgpr4 killed $vgpr5
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: ; kill: killed $vgpr6 killed $vgpr7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_nle_f16_e32 vcc, 0, v3
; GCN-NEXT: v_cmp_nge_f16_sdwa s[10:11], v3, v4 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_nge_f16_sdwa s[14:15], v2, v4 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_nge_f16_sdwa s[18:19], v0, v4 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_and_b32_e32 v5, 1, v8
; GCN-NEXT: v_cmp_nle_f16_e64 s[0:1], 0, v2
; GCN-NEXT: v_cmp_nle_f16_e64 s[2:3], 0, v1
; GCN-NEXT: v_cmp_nge_f16_sdwa s[16:17], v1, v4 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_nle_f16_e64 s[4:5], 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[10:11]
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[14:15]
; GCN-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[18:19]
; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v5
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[16:17]
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
; GCN-NEXT: v_lshlrev_b16_e32 v0, 2, v0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 3, v1
; GCN-NEXT: v_lshlrev_b16_e32 v2, 1, v2
; GCN-NEXT: v_lshlrev_b16_e32 v6, 1, v6
; GCN-NEXT: v_lshlrev_b16_e32 v4, 2, v4
; GCN-NEXT: v_lshlrev_b16_e32 v5, 3, v5
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_or_b32_e32 v1, v3, v2
; GCN-NEXT: v_or_b32_e32 v3, v7, v6
; GCN-NEXT: v_or_b32_e32 v2, v5, v4
; GCN-NEXT: v_and_b32_e32 v1, 3, v1
; GCN-NEXT: v_and_b32_e32 v3, 3, v3
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_or_b32_e32 v1, v3, v2
; GCN-NEXT: v_lshlrev_b16_e32 v0, 4, v0
; GCN-NEXT: v_and_b32_e32 v1, 15, v1
; GCN-NEXT: s_xor_b64 s[0:1], s[6:7], -1
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_mov_b32_e32 v1, s12
; GCN-NEXT: ds_write_b8 v1, v0
; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_mov_b32 s10, s8
; GCN-NEXT: s_mov_b32 s11, s8
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
; GCN-NEXT: v_mov_b32_e32 v4, s13
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:112
; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:96
; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:80
; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:64
; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:48
; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:32
; GCN-NEXT: ds_write_b128 v4, v[0:3] offset:16
; GCN-NEXT: ds_write_b128 v4, v[0:3]
; GCN-NEXT: .LBB4_2: ; %common.ret
; GCN-NEXT: s_endpgm
entry:
%LGV2 = load <8 x half>, ptr %ptr1, align 16
%LGV = load i1, ptr %ptr2, align 1
call void @llvm.amdgcn.iglp.opt(i32 1)
%C = fcmp ugt <8 x half> zeroinitializer, %LGV2
store <8 x i1> %C, ptr addrspace(3) %ptr3, align 1
br i1 %LGV, label %common.ret, label %F

common.ret: ; preds = %F, %entry
ret void

F: ; preds = %entry
store <32 x float> zeroinitializer, ptr addrspace(3) %ptr4, align 128
br label %common.ret
}

declare void @llvm.amdgcn.iglp.opt(i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1