Skip to content

Commit 26b7a20

Browse files
jrbyrnes authored and bcahoon committed
[AMDGPU][IGLP] SingleWaveOpt: Cache DSW Counters from PreRA (llvm#67759)
Save the DSW counters from PreRA scheduling. While this avoids recalculation in the PostRA pass, that isn't the main purpose. This is required because of physical register dependencies in PostRA scheduling -- they alter the DAG such that our counters may become incorrect -- which alters the layout of the pipeline. By preserving the values from PreRA, we can be sure that we accurately construct the pipeline. Additionally, remove a bad assert in SharesPredWithPrevNthGroup -- it is possible that we will have an empty cache if OtherGroup has no elements which have a V_PERM pred (possible if the V_PERM SG is empty). Change-Id: I2957a575ae039b67274587f113cac68d25317c51
1 parent b5706be commit 26b7a20

File tree

6 files changed

+186
-167
lines changed

6 files changed

+186
-167
lines changed

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Lines changed: 76 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,8 @@ class IGLPStrategy {
854854
// Add SchedGroups to \p Pipeline to implement this Strategy.
855855
virtual void applyIGLPStrategy(
856856
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
857-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
857+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
858+
bool IsPostRA) = 0;
858859

859860
// Returns true if this strategy should be applied to a ScheduleDAG.
860861
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -872,7 +873,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
872873
public:
873874
void applyIGLPStrategy(
874875
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
875-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
876+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
877+
bool IsPostRA) override;
876878

877879
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
878880

@@ -884,7 +886,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
884886

885887
void MFMASmallGemmOpt::applyIGLPStrategy(
886888
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
887-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
889+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
890+
bool IsPostRA) {
888891
// Count the number of MFMA instructions.
889892
unsigned MFMACount = 0;
890893
for (const MachineInstr &I : *DAG)
@@ -1080,9 +1083,12 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
10801083
Cache->push_back(Pred.getSUnit());
10811084
}
10821085
}
1086+
1087+
// If the other group has no PERM preds, then this group won't share any
1088+
if (!Cache->size())
1089+
return false;
10831090
}
10841091

1085-
assert(Cache->size());
10861092
auto DAG = SyncPipe[0].DAG;
10871093
// Does the previous DS_WRITE share a V_PERM predecessor with this
10881094
// VMEM_READ
@@ -1099,7 +1105,8 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
10991105
public:
11001106
void applyIGLPStrategy(
11011107
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1102-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
1108+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1109+
bool IsPostRA) override;
11031110

11041111
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
11051112

@@ -1109,14 +1116,20 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
11091116
}
11101117
};
11111118

1119+
static unsigned DSWCount = 0;
1120+
static unsigned DSWWithPermCount = 0;
1121+
static unsigned DSWWithSharedVMEMCount = 0;
1122+
11121123
void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11131124
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1114-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
1125+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1126+
bool IsPostRA) {
11151127
unsigned MFMACount = 0;
1116-
unsigned DSWCount = 0;
1117-
unsigned DSWWithPermCount = 0;
1118-
unsigned DSWWithSharedVMEMCount = 0;
11191128
unsigned DSRCount = 0;
1129+
1130+
assert((IsPostRA ||
1131+
DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0) &&
1132+
"DSWCounters should be zero in pre-RA scheduling!");
11201133
SmallVector<SUnit *, 6> DSWithPerms;
11211134
for (auto &SU : DAG->SUnits) {
11221135
auto I = SU.getInstr();
@@ -1125,7 +1138,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11251138
else if (TII->isDS(*I)) {
11261139
if (I->mayLoad())
11271140
++DSRCount;
1128-
else if (I->mayStore()) {
1141+
else if (I->mayStore() && !IsPostRA) {
11291142
++DSWCount;
11301143
for (auto Pred : SU.Preds) {
11311144
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1137,57 +1150,59 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11371150
}
11381151
}
11391152
}
1140-
DSWWithPermCount = DSWithPerms.size();
1141-
auto I = DSWithPerms.begin();
1142-
auto E = DSWithPerms.end();
1143-
1144-
// Get the count of DS_WRITES with V_PERM predecessors which
1145-
// have loop carried dependencies (WAR) on the same VMEM_READs.
1146-
// We consider partial overlap as a miss -- in other words,
1147-
// for a given DS_W, we only consider another DS_W as matching
1148-
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
1149-
// for every V_PERM pred of this DS_W.
1150-
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
1151-
SmallVector<SUnit *, 6> Counted;
1152-
for (; I != E; I++) {
1153-
SUnit *Cand = nullptr;
1154-
bool MissedAny = false;
1155-
for (auto &Pred : (*I)->Preds) {
1156-
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
1157-
continue;
11581153

1159-
if (Cand &&
1160-
std::find(Counted.begin(), Counted.end(), Cand) != Counted.end())
1161-
break;
1162-
1163-
for (auto &Succ : Pred.getSUnit()->Succs) {
1164-
auto MI = Succ.getSUnit()->getInstr();
1165-
if (!TII->isVMEM(*MI) || !MI->mayLoad())
1154+
if (!IsPostRA) {
1155+
DSWWithPermCount = DSWithPerms.size();
1156+
auto I = DSWithPerms.begin();
1157+
auto E = DSWithPerms.end();
1158+
1159+
// Get the count of DS_WRITES with V_PERM predecessors which
1160+
// have loop carried dependencies (WAR) on the same VMEM_READs.
1161+
// We consider partial overlap as a miss -- in other words,
1162+
// for a given DS_W, we only consider another DS_W as matching
1163+
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
1164+
// for every V_PERM pred of this DS_W.
1165+
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
1166+
SmallVector<SUnit *, 6> Counted;
1167+
for (; I != E; I++) {
1168+
SUnit *Cand = nullptr;
1169+
bool MissedAny = false;
1170+
for (auto &Pred : (*I)->Preds) {
1171+
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
11661172
continue;
11671173

1168-
if (MissedAny || !VMEMLookup.size()) {
1169-
MissedAny = true;
1170-
VMEMLookup[MI] = *I;
1171-
continue;
1172-
}
1174+
if (Cand && llvm::is_contained(Counted, Cand))
1175+
break;
11731176

1174-
if (!VMEMLookup.contains(MI)) {
1175-
MissedAny = true;
1176-
VMEMLookup[MI] = *I;
1177-
continue;
1178-
}
1177+
for (auto &Succ : Pred.getSUnit()->Succs) {
1178+
auto MI = Succ.getSUnit()->getInstr();
1179+
if (!TII->isVMEM(*MI) || !MI->mayLoad())
1180+
continue;
11791181

1180-
Cand = VMEMLookup[MI];
1181-
if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) {
1182-
MissedAny = true;
1183-
break;
1182+
if (MissedAny || !VMEMLookup.size()) {
1183+
MissedAny = true;
1184+
VMEMLookup[MI] = *I;
1185+
continue;
1186+
}
1187+
1188+
if (!VMEMLookup.contains(MI)) {
1189+
MissedAny = true;
1190+
VMEMLookup[MI] = *I;
1191+
continue;
1192+
}
1193+
1194+
Cand = VMEMLookup[MI];
1195+
if (llvm::is_contained(Counted, Cand)) {
1196+
MissedAny = true;
1197+
break;
1198+
}
11841199
}
11851200
}
1186-
}
1187-
if (!MissedAny && Cand) {
1188-
DSWWithSharedVMEMCount += 2;
1189-
Counted.push_back(Cand);
1190-
Counted.push_back(*I);
1201+
if (!MissedAny && Cand) {
1202+
DSWWithSharedVMEMCount += 2;
1203+
Counted.push_back(Cand);
1204+
Counted.push_back(*I);
1205+
}
11911206
}
11921207
}
11931208

@@ -1403,7 +1418,11 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
14031418
// first created SchedGroup first.
14041419
bool IsBottomUp = 1;
14051420

1421+
// Whether the mutation is being applied to post RA scheduling
1422+
bool IsPostRA = false;
1423+
14061424
IGroupLPDAGMutation() = default;
1425+
IGroupLPDAGMutation(bool IsPostRA) : IsPostRA(IsPostRA) {}
14071426
};
14081427

14091428
unsigned SchedGroup::NumSchedGroups = 0;
@@ -1691,16 +1710,16 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
16911710
auto S = createIGLPStrategy(StrategyID, DAG, TII);
16921711
if (S->shouldApplyStrategy(DAG)) {
16931712
IsBottomUp = S->IsBottomUp;
1694-
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1713+
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsPostRA);
16951714
}
16961715
}
16971716

16981717
} // namespace
16991718

17001719
namespace llvm {
17011720

1702-
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1703-
return std::make_unique<IGroupLPDAGMutation>();
1721+
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA) {
1722+
return std::make_unique<IGroupLPDAGMutation>(IsPostRA);
17041723
}
17051724

17061725
} // end namespace llvm

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
namespace llvm {
1616

17-
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
17+
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA);
1818

1919
} // namespace llvm
2020

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
440440
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
441441
if (ST.shouldClusterStores())
442442
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
443-
DAG->addMutation(createIGroupLPDAGMutation());
443+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
444444
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
445445
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
446446
return DAG;
@@ -450,7 +450,7 @@ static ScheduleDAGInstrs *
450450
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
451451
ScheduleDAGMILive *DAG =
452452
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
453-
DAG->addMutation(createIGroupLPDAGMutation());
453+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
454454
return DAG;
455455
}
456456

@@ -914,7 +914,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
914914
if (ST.shouldClusterStores())
915915
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
916916
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
917-
DAG->addMutation(createIGroupLPDAGMutation());
917+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
918918
if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
919919
DAG->addMutation(createVOPDPairingMutation());
920920
return DAG;

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
691691
return false;
692692

693693
SavedMutations.swap(DAG.Mutations);
694-
DAG.addMutation(createIGroupLPDAGMutation());
694+
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
695695

696696
InitialOccupancy = DAG.MinOccupancy;
697697
// Aggressively try to reduce register pressure in the unclustered high RP
@@ -826,7 +826,7 @@ bool GCNSchedStage::initGCNRegion() {
826826
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
827827
SavedMutations.clear();
828828
SavedMutations.swap(DAG.Mutations);
829-
DAG.addMutation(createIGroupLPDAGMutation());
829+
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
830830
}
831831
return true;
832832
}
@@ -1539,7 +1539,7 @@ void GCNPostScheduleDAGMILive::schedule() {
15391539
if (HasIGLPInstrs) {
15401540
SavedMutations.clear();
15411541
SavedMutations.swap(Mutations);
1542-
addMutation(createIGroupLPDAGMutation());
1542+
addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
15431543
}
15441544

15451545
ScheduleDAGMI::schedule();

0 commit comments

Comments
 (0)