@@ -265,6 +265,82 @@ struct SUnitWithMemInfo {
265
265
bool getUnderlyingObjects ();
266
266
};
267
267
268
+ // / Add loop-carried chain dependencies. This class handles the same type of
269
+ // / dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
270
+ // / account dependencies across iterations.
271
+ class LoopCarriedOrderDepsTracker {
272
+ // Type of instruction that is relevant to order-dependencies
273
+ enum class InstrTag {
274
+ Barrier = 0 , // /< A barrier event instruction.
275
+ LoadOrStore = 1 , // /< An instruction that may load or store memory, but is
276
+ // /< not a barrier event.
277
+ FPExceptions = 2 , // /< An instruction that does not match above, but may
278
+ // /< raise floatin-point exceptions.
279
+ };
280
+
281
+ struct TaggedSUnit : PointerIntPair<SUnit *, 2 > {
282
+ TaggedSUnit (SUnit *SU, InstrTag Tag)
283
+ : PointerIntPair<SUnit *, 2 >(SU, unsigned (Tag)) {}
284
+
285
+ InstrTag getTag () const { return InstrTag (getInt ()); }
286
+ };
287
+
288
+ // / Holds loads and stores with memory related information.
289
+ struct LoadStoreChunk {
290
+ SmallVector<SUnitWithMemInfo, 4 > Loads;
291
+ SmallVector<SUnitWithMemInfo, 4 > Stores;
292
+
293
+ void append (SUnit *SU);
294
+ };
295
+
296
+ SwingSchedulerDAG *DAG;
297
+ BatchAAResults *BAA;
298
+ std::vector<SUnit> &SUnits;
299
+
300
+ // / The size of SUnits, for convenience.
301
+ const unsigned N;
302
+
303
+ // / Loop-carried Edges.
304
+ std::vector<BitVector> LoopCarried;
305
+
306
+ // / Instructions related to chain dependencies. They are one of the
307
+ // / following:
308
+ // /
309
+ // / 1. Barrier event.
310
+ // / 2. Load, but neither a barrier event, invariant load, nor may load trap
311
+ // / value.
312
+ // / 3. Store, but not a barrier event.
313
+ // / 4. None of them, but may raise floating-point exceptions.
314
+ // /
315
+ // / This is used when analyzing loop-carried dependencies that access global
316
+ // / barrier instructions.
317
+ std::vector<TaggedSUnit> TaggedSUnits;
318
+
319
+ const TargetInstrInfo *TII = nullptr ;
320
+ const TargetRegisterInfo *TRI = nullptr ;
321
+
322
+ public:
323
+ LoopCarriedOrderDepsTracker (SwingSchedulerDAG *SSD, BatchAAResults *BAA,
324
+ const TargetInstrInfo *TII,
325
+ const TargetRegisterInfo *TRI);
326
+
327
+ // / The main function to compute loop-carried order-dependencies.
328
+ void computeDependencies ();
329
+
330
+ const BitVector &getLoopCarried (unsigned Idx) const {
331
+ return LoopCarried[Idx];
332
+ }
333
+
334
+ private:
335
+ // / Tags to \p SU if the instruction may affect the order-dependencies.
336
+ std::optional<InstrTag> getInstrTag (SUnit *SU) const ;
337
+
338
+ void addLoopCarriedDepenenciesForChunks (const LoadStoreChunk &From,
339
+ const LoadStoreChunk &To);
340
+
341
+ void computeDependenciesAux ();
342
+ };
343
+
268
344
} // end anonymous namespace
269
345
270
346
// / The "main" function for implementing Swing Modulo Scheduling.
@@ -592,13 +668,19 @@ void SwingSchedulerDAG::setMAX_II() {
592
668
// / scheduling part of the Swing Modulo Scheduling algorithm.
593
669
void SwingSchedulerDAG::schedule () {
594
670
buildSchedGraph (AA);
595
- addLoopCarriedDependences ();
671
+ const LoopCarriedEdges LCE = addLoopCarriedDependences ();
596
672
updatePhiDependences ();
597
673
Topo.InitDAGTopologicalSorting ();
598
674
changeDependences ();
599
675
postProcessDAG ();
600
676
DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
601
- LLVM_DEBUG (dump ());
677
+ LLVM_DEBUG ({
678
+ dump ();
679
+ dbgs () << " ===== Loop Carried Edges Begin =====\n " ;
680
+ for (SUnit &SU : SUnits)
681
+ LCE.dump (&SU, TRI, &MRI);
682
+ dbgs () << " ===== Loop Carried Edges End =====\n " ;
683
+ });
602
684
603
685
NodeSetType NodeSets;
604
686
findCircuits (NodeSets);
@@ -831,15 +913,6 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
831
913
return false ;
832
914
}
833
915
834
- // / Return true if the instruction causes a chain between memory
835
- // / references before and after it.
836
- static bool isDependenceBarrier (MachineInstr &MI) {
837
- return MI.isCall () || MI.mayRaiseFPException () ||
838
- MI.hasUnmodeledSideEffects () ||
839
- (MI.hasOrderedMemoryRef () &&
840
- (!MI.mayLoad () || !MI.isDereferenceableInvariantLoad ()));
841
- }
842
-
843
916
SUnitWithMemInfo::SUnitWithMemInfo (SUnit *SU) : SU(SU) {
844
917
if (!getUnderlyingObjects ())
845
918
return ;
@@ -940,28 +1013,111 @@ static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
940
1013
return false ;
941
1014
}
942
1015
1016
+ void LoopCarriedOrderDepsTracker::LoadStoreChunk::append (SUnit *SU) {
1017
+ const MachineInstr *MI = SU->getInstr ();
1018
+ if (!MI->mayLoadOrStore ())
1019
+ return ;
1020
+ (MI->mayStore () ? Stores : Loads).emplace_back (SU);
1021
+ }
1022
+
1023
+ LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker (
1024
+ SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1025
+ const TargetRegisterInfo *TRI)
1026
+ : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1027
+ LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1028
+
1029
+ void LoopCarriedOrderDepsTracker::computeDependencies () {
1030
+ // Traverse all instructions and extract only what we are targetting.
1031
+ for (auto &SU : SUnits) {
1032
+ auto Tagged = getInstrTag (&SU);
1033
+
1034
+ // This instruction has no loop-carried order-dependencies.
1035
+ if (!Tagged)
1036
+ continue ;
1037
+ TaggedSUnits.emplace_back (&SU, *Tagged);
1038
+ }
1039
+
1040
+ computeDependenciesAux ();
1041
+ }
1042
+
1043
+ std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1044
+ LoopCarriedOrderDepsTracker::getInstrTag (SUnit *SU) const {
1045
+ MachineInstr *MI = SU->getInstr ();
1046
+ if (TII->isGlobalMemoryObject (MI))
1047
+ return InstrTag::Barrier;
1048
+
1049
+ if (MI->mayStore () ||
1050
+ (MI->mayLoad () && !MI->isDereferenceableInvariantLoad ()))
1051
+ return InstrTag::LoadOrStore;
1052
+
1053
+ if (MI->mayRaiseFPException ())
1054
+ return InstrTag::FPExceptions;
1055
+
1056
+ return std::nullopt;
1057
+ }
1058
+
1059
+ void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks (
1060
+ const LoadStoreChunk &From, const LoadStoreChunk &To) {
1061
+ // Add dependencies for load-to-store (WAR) from top to bottom.
1062
+ for (const SUnitWithMemInfo &Src : From.Loads )
1063
+ for (const SUnitWithMemInfo &Dst : To.Stores )
1064
+ if (Src.SU ->NodeNum < Dst.SU ->NodeNum &&
1065
+ hasLoopCarriedMemDep (Src, Dst, *BAA, TII, TRI))
1066
+ LoopCarried[Src.SU ->NodeNum ].set (Dst.SU ->NodeNum );
1067
+
1068
+ // TODO: The following dependencies are missed.
1069
+ //
1070
+ // - Dependencies for load-to-store from bottom to top.
1071
+ // - Dependencies for store-to-load (RAW).
1072
+ // - Dependencies for store-to-store (WAW).
1073
+ }
1074
+
1075
+ void LoopCarriedOrderDepsTracker::computeDependenciesAux () {
1076
+ SmallVector<LoadStoreChunk, 2 > Chunks (1 );
1077
+ for (const auto &TSU : TaggedSUnits) {
1078
+ InstrTag Tag = TSU.getTag ();
1079
+ SUnit *SU = TSU.getPointer ();
1080
+ switch (Tag) {
1081
+ case InstrTag::Barrier:
1082
+ Chunks.emplace_back ();
1083
+ break ;
1084
+ case InstrTag::LoadOrStore:
1085
+ Chunks.back ().append (SU);
1086
+ break ;
1087
+ case InstrTag::FPExceptions:
1088
+ // TODO: Handle this properly.
1089
+ break ;
1090
+ }
1091
+ }
1092
+
1093
+ // Add dependencies between memory operations. If there are one or more
1094
+ // barrier events between two memory instructions, we don't add a
1095
+ // loop-carried dependence for them.
1096
+ for (const LoadStoreChunk &Chunk : Chunks)
1097
+ addLoopCarriedDepenenciesForChunks (Chunk, Chunk);
1098
+
1099
+ // TODO: If there are multiple barrier instructions, dependencies from the
1100
+ // last barrier instruction (or load/store below it) to the first barrier
1101
+ // instruction (or load/store above it).
1102
+ }
1103
+
943
1104
// / Add a chain edge between a load and store if the store can be an
944
1105
// / alias of the load on a subsequent iteration, i.e., a loop carried
945
1106
// / dependence. This code is very similar to the code in ScheduleDAGInstrs
946
1107
// / but that code doesn't create loop carried dependences.
947
- void SwingSchedulerDAG::addLoopCarriedDependences () {
948
- SmallVector<SUnitWithMemInfo, 4 > PendingLoads;
949
- for (auto &SU : SUnits) {
950
- MachineInstr &MI = *SU.getInstr ();
951
- if (isDependenceBarrier (MI))
952
- PendingLoads.clear ();
953
- else if (MI.mayLoad ()) {
954
- PendingLoads.emplace_back (&SU);
955
- } else if (MI.mayStore ()) {
956
- SUnitWithMemInfo Store (&SU);
957
- for (const SUnitWithMemInfo &Load : PendingLoads)
958
- if (hasLoopCarriedMemDep (Load, Store, BAA, TII, TRI)) {
959
- SDep Dep (Load.SU , SDep::Barrier);
960
- Dep.setLatency (1 );
961
- SU.addPred (Dep);
962
- }
963
- }
964
- }
1108
+ // / TODO: Also compute output-dependencies.
1109
+ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences () {
1110
+ LoopCarriedEdges LCE;
1111
+
1112
+ // Add loop-carried order-dependencies
1113
+ LoopCarriedOrderDepsTracker LCODTracker (this , &BAA, TII, TRI);
1114
+ LCODTracker.computeDependencies ();
1115
+ for (unsigned I = 0 ; I != SUnits.size (); I++)
1116
+ for (const int Succ : LCODTracker.getLoopCarried (I).set_bits ())
1117
+ LCE.OrderDeps [&SUnits[I]].insert (&SUnits[Succ]);
1118
+
1119
+ LCE.modifySUnits (SUnits);
1120
+ return LCE;
965
1121
}
966
1122
967
1123
// / Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
@@ -4001,3 +4157,37 @@ const SwingSchedulerDDG::EdgesType &
4001
4157
SwingSchedulerDDG::getOutEdges (const SUnit *SU) const {
4002
4158
return getEdges (SU).Succs ;
4003
4159
}
4160
+
4161
+ void LoopCarriedEdges::modifySUnits (std::vector<SUnit> &SUnits) {
4162
+ // Currently this function simply adds all dependencies represented by this
4163
+ // object. After we properly handle missed dependencies, the logic here will
4164
+ // be more complex, as currently missed edges should not be added to the DAG.
4165
+ for (SUnit &SU : SUnits) {
4166
+ SUnit *Src = &SU;
4167
+ if (auto *OrderDep = getOrderDepOrNull (Src)) {
4168
+ SDep Dep (Src, SDep::Barrier);
4169
+ Dep.setLatency (1 );
4170
+ for (SUnit *Dst : *OrderDep)
4171
+ Dst->addPred (Dep);
4172
+ }
4173
+ }
4174
+ }
4175
+
4176
+ void LoopCarriedEdges::dump (SUnit *SU, const TargetRegisterInfo *TRI,
4177
+ const MachineRegisterInfo *MRI) const {
4178
+ const auto *Order = getOrderDepOrNull (SU);
4179
+
4180
+ if (!Order)
4181
+ return ;
4182
+
4183
+ const auto DumpSU = [](const SUnit *SU) {
4184
+ std::ostringstream OSS;
4185
+ OSS << " SU(" << SU->NodeNum << " )" ;
4186
+ return OSS.str ();
4187
+ };
4188
+
4189
+ dbgs () << " Loop carried edges from " << DumpSU (SU) << " \n "
4190
+ << " Order\n " ;
4191
+ for (SUnit *Dst : *Order)
4192
+ dbgs () << " " << DumpSU (Dst) << " \n " ;
4193
+ }
0 commit comments