Skip to content

Commit e3fdc3a

Browse files
[RISCV] Allow hoisting VXRM writes out of loops speculatively (#110044)
Change the intersect used by the anticipated analysis to ignore unknown states when anticipating. This effectively allows speculative VXRM writes, because a VXRM write may now be emitted even when there are branches where VXRM is not needed. This change matters because VXRM writes cause pipeline flushes in some micro-architectures, so it makes sense to allow more aggressive hoisting even if it causes some degradation on the slow path. An example is this code:

```
typedef unsigned char uint8_t;
__attribute__ ((noipa))
void foo (uint8_t *dst, int i_dst_stride,
          uint8_t *src1, int i_src1_stride,
          uint8_t *src2, int i_src2_stride,
          int i_width, int i_height )
{
  for( int y = 0; y < i_height; y++ ) {
    for( int x = 0; x < i_width; x++ )
      dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
    dst += i_dst_stride;
    src1 += i_src1_stride;
    src2 += i_src2_stride;
  }
}
```

With this patch, the VXRM write for the code above is hoisted out of the outer loop.
1 parent febbf91 commit e3fdc3a

File tree

4 files changed

+712
-6
lines changed

4 files changed

+712
-6
lines changed

llvm/lib/Target/RISCV/RISCVFeatures.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1453,6 +1453,9 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
14531453
def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
14541454
"Ventana Veyron-Series processors">;
14551455

1456+
// Tuning flag for cores where a write to VXRM flushes the pipeline.
// RISCVInsertWriteVXRM checks it (hasVXRMPipelineFlush) to select the
// speculative intersectAnticipated merge when computing anticipated state.
def TuneVXRMPipelineFlush : SubtargetFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush",
1457+
"true", "VXRM writes causes pipeline flush">;
1458+
14561459
// Assume that lock-free native-width atomics are available, even if the target
14571460
// and operating system combination would not usually provide them. The user
14581461
// is responsible for providing any necessary __sync implementations. Code

llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,35 @@ class VXRMInfo {
109109
return VXRMInfo::getUnknown();
110110
}
111111

112+
// Merge the anticipated VXRMInfo of two successors: *this is the value
113+
// accumulated so far and Other is the next successor's AnticipatedIn.
114+
// Unknown on one side is ignored in favour of the other side, so a
115+
// WriteVXRM may be hoisted speculatively onto paths that do not need it.
116+
// Rationale: WriteVXRM causes a pipeline flush in some uarchs and moving
117+
// it outside loops is very important for some workloads.
118+
VXRMInfo intersectAnticipated(const VXRMInfo &Other) const {
119+
// If the new value isn't valid, ignore it.
120+
if (!Other.isValid())
121+
return *this;
122+
123+
// If this value isn't valid, this must be the first successor, use it.
124+
if (!isValid())
125+
return Other;
126+
127+
// If either is unknown, the result is the other one.
128+
if (isUnknown())
129+
return Other;
130+
if (Other.isUnknown())
131+
return *this;
132+
133+
// If we have an exact match, return this.
134+
if (*this == Other)
135+
return *this;
136+
137+
// Otherwise the two known values conflict, so the result is unknown.
138+
return VXRMInfo::getUnknown();
139+
}
140+
112141
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
113142
/// Support for debugging, callable in GDB: V->dump()
114143
LLVM_DUMP_METHOD void dump() const {
@@ -187,7 +216,7 @@ class RISCVInsertWriteVXRM : public MachineFunctionPass {
187216
private:
188217
bool computeVXRMChanges(const MachineBasicBlock &MBB);
189218
void computeAvailable(const MachineBasicBlock &MBB);
190-
void computeAnticipated(const MachineBasicBlock &MBB);
219+
void computeAnticipated(const MachineFunction &MF, const MachineBasicBlock &MBB);
191220
void emitWriteVXRM(MachineBasicBlock &MBB);
192221
};
193222

@@ -279,8 +308,9 @@ void RISCVInsertWriteVXRM::computeAvailable(const MachineBasicBlock &MBB) {
279308
}
280309
}
281310

282-
void RISCVInsertWriteVXRM::computeAnticipated(const MachineBasicBlock &MBB) {
311+
void RISCVInsertWriteVXRM::computeAnticipated(const MachineFunction &MF, const MachineBasicBlock &MBB) {
283312
BlockData &BBInfo = BlockInfo[MBB.getNumber()];
313+
const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
284314

285315
BBInfo.InQueue = false;
286316

@@ -289,7 +319,11 @@ void RISCVInsertWriteVXRM::computeAnticipated(const MachineBasicBlock &MBB) {
289319
Anticipated.setUnknown();
290320
} else {
291321
for (const MachineBasicBlock *S : MBB.successors())
292-
Anticipated =
322+
if (ST.hasVXRMPipelineFlush())
323+
Anticipated =
324+
Anticipated.intersectAnticipated(BlockInfo[S->getNumber()].AnticipatedIn);
325+
else
326+
Anticipated =
293327
Anticipated.intersect(BlockInfo[S->getNumber()].AnticipatedIn);
294328
}
295329

@@ -453,7 +487,7 @@ bool RISCVInsertWriteVXRM::runOnMachineFunction(MachineFunction &MF) {
453487
while (!WorkList.empty()) {
454488
const MachineBasicBlock &MBB = *WorkList.front();
455489
WorkList.pop();
456-
computeAnticipated(MBB);
490+
computeAnticipated(MF, MBB);
457491
}
458492

459493
// Phase 4 - Emit VXRM writes at the earliest place possible.

llvm/lib/Target/RISCV/RISCVProcessors.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,8 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
277277
FeatureUnalignedScalarMem,
278278
FeatureUnalignedVectorMem]),
279279
!listconcat(SiFiveP400TuneFeatures,
280-
[TuneNoSinkSplatOperands])>;
280+
[TuneNoSinkSplatOperands,
281+
TuneVXRMPipelineFlush])>;
281282

282283

283284
def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
@@ -298,6 +299,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
298299
TuneLUIADDIFusion,
299300
TuneAUIPCADDIFusion,
300301
TuneNoSinkSplatOperands,
302+
TuneVXRMPipelineFlush,
301303
FeaturePostRAScheduler]>;
302304

303305
def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
@@ -510,7 +512,8 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
510512
[TuneDLenFactor2,
511513
TuneOptimizedNF2SegmentLoadStore,
512514
TuneOptimizedNF3SegmentLoadStore,
513-
TuneOptimizedNF4SegmentLoadStore]> {
515+
TuneOptimizedNF4SegmentLoadStore,
516+
TuneVXRMPipelineFlush]> {
514517
let MVendorID = 0x710;
515518
let MArchID = 0x8000000058000001;
516519
let MImpID = 0x1000000049772200;

0 commit comments

Comments
 (0)