Skip to content

Commit ca9d2e9

Browse files
author
Sumanth Gundapaneni
authored
[Hexagon] Add Loop Alignment pass. (#83379)
Inspect each basic block and, if it is a single-basic-block loop with a small number of instructions, set its loop alignment to 32 bytes. This avoids a cache-line break in the first packet of the loop, which would otherwise cause a stall on every execution of the loop.
1 parent 8466ab9 commit ca9d2e9

File tree

9 files changed

+680
-2
lines changed

9 files changed

+680
-2
lines changed

clang/test/CodeGen/builtins-hexagon.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// REQUIRES: hexagon-registered-target
2-
// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -emit-llvm %s -o - | FileCheck %s
2+
// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -target-feature +hvx-length128b -emit-llvm %s -o - | FileCheck %s
33

44
void test() {
55
int v64 __attribute__((__vector_size__(64)));

llvm/lib/Target/Hexagon/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ add_llvm_target(HexagonCodeGen
4343
HexagonISelDAGToDAGHVX.cpp
4444
HexagonISelLowering.cpp
4545
HexagonISelLoweringHVX.cpp
46+
HexagonLoopAlign.cpp
4647
HexagonLoopIdiomRecognition.cpp
4748
HexagonMachineFunctionInfo.cpp
4849
HexagonMachineScheduler.cpp
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
//===----- HexagonLoopAlign.cpp - Generate loop alignment directives -----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// Inspect a basic block and, if it is a single-basic-block loop with a small
9+
// number of instructions, set the block's alignment to 32 bytes (1 << 5).
10+
//===----------------------------------------------------------------------===//
11+
12+
#define DEBUG_TYPE "hexagon-loop-align"
13+
14+
#include "HexagonTargetMachine.h"
15+
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
16+
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
17+
#include "llvm/CodeGen/SchedulerRegistry.h"
18+
#include "llvm/Support/Debug.h"
19+
20+
using namespace llvm;
21+
22+
// Command-line knobs for the Hexagon loop alignment pass. All of them are
// hidden (cl::Hidden) tuning options.

// When set, runOnMachineFunction returns without inspecting any block.
static cl::opt<bool>
    DisableLoopAlign("disable-hexagon-loop-align", cl::Hidden,
                     cl::desc("Disable Hexagon loop alignment pass"));

// Upper bound on the instruction count of a loop that contains at least one
// HVX instruction (used by shouldBalignLoop on non-tiny-core subtargets).
static cl::opt<uint32_t> HVXLoopAlignLimitUB(
    "hexagon-hvx-loop-align-limit-ub", cl::Hidden, cl::init(16),
    cl::desc("Set hexagon hvx loop upper bound align limit"));

// Upper bound on the instruction count of a loop on tiny-core subtargets.
static cl::opt<uint32_t> TinyLoopAlignLimitUB(
    "hexagon-tiny-loop-align-limit-ub", cl::Hidden, cl::init(16),
    cl::desc("Set hexagon tiny-core loop upper bound align limit"));

// Upper bound on the instruction count of a non-HVX loop whose back-edge
// frequency is above LoopEdgeThreshold.
static cl::opt<uint32_t>
    LoopAlignLimitUB("hexagon-loop-align-limit-ub", cl::Hidden, cl::init(8),
                     cl::desc("Set hexagon loop upper bound align limit"));

// Lower bound on the instruction count; applied to every candidate loop.
static cl::opt<uint32_t>
    LoopAlignLimitLB("hexagon-loop-align-limit-lb", cl::Hidden, cl::init(4),
                     cl::desc("Set hexagon loop lower bound align limit"));

// Maximum number of bundles in a candidate loop (non-tiny-core subtargets).
static cl::opt<uint32_t>
    LoopBndlAlignLimit("hexagon-loop-bundle-align-limit", cl::Hidden,
                       cl::init(4),
                       cl::desc("Set hexagon loop align bundle limit"));

// Maximum number of bundles in a candidate loop on tiny-core subtargets.
static cl::opt<uint32_t> TinyLoopBndlAlignLimit(
    "hexagon-tiny-loop-bundle-align-limit", cl::Hidden, cl::init(8),
    cl::desc("Set hexagon tiny-core loop align bundle limit"));

// Back-edge frequency that must be exceeded for a loop to count as "above
// threshold" in attemptToBalignSmallLoop.
static cl::opt<uint32_t>
    LoopEdgeThreshold("hexagon-loop-edge-threshold", cl::Hidden, cl::init(7500),
                      cl::desc("Set hexagon loop align edge threshold"));
54+
55+
namespace llvm {
56+
FunctionPass *createHexagonLoopAlign();
57+
void initializeHexagonLoopAlignPass(PassRegistry &);
58+
} // namespace llvm
59+
60+
namespace {
61+
62+
class HexagonLoopAlign : public MachineFunctionPass {
63+
const HexagonSubtarget *HST = nullptr;
64+
const TargetMachine *HTM = nullptr;
65+
const HexagonInstrInfo *HII = nullptr;
66+
67+
public:
68+
static char ID;
69+
HexagonLoopAlign() : MachineFunctionPass(ID) {
70+
initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
71+
}
72+
bool shouldBalignLoop(MachineBasicBlock &BB, bool AboveThres);
73+
bool isSingleLoop(MachineBasicBlock &MBB);
74+
bool attemptToBalignSmallLoop(MachineFunction &MF, MachineBasicBlock &MBB);
75+
76+
void getAnalysisUsage(AnalysisUsage &AU) const override {
77+
AU.addRequired<MachineBranchProbabilityInfo>();
78+
AU.addRequired<MachineBlockFrequencyInfo>();
79+
MachineFunctionPass::getAnalysisUsage(AU);
80+
}
81+
82+
StringRef getPassName() const override { return "Hexagon LoopAlign pass"; }
83+
bool runOnMachineFunction(MachineFunction &MF) override;
84+
};
85+
86+
char HexagonLoopAlign::ID = 0;
87+
88+
/// Decide whether the loop body \p BB is small enough to be worth aligning.
///
/// Walks the instructions of \p BB up to (not including) the first endloop
/// instruction, counting bundle headers and non-debug instructions separately
/// and noting whether any instruction is an HVX vector instruction.
///
/// \param BB          the candidate single-block loop.
/// \param AboveThres  whether the loop's back-edge frequency exceeded the
///                    edge threshold; only consulted for non-tiny-core,
///                    non-HVX loops.
/// \returns true if the instruction count lies in
///          [LoopAlignLimitLB, upper limit] and the bundle count does not
///          exceed the bundle limit.
bool HexagonLoopAlign::shouldBalignLoop(MachineBasicBlock &BB,
                                        bool AboveThres) {
  bool HasHVX = false;
  unsigned NumInsts = 0;
  unsigned NumBundles = 0;

  for (const MachineInstr &MI : BB.instrs()) {
    // Nothing past the endloop instruction is counted.
    if (HII->isEndLoopN(MI.getOpcode()))
      break;

    // A bundle header contributes to the bundle count only.
    if (MI.isBundle()) {
      ++NumBundles;
      continue;
    }

    // Debug instructions are ignored.
    if (MI.isDebugInstr())
      continue;

    HasHVX |= HII->isHVXVec(MI);
    ++NumInsts;
  }

  LLVM_DEBUG({
    dbgs() << "Bundle Count : " << NumBundles << "\n";
    dbgs() << "Instruction Count : " << NumInsts << "\n";
  });

  // Pick the limits; the cases are listed in priority order. An upper limit
  // of zero means that none of the criteria applied.
  unsigned UpperLimit = 0;
  unsigned BundleLimit = LoopBndlAlignLimit;
  if (HST->isTinyCore()) {
    UpperLimit = TinyLoopAlignLimitUB;
    BundleLimit = TinyLoopBndlAlignLimit;
  } else if (HasHVX) {
    UpperLimit = HVXLoopAlignLimitUB;
  } else if (AboveThres) {
    UpperLimit = LoopAlignLimitUB;
  }

  if (UpperLimit == 0)
    return false;
  if (NumInsts < LoopAlignLimitLB || NumInsts > UpperLimit)
    return false;
  return NumBundles <= BundleLimit;
}
139+
140+
/// Returns true when \p MBB branches back to itself and has exactly two
/// successors in total.
bool HexagonLoopAlign::isSingleLoop(MachineBasicBlock &MBB) {
  const bool HasTwoSuccessors = MBB.succ_size() == 2;
  return HasTwoSuccessors && MBB.isSuccessor(&MBB);
}
144+
145+
/// Try to align \p MBB if it is a small single-block loop.
///
/// The back-edge frequency is computed as the block frequency of \p MBB
/// scaled by the probability of the MBB -> MBB edge and compared against
/// LoopEdgeThreshold; the result is passed on to shouldBalignLoop.
///
/// \param MF   the enclosing function (not used by the body).
/// \param MBB  the block to examine.
/// \returns true if the alignment of \p MBB was set to 32 bytes.
bool HexagonLoopAlign::attemptToBalignSmallLoop(MachineFunction &MF,
                                                MachineBasicBlock &MBB) {
  if (!isSingleLoop(MBB))
    return false;

  const auto &BranchProbs = getAnalysis<MachineBranchProbabilityInfo>();
  const auto &BlockFreqs = getAnalysis<MachineBlockFrequencyInfo>();

  // Frequency of the back edge MBB -> MBB.
  const BlockFrequency BackEdgeFreq =
      BlockFreqs.getBlockFreq(&MBB) * BranchProbs.getEdgeProbability(&MBB, &MBB);
  LLVM_DEBUG({
    dbgs() << "Loop Align Pass:\n";
    dbgs() << "\tedge with freq(" << BackEdgeFreq.getFrequency() << ")\n";
  });

  const bool AboveThres = BackEdgeFreq.getFrequency() > LoopEdgeThreshold;
  if (!shouldBalignLoop(MBB, AboveThres))
    return false;

  // Qualifying loop: request 32-byte (1 << 5) alignment for the block.
  MBB.setAlignment(llvm::Align(32));
  return true;
}
172+
173+
// Inspect each basic block, and if its a single BB loop, see if it
174+
// meets the criteria for increasing alignment to 32.
175+
176+
bool HexagonLoopAlign::runOnMachineFunction(MachineFunction &MF) {
177+
178+
HST = &MF.getSubtarget<HexagonSubtarget>();
179+
HII = HST->getInstrInfo();
180+
HTM = &MF.getTarget();
181+
182+
if (skipFunction(MF.getFunction()))
183+
return false;
184+
if (DisableLoopAlign)
185+
return false;
186+
187+
// This optimization is performed at
188+
// i) -O2 and above, and when the loop has a HVX instruction.
189+
// ii) -O3
190+
if (HST->useHVXOps()) {
191+
if (HTM->getOptLevel() < CodeGenOptLevel::Default)
192+
return false;
193+
} else {
194+
if (HTM->getOptLevel() < CodeGenOptLevel::Aggressive)
195+
return false;
196+
}
197+
198+
bool Changed = false;
199+
for (MachineFunction::iterator MBBi = MF.begin(), MBBe = MF.end();
200+
MBBi != MBBe; ++MBBi) {
201+
MachineBasicBlock &MBB = *MBBi;
202+
Changed |= attemptToBalignSmallLoop(MF, MBB);
203+
}
204+
return Changed;
205+
}
206+
207+
} // namespace
208+
209+
// Register the pass under the command-line name "hexagon-loop-align".
INITIALIZE_PASS(HexagonLoopAlign, "hexagon-loop-align",
                "Hexagon LoopAlign pass", false, false)

//===----------------------------------------------------------------------===//
//                         Public Constructor Functions
//===----------------------------------------------------------------------===//

/// Returns a newly allocated instance of the Hexagon loop alignment pass.
FunctionPass *llvm::createHexagonLoopAlign() {
  return new HexagonLoopAlign();
}

llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ namespace llvm {
164164
void initializeHexagonGenMuxPass(PassRegistry&);
165165
void initializeHexagonHardwareLoopsPass(PassRegistry&);
166166
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
167+
void initializeHexagonLoopAlignPass(PassRegistry &);
167168
void initializeHexagonNewValueJumpPass(PassRegistry&);
168169
void initializeHexagonOptAddrModePass(PassRegistry&);
169170
void initializeHexagonPacketizerPass(PassRegistry&);
@@ -194,6 +195,7 @@ namespace llvm {
194195
FunctionPass *createHexagonHardwareLoops();
195196
FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
196197
CodeGenOptLevel OptLevel);
198+
FunctionPass *createHexagonLoopAlign();
197199
FunctionPass *createHexagonLoopRescheduling();
198200
FunctionPass *createHexagonNewValueJump();
199201
FunctionPass *createHexagonOptAddrMode();
@@ -256,8 +258,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
256258
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
257259
getEffectiveCodeModel(CM, CodeModel::Small),
258260
(HexagonNoOpt ? CodeGenOptLevel::None : OL)),
259-
TLOF(std::make_unique<HexagonTargetObjectFile>()) {
261+
TLOF(std::make_unique<HexagonTargetObjectFile>()),
262+
Subtarget(Triple(TT), CPU, FS, *this) {
260263
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
264+
initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
261265
initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
262266
initAsmInfo();
263267
}
@@ -476,6 +480,9 @@ void HexagonPassConfig::addPreEmitPass() {
476480
// Packetization is mandatory: it handles gather/scatter at all opt levels.
477481
addPass(createHexagonPacketizer(NoOpt));
478482

483+
if (!NoOpt)
484+
addPass(createHexagonLoopAlign());
485+
479486
if (EnableVectorPrint)
480487
addPass(createHexagonVectorPrint());
481488

llvm/lib/Target/Hexagon/HexagonTargetMachine.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ namespace llvm {
2323

2424
class HexagonTargetMachine : public LLVMTargetMachine {
2525
std::unique_ptr<TargetLoweringObjectFile> TLOF;
26+
HexagonSubtarget Subtarget;
2627
mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;
2728

2829
public:
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
; RUN: llc -march=hexagon -O3 < %s | FileCheck %s -check-prefix=BALIGN
2+
; BALIGN: .p2align{{.*}}5
3+
4+
; Check that the 'for.body4.for.body4_crit_edge' basic block is emitted with a .p2align 5 (32-byte) alignment directive.
5+
6+
; Test input: a three-level loop nest (Outerloop -> for.body -> for.body4).
; The innermost loop is formed by for.body4 and its back-edge block
; for.body4.for.body4_crit_edge.
define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr {
7+
entry:
8+
%shl = shl i32 %nRow, 2
9+
%cmp36 = icmp sgt i32 %nRow, 0
10+
%0 = add i32 %nCol, -1
11+
%.inv = icmp slt i32 %0, 1
12+
%1 = select i1 %.inv, i32 1, i32 %nCol
13+
br label %Outerloop
14+
15+
; Outermost loop header; the back edge comes from for.end7.
Outerloop: ; preds = %for.end7, %entry
16+
%r12.0 = phi i32 [ 0, %entry ], [ %inc8, %for.end7 ]
17+
%r7_6.0 = phi i64 [ undef, %entry ], [ %r7_6.1.lcssa, %for.end7 ]
18+
%r0i.0 = phi i32 [ undef, %entry ], [ %r0i.1.lcssa, %for.end7 ]
19+
%r5.0 = phi ptr [ %resMat, %entry ], [ %r5.1.lcssa, %for.end7 ]
20+
%r8.0 = phi i32 [ %shl, %entry ], [ %r8.1.lcssa, %for.end7 ]
21+
br i1 %cmp36, label %for.body.lr.ph, label %for.end7
22+
23+
for.body.lr.ph: ; preds = %Outerloop
24+
%cmp332 = icmp eq i32 %r12.0, 0
25+
%exitcond.peel = icmp eq i32 %r12.0, 1
26+
br label %for.body
27+
28+
; Middle loop header; the back edge comes from for.end.
for.body: ; preds = %for.end, %for.body.lr.ph
29+
%r8.141 = phi i32 [ %r8.0, %for.body.lr.ph ], [ %add, %for.end ]
30+
%r5.140 = phi ptr [ %r5.0, %for.body.lr.ph ], [ %add.ptr, %for.end ]
31+
%i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc6, %for.end ]
32+
%r0i.138 = phi i32 [ %r0i.0, %for.body.lr.ph ], [ %4, %for.end ]
33+
%r7_6.137 = phi i64 [ %r7_6.0, %for.body.lr.ph ], [ %r7_6.2.lcssa, %for.end ]
34+
%add = add nsw i32 %r8.141, %shl
35+
br i1 %cmp332, label %for.end, label %for.body4.peel
36+
37+
for.body4.peel: ; preds = %for.body
38+
%r1i.0.in.peel = inttoptr i32 %r8.141 to ptr
39+
%r1i.0.peel = load i32, ptr %r1i.0.in.peel, align 4
40+
%2 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.137, i32 %r1i.0.peel, i32 %r0i.138)
41+
br i1 %exitcond.peel, label %for.end, label %for.body4.preheader.peel.newph
42+
43+
for.body4.preheader.peel.newph: ; preds = %for.body4.peel
44+
%r1i.0.in = inttoptr i32 %add to ptr
45+
%r1i.0 = load i32, ptr %r1i.0.in, align 4
46+
br label %for.body4
47+
48+
; Innermost loop header: a single multiply-accumulate intrinsic call plus the
; exit test.
for.body4: ; preds = %for.body4.for.body4_crit_edge, %for.body4.preheader.peel.newph
49+
%inc.phi = phi i32 [ %inc.0, %for.body4.for.body4_crit_edge ], [ 2, %for.body4.preheader.peel.newph ]
50+
%r7_6.233 = phi i64 [ %3, %for.body4.for.body4_crit_edge ], [ %2, %for.body4.preheader.peel.newph ]
51+
%3 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.233, i32 %r1i.0, i32 %r0i.138)
52+
%exitcond = icmp eq i32 %inc.phi, %r12.0
53+
br i1 %exitcond, label %for.end.loopexit, label %for.body4.for.body4_crit_edge
54+
55+
; Back-edge block of the innermost loop; this is the block named in the test
; description at the top of the file.
for.body4.for.body4_crit_edge: ; preds = %for.body4
56+
%inc.0 = add nuw nsw i32 %inc.phi, 1
57+
br label %for.body4
58+
59+
for.end.loopexit: ; preds = %for.body4
60+
br label %for.end
61+
62+
for.end: ; preds = %for.end.loopexit, %for.body4.peel, %for.body
63+
%r7_6.2.lcssa = phi i64 [ %r7_6.137, %for.body ], [ %2, %for.body4.peel ], [ %3, %for.end.loopexit ]
64+
%4 = tail call i32 @llvm.hexagon.S2.clbp(i64 %r7_6.2.lcssa)
65+
store i32 %4, ptr %r5.140, align 4
66+
%add.ptr = getelementptr inbounds i8, ptr %r5.140, i32 undef
67+
%inc6 = add nuw nsw i32 %i.039, 1
68+
%exitcond47 = icmp eq i32 %inc6, %nRow
69+
br i1 %exitcond47, label %for.end7.loopexit, label %for.body
70+
71+
for.end7.loopexit: ; preds = %for.end
72+
br label %for.end7
73+
74+
for.end7: ; preds = %for.end7.loopexit, %Outerloop
75+
%r7_6.1.lcssa = phi i64 [ %r7_6.0, %Outerloop ], [ %r7_6.2.lcssa, %for.end7.loopexit ]
76+
%r0i.1.lcssa = phi i32 [ %r0i.0, %Outerloop ], [ %4, %for.end7.loopexit ]
77+
%r5.1.lcssa = phi ptr [ %r5.0, %Outerloop ], [ %add.ptr, %for.end7.loopexit ]
78+
%r8.1.lcssa = phi i32 [ %r8.0, %Outerloop ], [ %add, %for.end7.loopexit ]
79+
%inc8 = add nuw i32 %r12.0, 1
80+
%exitcond48 = icmp eq i32 %inc8, %1
81+
br i1 %exitcond48, label %if.end, label %Outerloop
82+
83+
if.end: ; preds = %for.end7
84+
ret void
85+
}
86+
87+
; Function Attrs: nounwind readnone
88+
declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32)
89+
90+
; Function Attrs: nounwind readnone
91+
declare i32 @llvm.hexagon.S2.clbp(i64)

0 commit comments

Comments
 (0)