Skip to content

Commit cd46354

Browse files
[WebAssembly] Enable a limited amount of stackification for debug code (#136510)
This change is a step towards fixing one long-standing problem with LLVM's debug WASM codegen: excessive use of locals (roughly speaking, one local for each temporary value in the IR). This has a lot of problems:

1) It makes it easy to hit engine limitations of 50K locals with certain code patterns and large functions.
2) It makes for larger binaries that are slower to load and slower to compile to native code.
3) It makes certain compilation strategies (spill all WASM locals to stack, for example) for debug code excessively expensive and makes debug WASM code either run very slowly or be less debuggable.
4) It slows down LLVM itself.

This change addresses these partially by running a limited version of the stackification pass for unoptimized code, one that gets rid of the most 'obviously' unnecessary locals. Care needs to be taken to not impact LLVM's ability to produce high-quality debug variable locations with this pass. To that end:

1) We only allow stackification when it doesn't require moving any instructions.
2) We disable stackification of any locals that are used in DBG_VALUEs, or as a frame base.

I have verified on a moderately large example that the baseline and the diff produce the same kinds (local/global/stack) of locations, and the only differences are due to the shifting of instruction offsets, with many local.[get|set]s not being present anymore.

Even with this quite conservative approach, the results are pretty good:

1) 30% reduction in raw code size, up to 10x reduction in the number of locals for select large methods (~1000 => ~100).
2) ~10% reduction in instructions retired for an "llc -O0" run on a moderately sized input.
1 parent de569ad commit cd46354

File tree

9 files changed

+221
-143
lines changed

9 files changed

+221
-143
lines changed

llvm/lib/Target/WebAssembly/WebAssembly.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ FunctionPass *createWebAssemblyReplacePhysRegs();
4444
FunctionPass *createWebAssemblyNullifyDebugValueLists();
4545
FunctionPass *createWebAssemblyOptimizeLiveIntervals();
4646
FunctionPass *createWebAssemblyMemIntrinsicResults();
47-
FunctionPass *createWebAssemblyRegStackify();
47+
FunctionPass *createWebAssemblyRegStackify(CodeGenOptLevel OptLevel);
4848
FunctionPass *createWebAssemblyRegColoring();
4949
FunctionPass *createWebAssemblyFixBrTableDefaults();
5050
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();

llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp

Lines changed: 89 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,18 @@ using namespace llvm;
4141

4242
namespace {
4343
class WebAssemblyRegStackify final : public MachineFunctionPass {
44+
bool Optimize;
45+
4446
StringRef getPassName() const override {
4547
return "WebAssembly Register Stackify";
4648
}
4749

4850
void getAnalysisUsage(AnalysisUsage &AU) const override {
4951
AU.setPreservesCFG();
50-
AU.addRequired<MachineDominatorTreeWrapperPass>();
51-
AU.addRequired<LiveIntervalsWrapperPass>();
52+
if (Optimize) {
53+
AU.addRequired<LiveIntervalsWrapperPass>();
54+
AU.addRequired<MachineDominatorTreeWrapperPass>();
55+
}
5256
AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
5357
AU.addPreserved<SlotIndexesWrapperPass>();
5458
AU.addPreserved<LiveIntervalsWrapperPass>();
@@ -61,7 +65,9 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
6165

6266
public:
6367
static char ID; // Pass identification, replacement for typeid
64-
WebAssemblyRegStackify() : MachineFunctionPass(ID) {}
68+
WebAssemblyRegStackify(CodeGenOptLevel OptLevel)
69+
: MachineFunctionPass(ID), Optimize(OptLevel != CodeGenOptLevel::None) {}
70+
WebAssemblyRegStackify() : WebAssemblyRegStackify(CodeGenOptLevel::Default) {}
6571
};
6672
} // end anonymous namespace
6773

@@ -70,8 +76,8 @@ INITIALIZE_PASS(WebAssemblyRegStackify, DEBUG_TYPE,
7076
"Reorder instructions to use the WebAssembly value stack",
7177
false, false)
7278

73-
FunctionPass *llvm::createWebAssemblyRegStackify() {
74-
return new WebAssemblyRegStackify();
79+
FunctionPass *llvm::createWebAssemblyRegStackify(CodeGenOptLevel OptLevel) {
80+
return new WebAssemblyRegStackify(OptLevel);
7581
}
7682

7783
// Decorate the given instruction with implicit operands that enforce the
@@ -96,8 +102,7 @@ static void imposeStackOrdering(MachineInstr *MI) {
96102
static void convertImplicitDefToConstZero(MachineInstr *MI,
97103
MachineRegisterInfo &MRI,
98104
const TargetInstrInfo *TII,
99-
MachineFunction &MF,
100-
LiveIntervals &LIS) {
105+
MachineFunction &MF) {
101106
assert(MI->getOpcode() == TargetOpcode::IMPLICIT_DEF);
102107

103108
const auto *RegClass = MRI.getRegClass(MI->getOperand(0).getReg());
@@ -262,36 +267,53 @@ static bool shouldRematerialize(const MachineInstr &Def,
262267
// LiveIntervals to handle complex cases.
263268
static MachineInstr *getVRegDef(unsigned Reg, const MachineInstr *Insert,
264269
const MachineRegisterInfo &MRI,
265-
const LiveIntervals &LIS) {
270+
const LiveIntervals *LIS) {
266271
// Most registers are in SSA form here so we try a quick MRI query first.
267272
if (MachineInstr *Def = MRI.getUniqueVRegDef(Reg))
268273
return Def;
269274

270275
// MRI doesn't know what the Def is. Try asking LIS.
271-
if (const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore(
272-
LIS.getInstructionIndex(*Insert)))
273-
return LIS.getInstructionFromIndex(ValNo->def);
276+
if (LIS != nullptr) {
277+
SlotIndex InstIndex = LIS->getInstructionIndex(*Insert);
278+
if (const VNInfo *ValNo = LIS->getInterval(Reg).getVNInfoBefore(InstIndex))
279+
return LIS->getInstructionFromIndex(ValNo->def);
280+
}
274281

275282
return nullptr;
276283
}
277284

278285
// Test whether Reg, as defined at Def, has exactly one use. This is a
279286
// generalization of MachineRegisterInfo::hasOneNonDBGUse that uses
280-
// LiveIntervals to handle complex cases.
281-
static bool hasOneNonDBGUse(unsigned Reg, MachineInstr *Def,
282-
MachineRegisterInfo &MRI, MachineDominatorTree &MDT,
283-
LiveIntervals &LIS) {
287+
// LiveIntervals to handle complex cases in optimized code.
288+
static bool hasSingleUse(unsigned Reg, MachineRegisterInfo &MRI,
289+
WebAssemblyFunctionInfo &MFI, bool Optimize,
290+
MachineInstr *Def, LiveIntervals *LIS) {
291+
if (!Optimize) {
292+
// Using "hasOneUse" instead of "hasOneNonDBGUse" here because we don't
293+
// want to stackify DBG_VALUE operands - WASM stack locations are less
294+
// useful and less widely supported than WASM local locations.
295+
if (!MRI.hasOneUse(Reg))
296+
return false;
297+
// The frame base always has an implicit DBG use as DW_AT_frame_base.
298+
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
299+
return false;
300+
return true;
301+
}
302+
284303
// Most registers are in SSA form here so we try a quick MRI query first.
285304
if (MRI.hasOneNonDBGUse(Reg))
286305
return true;
287306

307+
if (LIS == nullptr)
308+
return false;
309+
288310
bool HasOne = false;
289-
const LiveInterval &LI = LIS.getInterval(Reg);
311+
const LiveInterval &LI = LIS->getInterval(Reg);
290312
const VNInfo *DefVNI =
291-
LI.getVNInfoAt(LIS.getInstructionIndex(*Def).getRegSlot());
313+
LI.getVNInfoAt(LIS->getInstructionIndex(*Def).getRegSlot());
292314
assert(DefVNI);
293315
for (auto &I : MRI.use_nodbg_operands(Reg)) {
294-
const auto &Result = LI.Query(LIS.getInstructionIndex(*I.getParent()));
316+
const auto &Result = LI.Query(LIS->getInstructionIndex(*I.getParent()));
295317
if (Result.valueIn() == DefVNI) {
296318
if (!Result.isKill())
297319
return false;
@@ -311,7 +333,7 @@ static bool hasOneNonDBGUse(unsigned Reg, MachineInstr *Def,
311333
static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
312334
const MachineInstr *Insert,
313335
const WebAssemblyFunctionInfo &MFI,
314-
const MachineRegisterInfo &MRI) {
336+
const MachineRegisterInfo &MRI, bool Optimize) {
315337
const MachineInstr *DefI = Def->getParent();
316338
const MachineInstr *UseI = Use->getParent();
317339
assert(DefI->getParent() == Insert->getParent());
@@ -357,6 +379,12 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
357379
if (NextI == Insert)
358380
return true;
359381

382+
// When not optimizing, we only handle the trivial case above
383+
// to guarantee no impact to debugging and to avoid spending
384+
// compile time.
385+
if (!Optimize)
386+
return false;
387+
360388
// 'catch' and 'catch_all' should be the first instruction of a BB and cannot
361389
// move.
362390
if (WebAssembly::isCatch(DefI->getOpcode()))
@@ -520,14 +548,15 @@ static void shrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
520548
/// dependencies; move the def down and nest it with the current instruction.
521549
static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
522550
MachineInstr *Def, MachineBasicBlock &MBB,
523-
MachineInstr *Insert, LiveIntervals &LIS,
551+
MachineInstr *Insert, LiveIntervals *LIS,
524552
WebAssemblyFunctionInfo &MFI,
525553
MachineRegisterInfo &MRI) {
526554
LLVM_DEBUG(dbgs() << "Move for single use: "; Def->dump());
527555

528556
WebAssemblyDebugValueManager DefDIs(Def);
529557
DefDIs.sink(Insert);
530-
LIS.handleMove(*Def);
558+
if (LIS != nullptr)
559+
LIS->handleMove(*Def);
531560

532561
if (MRI.hasOneDef(Reg) && MRI.hasOneNonDBGUse(Reg)) {
533562
// No one else is using this register for anything so we can just stackify
@@ -540,17 +569,18 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
540569
Op.setReg(NewReg);
541570
DefDIs.updateReg(NewReg);
542571

543-
// Tell LiveIntervals about the new register.
544-
LIS.createAndComputeVirtRegInterval(NewReg);
572+
if (LIS != nullptr) {
573+
// Tell LiveIntervals about the new register.
574+
LIS->createAndComputeVirtRegInterval(NewReg);
545575

546-
// Tell LiveIntervals about the changes to the old register.
547-
LiveInterval &LI = LIS.getInterval(Reg);
548-
LI.removeSegment(LIS.getInstructionIndex(*Def).getRegSlot(),
549-
LIS.getInstructionIndex(*Op.getParent()).getRegSlot(),
550-
/*RemoveDeadValNo=*/true);
576+
// Tell LiveIntervals about the changes to the old register.
577+
LiveInterval &LI = LIS->getInterval(Reg);
578+
LI.removeSegment(LIS->getInstructionIndex(*Def).getRegSlot(),
579+
LIS->getInstructionIndex(*Op.getParent()).getRegSlot(),
580+
/*RemoveDeadValNo=*/true);
581+
}
551582

552583
MFI.stackifyVReg(MRI, NewReg);
553-
554584
LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump());
555585
}
556586

@@ -567,11 +597,12 @@ static MachineInstr *getPrevNonDebugInst(MachineInstr *MI) {
567597

568598
/// A trivially cloneable instruction; clone it and nest the new copy with the
569599
/// current instruction.
570-
static MachineInstr *rematerializeCheapDef(
571-
unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB,
572-
MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
573-
WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
574-
const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
600+
static MachineInstr *
601+
rematerializeCheapDef(unsigned Reg, MachineOperand &Op, MachineInstr &Def,
602+
MachineBasicBlock::instr_iterator Insert,
603+
LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
604+
MachineRegisterInfo &MRI,
605+
const WebAssemblyInstrInfo *TII) {
575606
LLVM_DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
576607
LLVM_DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
577608

@@ -811,9 +842,12 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
811842
MachineRegisterInfo &MRI = MF.getRegInfo();
812843
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
813844
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
814-
const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
815-
auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
816-
auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
845+
MachineDominatorTree *MDT = nullptr;
846+
LiveIntervals *LIS = nullptr;
847+
if (Optimize) {
848+
MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
849+
LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
850+
}
817851

818852
// Walk the instructions from the bottom up. Currently we don't look past
819853
// block boundaries, and the blocks aren't ordered so the block visitation
@@ -876,23 +910,28 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
876910
// supports intra-block moves) and it's MachineSink's job to catch all
877911
// the sinking opportunities anyway.
878912
bool SameBlock = DefI->getParent() == &MBB;
879-
bool CanMove = SameBlock && isSafeToMove(Def, &Use, Insert, MFI, MRI) &&
913+
bool CanMove = SameBlock &&
914+
isSafeToMove(Def, &Use, Insert, MFI, MRI, Optimize) &&
880915
!TreeWalker.isOnStack(Reg);
881-
if (CanMove && hasOneNonDBGUse(Reg, DefI, MRI, MDT, LIS)) {
916+
if (CanMove && hasSingleUse(Reg, MRI, MFI, Optimize, DefI, LIS)) {
882917
Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI);
883918

884919
// If we are removing the frame base reg completely, remove the debug
885920
// info as well.
886921
// TODO: Encode this properly as a stackified value.
887-
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
922+
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg) {
923+
assert(
924+
Optimize &&
925+
"Stackifying away frame base in unoptimized code not expected");
888926
MFI.clearFrameBaseVreg();
889-
} else if (shouldRematerialize(*DefI, TII)) {
890-
Insert =
891-
rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(),
892-
LIS, MFI, MRI, TII, TRI);
893-
} else if (CanMove && oneUseDominatesOtherUses(Reg, Use, MBB, MRI, MDT,
894-
LIS, MFI)) {
895-
Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, LIS, MFI,
927+
}
928+
} else if (Optimize && shouldRematerialize(*DefI, TII)) {
929+
Insert = rematerializeCheapDef(Reg, Use, *DefI, Insert->getIterator(),
930+
*LIS, MFI, MRI, TII);
931+
} else if (Optimize && CanMove &&
932+
oneUseDominatesOtherUses(Reg, Use, MBB, MRI, *MDT, *LIS,
933+
MFI)) {
934+
Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, *LIS, MFI,
896935
MRI, TII);
897936
} else {
898937
// We failed to stackify the operand. If the problem was ordering
@@ -915,7 +954,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
915954
Register DefReg = SubsequentDef->getReg();
916955
Register UseReg = SubsequentUse->getReg();
917956
// TODO: This single-use restriction could be relaxed by using tees
918-
if (DefReg != UseReg || !MRI.hasOneNonDBGUse(DefReg))
957+
if (DefReg != UseReg ||
958+
!hasSingleUse(DefReg, MRI, MFI, Optimize, nullptr, nullptr))
919959
break;
920960
MFI.stackifyVReg(MRI, DefReg);
921961
++SubsequentDef;
@@ -926,7 +966,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
926966
// to a constant 0 so that the def is explicit, and the push/pop
927967
// correspondence is maintained.
928968
if (Insert->getOpcode() == TargetOpcode::IMPLICIT_DEF)
929-
convertImplicitDefToConstZero(Insert, MRI, TII, MF, LIS);
969+
convertImplicitDefToConstZero(Insert, MRI, TII, MF);
930970

931971
// We stackified an operand. Add the defining instruction's operands to
932972
// the worklist stack now to continue to build an ever deeper tree.

llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -602,14 +602,16 @@ void WebAssemblyPassConfig::addPreEmitPass() {
602602

603603
// Prepare memory intrinsic calls for register stackifying.
604604
addPass(createWebAssemblyMemIntrinsicResults());
605+
}
605606

606-
// Mark registers as representing wasm's value stack. This is a key
607-
// code-compression technique in WebAssembly. We run this pass (and
608-
// MemIntrinsicResults above) very late, so that it sees as much code as
609-
// possible, including code emitted by PEI and expanded by late tail
610-
// duplication.
611-
addPass(createWebAssemblyRegStackify());
607+
// Mark registers as representing wasm's value stack. This is a key
608+
// code-compression technique in WebAssembly. We run this pass (and
609+
// MemIntrinsicResults above) very late, so that it sees as much code as
610+
// possible, including code emitted by PEI and expanded by late tail
611+
// duplication.
612+
addPass(createWebAssemblyRegStackify(getOptLevel()));
612613

614+
if (getOptLevel() != CodeGenOptLevel::None) {
613615
// Run the register coloring pass to reduce the total number of registers.
614616
// This runs after stackification so that it doesn't consider registers
615617
// that become stackified.

llvm/test/CodeGen/WebAssembly/PR40172.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ target triple = "wasm32-unknown-unknown"
1010

1111
; CHECK: i32.sub $[[BASE:[0-9]+]]=,
1212
; CHECK: local.copy $[[ARG:[0-9]+]]=, $0{{$}}
13-
; CHECK: i32.const $[[A0:[0-9]+]]=, 1{{$}}
14-
; CHECK: i32.and $[[A1:[0-9]+]]=, $[[ARG]], $[[A0]]{{$}}
15-
; CHECK: i32.store8 8($[[BASE]]), $[[A1]]{{$}}
13+
; CHECK: i32.const $push[[A0:[0-9]+]]=, 1{{$}}
14+
; CHECK: i32.and $push[[A1:[0-9]+]]=, $[[ARG]], $pop[[A0]]{{$}}
15+
; CHECK: i32.store8 8($[[BASE]]), $pop[[A1]]{{$}}
1616

1717
define void @test(i8 %byte) {
1818
%t = alloca { i8, i8 }, align 8

llvm/test/CodeGen/WebAssembly/PR41841.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ declare void @foo(i128)
66

77
; CHECK-LABEL: test_zext:
88
; CHECK-NEXT: .functype test_zext (i32) -> (){{$}}
9-
; CHECK-NEXT: i64.extend_i32_u $[[TMP3:[0-9]+]]=, $0{{$}}
10-
; CHECK-NEXT: i64.const $[[TMP4:[0-9]+]]=, 1{{$}}
11-
; CHECK-NEXT: i64.and $[[TMP1:[0-9]+]]=, $[[TMP3]], $[[TMP4]]{{$}}
9+
; CHECK-NEXT: i64.extend_i32_u $push[[TMP3:[0-9]+]]=, $0{{$}}
10+
; CHECK-NEXT: i64.const $push[[TMP4:[0-9]+]]=, 1{{$}}
11+
; CHECK-NEXT: i64.and $[[TMP1:[0-9]+]]=, $pop[[TMP3]], $pop[[TMP4]]{{$}}
1212
; CHECK-NEXT: i64.const $[[TMP2:[0-9]+]]=, 0{{$}}
1313
; CHECK-NEXT: call foo, $[[TMP1]], $[[TMP2]]{{$}}
1414
; CHECK-NEXT: return{{$}}
@@ -23,11 +23,11 @@ next: ; preds = %start
2323

2424
; CHECK-LABEL: test_sext:
2525
; CHECK-NEXT:.functype test_sext (i32) -> (){{$}}
26-
; CHECK-NEXT: i64.extend_i32_u $[[TMP3:[0-9]+]]=, $0{{$}}
27-
; CHECK-NEXT: i64.const $[[TMP4:[0-9]+]]=, 1{{$}}
28-
; CHECK-NEXT: i64.and $[[TMP5:[0-9]+]]=, $[[TMP3]], $[[TMP4]]{{$}}
29-
; CHECK-NEXT: i64.const $[[TMP6:[0-9]+]]=, 0{{$}}
30-
; CHECK-NEXT: i64.sub $[[TMP1:[0-9]+]]=, $[[TMP6]], $[[TMP5]]{{$}}
26+
; CHECK-NEXT: i64.extend_i32_u $push[[TMP3:[0-9]+]]=, $0{{$}}
27+
; CHECK-NEXT: i64.const $push[[TMP4:[0-9]+]]=, 1{{$}}
28+
; CHECK-NEXT: i64.and $[[TMP5:[0-9]+]]=, $pop[[TMP3]], $pop[[TMP4]]{{$}}
29+
; CHECK-NEXT: i64.const $push[[TMP6:[0-9]+]]=, 0{{$}}
30+
; CHECK-NEXT: i64.sub $[[TMP1:[0-9]+]]=, $pop[[TMP6]], $[[TMP5]]{{$}}
3131
; CHECK-NEXT: local.copy $[[TMP2:[0-9]+]]=, $[[TMP1]]{{$}}
3232
; CHECK-NEXT: call foo, $[[TMP1]], $[[TMP2]]{{$}}
3333
; CHECK-NEXT: return{{$}}

0 commit comments

Comments
 (0)