[RISCV] Add stack clash vector support #119458
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Raphael Moreira Zinsly (rzinsly)

Changes

Use the probe loop structure to allocate stack space for vector objects as well. We add the pseudo instruction RISCV::PROBED_STACKALLOC_RVV to differentiate it from the normal loop.

Patch is 27.93 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/119458.diff

5 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 1028149bf513f4..9b149a466658bb 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -499,6 +499,54 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
return PushOrLibCallsCSI;
}
+void RISCVFrameLowering::allocateAndProbeStackForRVV(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
+ MachineInstr::MIFlag Flag, bool EmitCFI) const {
+ assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
+
+ // Emit a variable-length allocation probing loop.
+
+ // Get VLEN in TargetReg
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ Register TargetReg = RISCV::X6;
+ uint32_t NumOfVReg = Amount / 8;
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoReadVLENB), TargetReg)
+ .setMIFlag(Flag);
+ TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag);
+
+ if (EmitCFI) {
+ // Set the CFA register to TargetReg.
+ unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true);
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
+ // It will be expanded to a probe loop in `inlineStackProbe`.
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::PROBED_STACKALLOC_RVV))
+ .addReg(SPReg)
+ .addReg(TargetReg);
+
+ if (EmitCFI) {
+ // Set the CFA register back to SP.
+ unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true);
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
+ // SUB SP, SP, T1
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SUB), SPReg)
+ .addReg(SPReg)
+ .addReg(TargetReg)
+ .setMIFlag(Flag);
+}
+
static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
SmallVectorImpl<char> &Expr,
int FixedOffset, int ScalableOffset,
@@ -857,10 +905,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ uint64_t SecondSPAdjustAmount = 0;
// Emit the second SP adjustment after saving callee saved registers.
if (FirstSPAdjustAmount) {
- uint64_t SecondSPAdjustAmount =
- getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
+ SecondSPAdjustAmount = getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
assert(SecondSPAdjustAmount > 0 &&
"SecondSPAdjustAmount should be greater than zero");
@@ -870,11 +918,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
}
if (RVVStackSize) {
- // We must keep the stack pointer aligned through any intermediate
- // updates.
- RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
- StackOffset::getScalable(-RVVStackSize),
- MachineInstr::FrameSetup, getStackAlign());
+ if (NeedProbe)
+ allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
+ MachineInstr::FrameSetup, !hasFP(MF));
+ else
+ // We must keep the stack pointer aligned through any intermediate
+ // updates.
+ RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+ StackOffset::getScalable(-RVVStackSize),
+ MachineInstr::FrameSetup, getStackAlign());
if (!hasFP(MF)) {
// Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
@@ -914,6 +966,20 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(ShiftAmount)
.setMIFlag(MachineInstr::FrameSetup);
}
+ if (NeedProbe) {
+ // Do a probe if the align + size allocated just passed the probe size
+ // and was not yet probed.
+ uint64_t AllocatedStack = SecondSPAdjustAmount + RVVStackSize;
+ if (AllocatedStack < ProbeSize &&
+ AllocatedStack + MaxAlignment.value() >= ProbeSize) {
+ bool IsRV64 = STI.is64Bit();
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
// FP will be used to restore the frame in the epilogue, so we need
// another base register BP to record SP after re-alignment. SP will
// track the current stack after allocating variable sized objects.
@@ -2016,9 +2082,11 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
}
// Synthesize the probe loop.
-static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) {
+MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
+ bool IsRVV) const {
+ assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
@@ -2034,7 +2102,6 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
MF.insert(MBBInsertPoint, ExitMBB);
MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
- Register TargetReg = RISCV::X6;
Register ScratchReg = RISCV::X7;
// ScratchReg = ProbeSize
@@ -2055,12 +2122,29 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
.addImm(0)
.setMIFlags(Flags);
- // BNE SP, TargetReg, LoopTest
- BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
- .addReg(SPReg)
- .addReg(TargetReg)
- .addMBB(LoopTestMBB)
- .setMIFlags(Flags);
+ if (IsRVV) {
+ // SUB TargetReg, TargetReg, ProbeSize
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB),
+ TargetReg)
+ .addReg(TargetReg)
+ .addReg(ScratchReg)
+ .setMIFlags(Flags);
+
+ // BGE TargetReg, ProbeSize, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BGE))
+ .addReg(TargetReg)
+ .addReg(ScratchReg)
+ .addMBB(LoopTestMBB)
+ .setMIFlags(Flags);
+
+ } else {
+ // BNE SP, TargetReg, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
+ .addReg(SPReg)
+ .addReg(TargetReg)
+ .addMBB(LoopTestMBB)
+ .setMIFlags(Flags);
+ }
ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
@@ -2069,16 +2153,33 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.addSuccessor(LoopTestMBB);
// Update liveins.
fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
+
+ return ExitMBB;
}
void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- auto Where = llvm::find_if(MBB, [](MachineInstr &MI) {
- return MI.getOpcode() == RISCV::PROBED_STACKALLOC;
- });
- if (Where != MBB.end()) {
- DebugLoc DL = MBB.findDebugLoc(Where);
- emitStackProbeInline(MF, MBB, Where, DL);
- Where->eraseFromParent();
+ // Get the instructions that need to be replaced. We emit at most two of
+ // these. Remember them in order to avoid complications coming from the need
+ // to traverse the block while potentially creating more blocks.
+ SmallVector<MachineInstr *, 4> ToReplace;
+ for (MachineInstr &MI : MBB) {
+ int Opc = MI.getOpcode();
+ if (Opc == RISCV::PROBED_STACKALLOC ||
+ Opc == RISCV::PROBED_STACKALLOC_RVV) {
+ ToReplace.push_back(&MI);
+ }
+ }
+
+ for (MachineInstr *MI : ToReplace) {
+ if (MI->getOpcode() == RISCV::PROBED_STACKALLOC ||
+ MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV) {
+ MachineBasicBlock::iterator MBBI = MI->getIterator();
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+ Register TargetReg = MI->getOperand(1).getReg();
+ emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
+ (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+ MBBI->eraseFromParent();
+ }
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 190c063d9d3b5d..1a2c6e0302623d 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,6 +83,12 @@ class RISCVFrameLowering : public TargetFrameLowering {
uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
uint64_t ProbeSize) const;
+ MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, Register TargetReg,
+ bool IsRVV) const;
+
protected:
const RISCVSubtarget &STI;
@@ -107,6 +113,11 @@ class RISCVFrameLowering : public TargetFrameLowering {
// Replace a StackProbe stub (if any) with the actual probe code inline
void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const override;
+ void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Amount,
+ MachineInstr::MIFlag Flag,
+ bool EmitCFI) const;
};
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 14b571cebe1fec..d77e416a970b2f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1382,6 +1382,10 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR:$sp),
(ins GPR:$scratch),
[]>,
Sched<[]>;
+def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
+ (ins GPR:$scratch),
+ []>,
+ Sched<[]>;
}
/// HI and ADD_LO address nodes.
diff --git a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
index c6a3649c9ba8fe..0772181f7488df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
@@ -64,3 +64,49 @@ define <vscale x 1 x i64> @access_fixed_and_vector_objects(ptr %val) {
ret <vscale x 1 x i64> %a
}
+
+define <vscale x 1 x i64> @probe_fixed_and_vector_objects(ptr %val) "probe-stack"="inline-asm" {
+; RV64IV-LABEL: probe_fixed_and_vector_objects:
+; RV64IV: # %bb.0:
+; RV64IV-NEXT: addi sp, sp, -528
+; RV64IV-NEXT: .cfi_def_cfa_offset 528
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: .cfi_def_cfa t1, -8
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB2_1
+; RV64IV-NEXT: # %bb.2:
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb
+; RV64IV-NEXT: addi a0, sp, 8
+; RV64IV-NEXT: vl1re64.v v8, (a0)
+; RV64IV-NEXT: addi a0, sp, 528
+; RV64IV-NEXT: vl1re64.v v9, (a0)
+; RV64IV-NEXT: ld a0, 520(sp)
+; RV64IV-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; RV64IV-NEXT: vadd.vv v8, v8, v9
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 528
+; RV64IV-NEXT: addi sp, sp, 528
+; RV64IV-NEXT: .cfi_def_cfa_offset 0
+; RV64IV-NEXT: ret
+ %local = alloca i64
+ %vector = alloca <vscale x 1 x i64>
+ %array = alloca [64 x i64]
+ %v1 = load <vscale x 1 x i64>, ptr %array
+ %v2 = load <vscale x 1 x i64>, ptr %vector
+ %len = load i64, ptr %local
+
+ %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(
+ <vscale x 1 x i64> undef,
+ <vscale x 1 x i64> %v1,
+ <vscale x 1 x i64> %v2,
+ i64 %len)
+
+ ret <vscale x 1 x i64> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
new file mode 100644
index 00000000000000..d7f9ae73eaea54
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV64IV
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV32IV
+
+; Tests adapted from AArch64.
+
+; Test prolog sequences for stack probing when vector is involved.
+
+; The space for vector objects needs probing in the general case, because
+; the stack adjustment may happen to be too big (i.e. greater than the
+; probe size).
+
+define void @f_vector(ptr %out) #0 {
+; RV64IV-LABEL: f_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 1
+; RV64IV-NEXT: .cfi_def_cfa t1, -16
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB0_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB0_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 1
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f_vector:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 1
+; RV32IV-NEXT: .cfi_def_cfa t1, -16
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB0_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB0_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 1
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 0
+; RV32IV-NEXT: ret
+entry:
+ %vec = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 4 vectors of stack space.
+define void @f4_vector(ptr %out) #0 {
+; RV64IV-LABEL: f4_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 3
+; RV64IV-NEXT: .cfi_def_cfa t1, -64
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB1_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB1_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 3
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f4_vector:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 3
+; RV32IV-NEXT: .cfi_def_cfa t1, -64
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB1_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB1_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 3
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 0
+; RV32IV-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 16 vectors of stack space.
+; The stack adjustment is less than or equal to 16 x 256 = 4096, so
+; we can allocate the locals at once.
+define void @f16_vector(ptr %out) #0 {
+; RV64IV-LABEL: f16_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 5
+; RV64IV-NEXT: .cfi_def_cfa t1, -256
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB2_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB2_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 5
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f16_vector:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 5
+; RV32IV-NEXT: .cfi_def_cfa t1, -256
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB2_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB2_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 5
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 0
+; RV32IV-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ %vec5 = alloca <vscale x 4 x float>, align 16
+ %vec6 = alloca <vscale x 4 x float>, align 16
+ %vec7 = alloca <vscale x 4 x float>, align 16
+ %vec8 = alloca <vscale x 4 x float>, align 16
+ %vec9 = alloca <vscale x 4 x float>, align 16
+ %vec10 = alloca <vscale x 4 x float>, align 16
+ %vec11 = alloca <vscale x 4 x float>, align 16
+ %vec12 = alloca <vscale x 4 x float>, align 16
+ %vec13 = alloca <vscale x 4 x float>, align 16
+ %vec14 = alloca <vscale x 4 x float>, align 16
+ %vec15 = alloca <vscale x 4 x float>, align 16
+ %vec16 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 17 vectors of stack space.
+define void @f17_vector(ptr %out) #0 {
+; RV64IV-LABEL: f17_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: li a0, 34
+; RV6...
[truncated]
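For readers skimming the truncated diff: the probe loop that PROBED_STACKALLOC_RVV expands to looks roughly like the following sketch, distilled from the RV64 f_vector test expectations above (CFI directives omitted; t1 holds the scalable bytes to allocate, t2 the probe size):

```asm
        csrr    t1, vlenb           # t1 = VLENB (bytes per vector register)
        slli    t1, t1, 1           # t1 = total scalable bytes (here 2 * vlenb)
        lui     t2, 1               # t2 = 4096, the probe size
.LBB0_1:                            # probe loop
        sub     sp, sp, t2          # move SP down one probe-size chunk
        sd      zero, 0(sp)         # touch the new page
        sub     t1, t1, t2          # bytes still to allocate
        bge     t1, t2, .LBB0_1     # keep looping while >= one chunk remains
        sub     sp, sp, t1          # allocate the remainder (< probe size)
```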
ping
if (NeedProbe) {
  // Do a probe if the align + size allocated just passed the probe size
  // and was not yet probed.
  uint64_t AllocatedStack = SecondSPAdjustAmount + RVVStackSize;
RVVStackSize is relative to vscale. Does this addition make sense? It undercounts the allocated size if VLEN is greater than 64.
We can't know exactly how much of the stack the RVV objects allocated, but just by adding RVVStackSize we would likely pass the probe size.
In the end I was trying to capture only whether we had a vector allocation, since that would already have probed the stack after the second adjustment.
Your question shows that the code is misleading, so I changed it to test for RVVStackSize instead.
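To make the undercount concrete, here is an illustrative calculation (the function name is mine, not from the patch; the arithmetic mirrors the NumOfVReg/vlenb math in allocateAndProbeStackForRVV above):

```cpp
#include <cstdint>

// RVVStackSize is measured assuming VLEN == 64 (vlenb == 8), so the bytes
// actually subtracted from SP scale up on wider implementations.
uint64_t actualRVVBytes(uint64_t RVVStackSize, uint64_t VLenBits) {
  uint64_t NumOfVReg = RVVStackSize / 8; // as computed in the patch
  uint64_t VLenB = VLenBits / 8;         // the value PseudoReadVLENB yields
  return NumOfVReg * VLenB;              // equals RVVStackSize only at VLEN == 64
}

// E.g. RVVStackSize = 64: 64 bytes at VLEN == 64, but 128 bytes at VLEN == 128,
// so AllocatedStack = SecondSPAdjustAmount + RVVStackSize undercounts.
```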
Force-pushed from b9de4a4 to 6baead9
✅ With the latest revision this PR passed the undef deprecator.
Use the probe loop structure to allocate vector code in the stack as well. We add the pseudo instruction RISCV::PROBED_STACKALLOC_RVV to differentiate from the normal loop.
Force-pushed from 6baead9 to 68e37b2
Please don't force push unless you need to.
static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MBBI,
                                 DebugLoc DL) {
MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
Why did the return type change? The caller doesn't use the returned MachineBasicBlock*
I mixed up the patches again; this will only be needed for the dynamic allocation probing.
Removed.
@topperc ping
if (NeedProbe)
  allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
                              MachineInstr::FrameSetup, !hasFP(MF));
else
Add curly braces around this since the comment makes it multiple lines for the reader. See https://llvm.org/docs/CodingStandards.html#don-t-use-braces-on-simple-single-statement-bodies-of-if-else-loop-statements
You'll also need to add curly braces around the if body for consistency.
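For reference, a minimal sketch of the requested shape, using the hunk quoted above (assumption: braces only, no behavior change):

```cpp
if (NeedProbe) {
  allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
                              MachineInstr::FrameSetup, !hasFP(MF));
} else {
  // We must keep the stack pointer aligned through any intermediate
  // updates.
  RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
                StackOffset::getScalable(-RVVStackSize),
                MachineInstr::FrameSetup, getStackAlign());
}
```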
Done
LGTM
@topperc Thank you! Can you merge this for me?