llvm · topperc · Jan 10, 2025 · Dec 10, 2024 · Dec 18, 2024 · Jan 6, 2025
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -499,6 +499,54 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
   return PushOrLibCallsCSI;
 }
 
+void RISCVFrameLowering::allocateAndProbeStackForRVV(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
+    MachineInstr::MIFlag Flag, bool EmitCFI) const {
+  assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
+
+  // Emit a variable-length allocation probing loop.
+
+  // Get VLEN in TargetReg
+  const RISCVInstrInfo *TII = STI.getInstrInfo();
+  Register TargetReg = RISCV::X6;
+  uint32_t NumOfVReg = Amount / (RISCV::RVVBitsPerBlock / 8);
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoReadVLENB), TargetReg)
+      .setMIFlag(Flag);
+  TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag);
+
+  if (EmitCFI) {
+    // Set the CFA register to TargetReg.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // It will be expanded to a probe loop in `inlineStackProbe`.
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PROBED_STACKALLOC_RVV))
+      .addReg(SPReg)
+      .addReg(TargetReg);
+
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // SUB SP, SP, T1
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::SUB), SPReg)
+      .addReg(SPReg)
+      .addReg(TargetReg)
+      .setMIFlag(Flag);
+}
+
 static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
                                            SmallVectorImpl<char> &Expr,
                                            int FixedOffset, int ScalableOffset,
@@ -857,10 +905,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  uint64_t SecondSPAdjustAmount = 0;
   // Emit the second SP adjustment after saving callee saved registers.
   if (FirstSPAdjustAmount) {
-    uint64_t SecondSPAdjustAmount =
-        getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
+    SecondSPAdjustAmount = getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
     assert(SecondSPAdjustAmount > 0 &&
            "SecondSPAdjustAmount should be greater than zero");
 
@@ -870,11 +918,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (RVVStackSize) {
-    // We must keep the stack pointer aligned through any intermediate
-    // updates.
-    RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                  StackOffset::getScalable(-RVVStackSize),
-                  MachineInstr::FrameSetup, getStackAlign());
+    if (NeedProbe) {
+      allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
+                                  MachineInstr::FrameSetup, !hasFP(MF));
+    } else {
+      // We must keep the stack pointer aligned through any intermediate
+      // updates.
+      RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+                    StackOffset::getScalable(-RVVStackSize),
+                    MachineInstr::FrameSetup, getStackAlign());
+    }
 
     if (!hasFP(MF)) {
       // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
@@ -914,6 +967,19 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
             .addImm(ShiftAmount)
             .setMIFlag(MachineInstr::FrameSetup);
       }
+      if (NeedProbe && RVVStackSize == 0) {
+        // Do a probe if the align + size allocated just passed the probe size
+        // and was not yet probed.
+        if (SecondSPAdjustAmount < ProbeSize &&
+            SecondSPAdjustAmount + MaxAlignment.value() >= ProbeSize) {
+          bool IsRV64 = STI.is64Bit();
+          BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+              .addReg(RISCV::X0)
+              .addReg(SPReg)
+              .addImm(0)
+              .setMIFlags(MachineInstr::FrameSetup);
+        }
+      }
       // FP will be used to restore the frame in the epilogue, so we need
       // another base register BP to record SP after re-alignment. SP will
       // track the current stack after allocating variable sized objects.
@@ -2017,8 +2083,9 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 
 // Synthesize the probe loop.
 static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MBBI,
-                                 DebugLoc DL) {
+                                 MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                                 Register TargetReg, bool IsRVV) {
+  assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
 
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
@@ -2034,7 +2101,6 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
   MF.insert(MBBInsertPoint, ExitMBB);
   MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
-  Register TargetReg = RISCV::X6;
   Register ScratchReg = RISCV::X7;
 
   // ScratchReg = ProbeSize
@@ -2055,12 +2121,29 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
       .addImm(0)
       .setMIFlags(Flags);
 
-  //   BNE SP, TargetReg, LoopTest
-  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
-      .addReg(SPReg)
-      .addReg(TargetReg)
-      .addMBB(LoopTestMBB)
-      .setMIFlags(Flags);
+  if (IsRVV) {
+    //  SUB TargetReg, TargetReg, ProbeSize
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB),
+            TargetReg)
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .setMIFlags(Flags);
+
+    //  BGE TargetReg, ProbeSize, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BGE))
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+
+  } else {
+    //  BNE SP, TargetReg, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
+        .addReg(SPReg)
+        .addReg(TargetReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+  }
 
   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
 
@@ -2073,12 +2156,27 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
-  auto Where = llvm::find_if(MBB, [](MachineInstr &MI) {
-    return MI.getOpcode() == RISCV::PROBED_STACKALLOC;
-  });
-  if (Where != MBB.end()) {
-    DebugLoc DL = MBB.findDebugLoc(Where);
-    emitStackProbeInline(MF, MBB, Where, DL);
-    Where->eraseFromParent();
+  // Get the instructions that need to be replaced. We emit at most two of
+  // these. Remember them in order to avoid complications coming from the need
+  // to traverse the block while potentially creating more blocks.
+  SmallVector<MachineInstr *, 4> ToReplace;
+  for (MachineInstr &MI : MBB) {
+    unsigned Opc = MI.getOpcode();
+    if (Opc == RISCV::PROBED_STACKALLOC ||
+        Opc == RISCV::PROBED_STACKALLOC_RVV) {
+      ToReplace.push_back(&MI);
+    }
+  }
+
+  for (MachineInstr *MI : ToReplace) {
+    if (MI->getOpcode() == RISCV::PROBED_STACKALLOC ||
+        MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV) {
+      MachineBasicBlock::iterator MBBI = MI->getIterator();
+      DebugLoc DL = MBB.findDebugLoc(MBBI);
+      Register TargetReg = MI->getOperand(1).getReg();
+      emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
+                           (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+      MBBI->eraseFromParent();
+    }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -107,6 +107,11 @@ class RISCVFrameLowering : public TargetFrameLowering {
   // Replace a StackProbe stub (if any) with the actual probe code inline
   void inlineStackProbe(MachineFunction &MF,
                         MachineBasicBlock &PrologueMBB) const override;
+  void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL, int64_t Amount,
+                                   MachineInstr::MIFlag Flag,
+                                   bool EmitCFI) const;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1382,6 +1382,10 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR:$sp),
                                (ins GPR:$scratch),
                                []>,
                                Sched<[]>;
+def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
+                               (ins GPR:$scratch),
+                               []>,
+                               Sched<[]>;
 }
 
 /// HI and ADD_LO address nodes.

diff --git a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
@@ -64,3 +64,49 @@ define <vscale x 1 x i64> @access_fixed_and_vector_objects(ptr %val) {
 
   ret <vscale x 1 x i64> %a
 }
+
+define <vscale x 1 x i64> @probe_fixed_and_vector_objects(ptr %val, <vscale x 1 x i64> %dummy) "probe-stack"="inline-asm" {
+; RV64IV-LABEL: probe_fixed_and_vector_objects:
+; RV64IV:       # %bb.0:
+; RV64IV-NEXT:    addi sp, sp, -528
+; RV64IV-NEXT:    .cfi_def_cfa_offset 528
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    .cfi_def_cfa t1, -8
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB2_1
+; RV64IV-NEXT:  # %bb.2:
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb
+; RV64IV-NEXT:    addi a0, sp, 8
+; RV64IV-NEXT:    vl1re64.v v9, (a0)
+; RV64IV-NEXT:    addi a0, sp, 528
+; RV64IV-NEXT:    vl1re64.v v10, (a0)
+; RV64IV-NEXT:    ld a0, 520(sp)
+; RV64IV-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
+; RV64IV-NEXT:    vadd.vv v8, v9, v10
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 528
+; RV64IV-NEXT:    addi sp, sp, 528
+; RV64IV-NEXT:    .cfi_def_cfa_offset 0
+; RV64IV-NEXT:    ret
+  %local = alloca i64
+  %vector = alloca <vscale x 1 x i64>
+  %array = alloca [64 x i64]
+  %v1 = load <vscale x 1 x i64>, ptr %array
+  %v2 = load <vscale x 1 x i64>, ptr %vector
+  %len = load i64, ptr %local
+
+  %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(
+    <vscale x 1 x i64> %dummy,
+    <vscale x 1 x i64> %v1,
+    <vscale x 1 x i64> %v2,
+    i64 %len)
+
+  ret <vscale x 1 x i64> %a
+}