[RISCV] Support fastcc passing scalable vectors indirectly with no free GPRs. #107623


Merged
topperc merged 3 commits into llvm:main from pr/vector-indirect-fastcc on Sep 7, 2024

Conversation

@topperc (Collaborator) commented Sep 6, 2024

We can still pass indirectly by putting the pointer on the stack. This is what we do in the normal calling convention.

@llvmbot (Member) commented Sep 6, 2024

@llvm/pr-subscribers-backend-risc-v

Author: Craig Topper (topperc)

Changes

We can still pass indirectly by putting the pointer on the stack. This is what we do in the normal calling convention.
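For intuition, here is a minimal sketch of the case being fixed (it mirrors the vector_arg_indirect_stack_no_gpr test added below; the function name is hypothetical). The first two LMUL=8 vector arguments take v8 and v16, so %z is out of vector registers and must be passed indirectly; the twelve i32 arguments occupy every fastcc argument GPR (a0-a7 and t3-t6 in the generated assembly), so the pointer to %z cannot get a register either. Previously CC_RISCV_FastCC returned true here (an assignment failure); with this patch the pointer is passed in an XLen-sized stack slot:

define fastcc <vscale x 16 x i32> @no_free_gpr(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, <vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
  ; %x arrives in v8 and %y in v16; %z arrives via a pointer loaded from 0(sp).
  %s = add <vscale x 16 x i32> %x, %z
  ret <vscale x 16 x i32> %s
}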


Full diff: https://github.com/llvm/llvm-project/pull/107623.diff

2 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVCallingConv.cpp (+27-17)
  • (modified) llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll (+116)
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index c6a66e69401a6f..76fed279482468 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -603,25 +603,35 @@ bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
       if (ValVT.isFixedLengthVector())
         LocVT = TLI.getContainerForFixedLengthVector(LocVT);
       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-    } else {
-      // Try and pass the address via a "fast" GPR.
-      if (MCRegister GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) {
-        LocInfo = CCValAssign::Indirect;
-        LocVT = Subtarget.getXLenVT();
-        State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo));
-      } else if (ValVT.isFixedLengthVector()) {
-        auto StackAlign =
-            MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
-        unsigned StackOffset =
-            State.AllocateStack(ValVT.getStoreSize(), StackAlign);
-        State.addLoc(
-            CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
-      } else {
-        // Can't pass scalable vectors on the stack.
-        return true;
-      }
+      return false;
     }
 
+    // Try and pass the address via a "fast" GPR.
+    if (MCRegister GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) {
+      LocInfo = CCValAssign::Indirect;
+      LocVT = Subtarget.getXLenVT();
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo));
+      return false;
+    }
+
+    // Pass scalable vectors indirectly by storing the pointer on the stack.
+    if (ValVT.isScalableVector()) {
+      LocInfo = CCValAssign::Indirect;
+      LocVT = Subtarget.getXLenVT();
+      unsigned XLen = Subtarget.getXLen();
+      unsigned StackOffset = State.AllocateStack(XLen / 8, Align(XLen / 8));
+      State.addLoc(
+          CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+      return false;
+    }
+
+    // Pass fixed-length vectors on the stack.
+    auto StackAlign =
+        MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
+    unsigned StackOffset =
+        State.AllocateStack(ValVT.getStoreSize(), StackAlign);
+    State.addLoc(
+        CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
     return false;
   }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
index fb84a2528778a3..427ce9d097135b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
@@ -633,3 +633,119 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
   %s = call fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 8)
   ret <vscale x 32 x i32> %s
 }
+
+; Test case where we are out of registers for the vector and all GPRs are used.
+define fastcc <vscale x 16 x i32> @vector_arg_indirect_stack_no_gpr(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, <vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
+; RV32-LABEL: vector_arg_indirect_stack_no_gpr:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a0, 0(sp)
+; RV32-NEXT:    vl8re32.v v16, (a0)
+; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_arg_indirect_stack_no_gpr:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a0, 0(sp)
+; RV64-NEXT:    vl8re32.v v16, (a0)
+; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    ret
+  %s = add <vscale x 16 x i32> %x, %z
+  ret <vscale x 16 x i32> %s
+}
+
+; Calling the function above. Ensure we pass the arguments correctly.
+define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
+; RV32-LABEL: pass_vector_arg_indirect_stack_no_gpr:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    .cfi_def_cfa_offset 80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    mv s1, sp
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    addi a0, s1, 64
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    li a2, 2
+; RV32-NEXT:    li a3, 3
+; RV32-NEXT:    li a4, 4
+; RV32-NEXT:    li a5, 5
+; RV32-NEXT:    li a6, 6
+; RV32-NEXT:    li a7, 7
+; RV32-NEXT:    li t3, 8
+; RV32-NEXT:    li t4, 9
+; RV32-NEXT:    li t5, 10
+; RV32-NEXT:    li t6, 11
+; RV32-NEXT:    sw a0, 0(sp)
+; RV32-NEXT:    li a0, 0
+; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    call vector_arg_indirect_stack_no_gpr
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: pass_vector_arg_indirect_stack_no_gpr:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    addi s0, sp, 96
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    mv s1, sp
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    addi a0, s1, 64
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    li a2, 2
+; RV64-NEXT:    li a3, 3
+; RV64-NEXT:    li a4, 4
+; RV64-NEXT:    li a5, 5
+; RV64-NEXT:    li a6, 6
+; RV64-NEXT:    li a7, 7
+; RV64-NEXT:    li t3, 8
+; RV64-NEXT:    li t4, 9
+; RV64-NEXT:    li t5, 10
+; RV64-NEXT:    li t6, 11
+; RV64-NEXT:    sd a0, 0(sp)
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    vmv.v.i v16, 0
+; RV64-NEXT:    call vector_arg_indirect_stack_no_gpr
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
+; RV64-NEXT:    ret
+  %s = call fastcc <vscale x 16 x i32> @vector_arg_indirect_stack_no_gpr(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer)
+  ret <vscale x 16 x i32> %s
+}
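Reading the CHECK lines against the new lowering: the pointer slot comes from State.AllocateStack(XLen / 8, Align(XLen / 8)), i.e. 4 bytes with 4-byte alignment on RV32 and 8 bytes with 8-byte alignment on RV64, which is why the callee reloads the address with lw a0, 0(sp) versus ld a0, 0(sp). On the caller side, the zeroinitializer argument is materialized in a stack temporary (vs8r.v v8, (a0)) and its address is then stored to the outgoing argument area (sw/sd a0, 0(sp)), the same slot the callee reads.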

github-actions (bot) commented Sep 6, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@michaelmaitland (Contributor) left a comment

LGTM

@topperc merged commit d5ef3f8 into llvm:main on Sep 7, 2024
8 checks passed
@topperc deleted the pr/vector-indirect-fastcc branch on September 7, 2024 at 00:10