Skip to content

[NVPTX] support dynamic allocas with PTX alloca instruction #84585

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,12 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
NRI->getFrameRegister(MF))
.addReg(NRI->getFrameLocalRegister(MF));
}
BuildMI(MBB, MBBI, dl,
MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
NRI->getFrameLocalRegister(MF))
.addImm(MF.getFunctionNumber());
if (!MR.use_empty(NRI->getFrameLocalRegister(MF))) {
BuildMI(MBB, MBBI, dl,
MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
NRI->getFrameLocalRegister(MF))
.addImm(MF.getFunctionNumber());
}
}
}

Expand Down
44 changes: 34 additions & 10 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -645,8 +645,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

// Lowering of DYNAMIC_STACKALLOC is unsupported.
// Custom lower to produce an error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

Expand Down Expand Up @@ -937,6 +935,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFE)
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::SETP_F16X2)
MAKE_CASE(NVPTXISD::SETP_BF16X2)
MAKE_CASE(NVPTXISD::Dummy)
Expand Down Expand Up @@ -2211,14 +2210,39 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,

// Custom lowering hook for ISD::DYNAMIC_STACKALLOC (dynamic alloca).
//
// Operands of Op, as read below: 0 = incoming chain, 1 = allocation size,
// 2 = alignment (a ConstantSDNode).
//
// Returns a merged pair {pointer value, chain}. When the subtarget reports
// PTX version < 73 or SM version < 52, an "unsupported" diagnostic is emitted
// and a constant 0 is returned in place of the pointer; otherwise an
// NVPTXISD::DYNAMIC_STACKALLOC node is built.
//
// NOTE(review): this listing appears to interleave removed and added diff
// lines. The statements up to the first unconditional return look like the
// pre-change body (always diagnose); the version-gated code after it looks
// like the replacement. Confirm against the actual file.
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
const Function &Fn = DAG.getMachineFunction().getFunction();

// Report the unsupported construct, then hand back a placeholder value
// (constant 0) paired with the incoming chain so lowering can continue.
DiagnosticInfoUnsupported NoDynamicAlloca(
Fn, "dynamic alloca unsupported by NVPTX backend",
SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(NoDynamicAlloca);
auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
return DAG.getMergeValues(Ops, SDLoc());

// Diagnose when the subtarget is below the PTX/SM versions named in the
// message; the same thresholds gate the instruction definitions via
// Requires<[hasPTX<73>, hasSM<52>]>.
if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
const Function &Fn = DAG.getMachineFunction().getFunction();

DiagnosticInfoUnsupported NoDynamicAlloca(
Fn,
"Support for dynamic alloca introduced in PTX ISA version 7.3 and "
"requires target sm_52.",
SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(NoDynamicAlloca);
// Placeholder result: constant 0 of the expected type plus the incoming
// chain.
auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
Op.getOperand(0)};
return DAG.getMergeValues(Ops, SDLoc());
}

SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
SDLoc DL(Op.getNode());

// The PTX alloca instruction takes a 64-bit size in 64-bit mode (m64) and a
// 32-bit size in 32-bit mode (m32), so zero-extend or truncate to match.
if (nvTM->is64Bit())
Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
else
Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);

// The alignment is passed as an i32 target constant, matching the
// (i32 timm:$align) operand in the selection patterns.
SDValue AllocOps[] = {Chain, Size,
DAG.getTargetConstant(Align, DL, MVT::i32)};
// NOTE(review): the node is created with a single (pointer-typed) result,
// while dyn_alloca is declared with SDNPHasChain, and the chain merged below
// is the incoming Chain rather than an output of this node. Confirm that
// ordering against later chained operations is preserved as intended.
SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);

SDValue MergeOps[] = {Alloca, Chain};
return DAG.getMergeValues(MergeOps, DL);
}

// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ enum NodeType : unsigned {
BFE,
BFI,
PRMT,
DYNAMIC_STACKALLOC,
Dummy,

LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3805,6 +3805,28 @@ def CALL_PROTOTYPE :
NVPTXInst<(outs), (ins ProtoIdent:$ident),
"$ident", [(CallPrototype (i32 texternalsym:$ident))]>;

// Type profile for the dynamic-alloca node: one result and two operands.
// The result (index 0) has the same type as the first operand (index 1, the
// size), and both operands (size, alignment) are integers.
def SDTDynAllocaOp :
SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;

// Selection-DAG node bound to NVPTXISD::DYNAMIC_STACKALLOC, using the
// SDTDynAllocaOp type profile. Declared as chained and side-effecting.
def dyn_alloca :
SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
[SDNPHasChain, SDNPSideEffect]>;

// 32-bit dynamic stack allocation. Selected for dyn_alloca with a 32-bit size
// register and an immediate i32 alignment. Prints an alloca.u32 followed by a
// cvta.local.u32 applied in place to the same $ptr register (presumably
// converting the local-space address to a generic one; see the PTX ISA).
// Only available with PTX >= 7.3 on sm_52 or newer.
def DYNAMIC_STACKALLOC32 :
NVPTXInst<(outs Int32Regs:$ptr),
(ins Int32Regs:$size, i32imm:$align),
"alloca.u32 \t$ptr, $size, $align;\n\t"
"cvta.local.u32 \t$ptr, $ptr;",
[(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
Requires<[hasPTX<73>, hasSM<52>]>;

// 64-bit dynamic stack allocation. Selected for dyn_alloca with a 64-bit size
// register and an immediate i32 alignment. Prints an alloca.u64 followed by a
// cvta.local.u64 applied in place to the same $ptr register (presumably
// converting the local-space address to a generic one; see the PTX ISA).
// Only available with PTX >= 7.3 on sm_52 or newer.
def DYNAMIC_STACKALLOC64 :
NVPTXInst<(outs Int64Regs:$ptr),
(ins Int64Regs:$size, i32imm:$align),
"alloca.u64 \t$ptr, $size, $align;\n\t"
"cvta.local.u64 \t$ptr, $ptr;",
[(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
Requires<[hasPTX<73>, hasSM<52>]>;

include "NVPTXIntrinsics.td"

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Generic/ForceStackAlign.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
; Stack realignment not supported.
; XFAIL: target=sparc{{.*}}

; NVPTX cannot select dynamic_stackalloc
; NVPTX can only select dynamic_stackalloc on sm_52+ and with ptx73+
; XFAIL: target=nvptx{{.*}}

define i32 @f(ptr %p) nounwind {
Expand Down
48 changes: 41 additions & 7 deletions llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,44 @@
; RUN: not llc -march=nvptx < %s 2>&1 | FileCheck %s
; RUN: not llc -march=nvptx64 < %s 2>&1 | FileCheck %s
; RUN: not llc < %s -march=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
; RUN: not llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS

; CHECK: in function test_dynamic_stackalloc{{.*}}: dynamic alloca unsupported by NVPTX backend
; RUN: llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
; RUN: llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
; RUN: %if ptxas %{ llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}

define void @test_dynamic_stackalloc(i64 %n) {
%alloca = alloca i32, i64 %n
store volatile i32 0, ptr %alloca
ret void
; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.

; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
; CHECK-NOT: __local_depot

; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
; CHECK-32-NEXT: mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7;
; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]];
; CHECK-32-NEXT: { // callseq 0, 0
; CHECK-32-NEXT: .reg .b32 temp_param_reg;
; CHECK-32-NEXT: .param .b32 param0;
; CHECK-32-NEXT: st.param.b32 [param0+0], %r[[ALLOCA]];

; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]];
; CHECK-64-NEXT: { // callseq 0, 0
; CHECK-64-NEXT: .reg .b32 temp_param_reg;
; CHECK-64-NEXT: .param .b64 param0;
; CHECK-64-NEXT: st.param.b64 [param0+0], %rd[[ALLOCA]];

; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: call.uni (retval0),
; CHECK-NEXT: bar,

define i32 @test_dynamic_stackalloc(i64 %n) {
%alloca = alloca i8, i64 %n, align 16
%call = call i32 @bar(ptr %alloca)
ret i32 %call
}

declare i32 @bar(ptr)