Skip to content

Commit 831592d

Browse files
authored
[NVPTX] Fixup under-aligned dynamic alloca lowering (#139628)
The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that the default stack alignment should be used. Prior to this change, we passed this alignment through unchanged, leading to an error in ptxas. Now, we use the default stack alignment in this case. Also did a little cleanup while I was here.
1 parent 131c8f8 commit 831592d

File tree

4 files changed

+122
-59
lines changed

4 files changed

+122
-59
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2056,18 +2056,28 @@ SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
20562056
return DAG.getMergeValues(Ops, SDLoc());
20572057
}
20582058

2059+
SDLoc DL(Op.getNode());
20592060
SDValue Chain = Op.getOperand(0);
20602061
SDValue Size = Op.getOperand(1);
2061-
uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2062-
SDLoc DL(Op.getNode());
2062+
uint64_t Align = Op.getConstantOperandVal(2);
2063+
2064+
// The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
2065+
// the default stack alignment should be used.
2066+
if (Align == 0)
2067+
Align = DAG.getSubtarget().getFrameLowering()->getStackAlign().value();
20632068

20642069
// The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
2065-
MVT ValueSizeTy = nvTM->is64Bit() ? MVT::i64 : MVT::i32;
2070+
const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
2071+
2072+
SDValue Alloc =
2073+
DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
2074+
{Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
2075+
DAG.getTargetConstant(Align, DL, MVT::i32)});
2076+
2077+
SDValue ASC = DAG.getAddrSpaceCast(
2078+
DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
20662079

2067-
SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size, DL, ValueSizeTy),
2068-
DAG.getTargetConstant(Align, DL, MVT::i32)};
2069-
EVT RetTypes[] = {ValueSizeTy, MVT::Other};
2070-
return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, RetTypes, AllocOps);
2080+
return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
20712081
}
20722082

20732083
SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3102,28 +3102,20 @@ def CALL_PROTOTYPE :
31023102
"$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
31033103

31043104
def SDTDynAllocaOp :
3105-
SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
3105+
SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>;
31063106

31073107
def dyn_alloca :
31083108
SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
31093109
[SDNPHasChain, SDNPSideEffect]>;
31103110

3111-
def DYNAMIC_STACKALLOC32 :
3112-
NVPTXInst<(outs Int32Regs:$ptr),
3113-
(ins Int32Regs:$size, i32imm:$align),
3114-
"alloca.u32 \t$ptr, $size, $align;\n\t"
3115-
"cvta.local.u32 \t$ptr, $ptr;",
3116-
[(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>,
3117-
Requires<[hasPTX<73>, hasSM<52>]>;
3118-
3119-
def DYNAMIC_STACKALLOC64 :
3120-
NVPTXInst<(outs Int64Regs:$ptr),
3121-
(ins Int64Regs:$size, i32imm:$align),
3122-
"alloca.u64 \t$ptr, $size, $align;\n\t"
3123-
"cvta.local.u64 \t$ptr, $ptr;",
3124-
[(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>,
3125-
Requires<[hasPTX<73>, hasSM<52>]>;
3126-
3111+
foreach t = [I32RT, I64RT] in {
3112+
def DYNAMIC_STACKALLOC # t.Size :
3113+
NVPTXInst<(outs t.RC:$ptr),
3114+
(ins t.RC:$size, i32imm:$align),
3115+
"alloca.u" # t.Size # " \t$ptr, $size, $align;",
3116+
[(set t.Ty:$ptr, (dyn_alloca t.Ty:$size, timm:$align))]>,
3117+
Requires<[hasPTX<73>, hasSM<52>]>;
3118+
}
31273119

31283120
//
31293121
// BRX

llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,20 @@ target triple = "nvptx64-nvidia-cuda"
66
define void @foo(i64 %a, ptr %p0, ptr %p1) {
77
; CHECK-LABEL: foo(
88
; CHECK: {
9-
; CHECK-NEXT: .reg .b64 %rd<8>;
9+
; CHECK-NEXT: .reg .b64 %rd<10>;
1010
; CHECK-EMPTY:
1111
; CHECK-NEXT: // %bb.0:
1212
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
1313
; CHECK-NEXT: add.s64 %rd2, %rd1, 7;
1414
; CHECK-NEXT: and.b64 %rd3, %rd2, -8;
1515
; CHECK-NEXT: alloca.u64 %rd4, %rd3, 16;
16-
; CHECK-NEXT: cvta.local.u64 %rd4, %rd4;
17-
; CHECK-NEXT: ld.param.b64 %rd5, [foo_param_1];
18-
; CHECK-NEXT: alloca.u64 %rd6, %rd3, 16;
19-
; CHECK-NEXT: cvta.local.u64 %rd6, %rd6;
20-
; CHECK-NEXT: ld.param.b64 %rd7, [foo_param_2];
21-
; CHECK-NEXT: st.b64 [%rd5], %rd4;
22-
; CHECK-NEXT: st.b64 [%rd7], %rd6;
16+
; CHECK-NEXT: cvta.local.u64 %rd5, %rd4;
17+
; CHECK-NEXT: ld.param.b64 %rd6, [foo_param_1];
18+
; CHECK-NEXT: alloca.u64 %rd7, %rd3, 16;
19+
; CHECK-NEXT: cvta.local.u64 %rd8, %rd7;
20+
; CHECK-NEXT: ld.param.b64 %rd9, [foo_param_2];
21+
; CHECK-NEXT: st.b64 [%rd6], %rd5;
22+
; CHECK-NEXT: st.b64 [%rd9], %rd8;
2323
; CHECK-NEXT: ret;
2424
%b = alloca i8, i64 %a, align 16
2525
%c = alloca i8, i64 %a, align 16
Lines changed: 88 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,103 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
23
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
34

4-
; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
5-
; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
5+
; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-32
6+
; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-64
67
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
78
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
89

910
; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
1011

11-
; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
12-
; CHECK-NOT: __local_depot
13-
14-
; CHECK-32: ld.param.b32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
15-
; CHECK-32-NEXT: add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7;
16-
; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
17-
; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
18-
; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]];
19-
; CHECK-32-NEXT: { // callseq 0, 0
20-
; CHECK-32-NEXT: .param .b32 param0;
21-
; CHECK-32-NEXT: st.param.b32 [param0], %r[[ALLOCA]];
22-
23-
; CHECK-64: ld.param.b64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
24-
; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
25-
; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
26-
; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
27-
; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]];
28-
; CHECK-64-NEXT: { // callseq 0, 0
29-
; CHECK-64-NEXT: .param .b64 param0;
30-
; CHECK-64-NEXT: st.param.b64 [param0], %rd[[ALLOCA]];
31-
32-
; CHECK-NEXT: .param .b32 retval0;
33-
; CHECK-NEXT: call.uni (retval0),
34-
; CHECK-NEXT: bar,
35-
3612
define i32 @test_dynamic_stackalloc(i64 %n) {
13+
; CHECK-32-LABEL: test_dynamic_stackalloc(
14+
; CHECK-32: {
15+
; CHECK-32-NEXT: .reg .b32 %r<8>;
16+
; CHECK-32-EMPTY:
17+
; CHECK-32-NEXT: // %bb.0:
18+
; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_param_0];
19+
; CHECK-32-NEXT: add.s32 %r2, %r1, 7;
20+
; CHECK-32-NEXT: and.b32 %r3, %r2, -8;
21+
; CHECK-32-NEXT: alloca.u32 %r4, %r3, 16;
22+
; CHECK-32-NEXT: cvta.local.u32 %r5, %r4;
23+
; CHECK-32-NEXT: { // callseq 0, 0
24+
; CHECK-32-NEXT: .param .b32 param0;
25+
; CHECK-32-NEXT: st.param.b32 [param0], %r5;
26+
; CHECK-32-NEXT: .param .b32 retval0;
27+
; CHECK-32-NEXT: call.uni (retval0),
28+
; CHECK-32-NEXT: bar,
29+
; CHECK-32-NEXT: (
30+
; CHECK-32-NEXT: param0
31+
; CHECK-32-NEXT: );
32+
; CHECK-32-NEXT: ld.param.b32 %r6, [retval0];
33+
; CHECK-32-NEXT: } // callseq 0
34+
; CHECK-32-NEXT: st.param.b32 [func_retval0], %r6;
35+
; CHECK-32-NEXT: ret;
36+
;
37+
; CHECK-64-LABEL: test_dynamic_stackalloc(
38+
; CHECK-64: {
39+
; CHECK-64-NEXT: .reg .b32 %r<3>;
40+
; CHECK-64-NEXT: .reg .b64 %rd<6>;
41+
; CHECK-64-EMPTY:
42+
; CHECK-64-NEXT: // %bb.0:
43+
; CHECK-64-NEXT: ld.param.b64 %rd1, [test_dynamic_stackalloc_param_0];
44+
; CHECK-64-NEXT: add.s64 %rd2, %rd1, 7;
45+
; CHECK-64-NEXT: and.b64 %rd3, %rd2, -8;
46+
; CHECK-64-NEXT: alloca.u64 %rd4, %rd3, 16;
47+
; CHECK-64-NEXT: cvta.local.u64 %rd5, %rd4;
48+
; CHECK-64-NEXT: { // callseq 0, 0
49+
; CHECK-64-NEXT: .param .b64 param0;
50+
; CHECK-64-NEXT: st.param.b64 [param0], %rd5;
51+
; CHECK-64-NEXT: .param .b32 retval0;
52+
; CHECK-64-NEXT: call.uni (retval0),
53+
; CHECK-64-NEXT: bar,
54+
; CHECK-64-NEXT: (
55+
; CHECK-64-NEXT: param0
56+
; CHECK-64-NEXT: );
57+
; CHECK-64-NEXT: ld.param.b32 %r1, [retval0];
58+
; CHECK-64-NEXT: } // callseq 0
59+
; CHECK-64-NEXT: st.param.b32 [func_retval0], %r1;
60+
; CHECK-64-NEXT: ret;
3761
%alloca = alloca i8, i64 %n, align 16
3862
%call = call i32 @bar(ptr %alloca)
3963
ret i32 %call
4064
}
4165

66+
define float @test_dynamic_stackalloc_unaligned(i64 %0) {
67+
; CHECK-32-LABEL: test_dynamic_stackalloc_unaligned(
68+
; CHECK-32: {
69+
; CHECK-32-NEXT: .reg .b32 %r<6>;
70+
; CHECK-32-NEXT: .reg .b32 %f<2>;
71+
; CHECK-32-EMPTY:
72+
; CHECK-32-NEXT: // %bb.0:
73+
; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_unaligned_param_0];
74+
; CHECK-32-NEXT: shl.b32 %r2, %r1, 2;
75+
; CHECK-32-NEXT: add.s32 %r3, %r2, 7;
76+
; CHECK-32-NEXT: and.b32 %r4, %r3, -8;
77+
; CHECK-32-NEXT: alloca.u32 %r5, %r4, 8;
78+
; CHECK-32-NEXT: ld.local.b32 %f1, [%r5];
79+
; CHECK-32-NEXT: st.param.b32 [func_retval0], %f1;
80+
; CHECK-32-NEXT: ret;
81+
;
82+
; CHECK-64-LABEL: test_dynamic_stackalloc_unaligned(
83+
; CHECK-64: {
84+
; CHECK-64-NEXT: .reg .b32 %f<2>;
85+
; CHECK-64-NEXT: .reg .b64 %rd<6>;
86+
; CHECK-64-EMPTY:
87+
; CHECK-64-NEXT: // %bb.0:
88+
; CHECK-64-NEXT: ld.param.b64 %rd1, [test_dynamic_stackalloc_unaligned_param_0];
89+
; CHECK-64-NEXT: shl.b64 %rd2, %rd1, 2;
90+
; CHECK-64-NEXT: add.s64 %rd3, %rd2, 7;
91+
; CHECK-64-NEXT: and.b64 %rd4, %rd3, -8;
92+
; CHECK-64-NEXT: alloca.u64 %rd5, %rd4, 8;
93+
; CHECK-64-NEXT: ld.local.b32 %f1, [%rd5];
94+
; CHECK-64-NEXT: st.param.b32 [func_retval0], %f1;
95+
; CHECK-64-NEXT: ret;
96+
%4 = alloca float, i64 %0, align 4
97+
%5 = getelementptr float, ptr %4, i64 0
98+
%6 = load float, ptr %5, align 4
99+
ret float %6
100+
}
101+
42102
declare i32 @bar(ptr)
103+

0 commit comments

Comments
 (0)