2
2
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
3
3
; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
4
4
5
+ %struct.uint4 = type { i32 , i32 , i32 , i32 }
6
+
7
+ @gi = dso_local addrspace (1 ) externally_initialized global %struct.uint4 { i32 50462976 , i32 117835012 , i32 185207048 , i32 252579084 }, align 16
8
+
9
+ ; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
10
+ ; Regular functions must still make a copy. `cvta.param` does not always work there.
11
+ define dso_local noundef i32 @non_kernel_function (ptr nocapture noundef readonly byval (%struct.uint4 ) align 16 %a , i1 noundef zeroext %b , i32 noundef %c ) local_unnamed_addr #0 {
12
+ ; OPT-LABEL: define dso_local noundef i32 @non_kernel_function(
13
+ ; OPT-SAME: ptr nocapture noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
14
+ ; OPT-NEXT: [[ENTRY:.*:]]
15
+ ; OPT-NEXT: [[A1:%.*]] = alloca [[STRUCT_UINT4]], align 16
16
+ ; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(101)
17
+ ; OPT-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 16 [[A1]], ptr addrspace(101) align 16 [[A2]], i64 16, i1 false)
18
+ ; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
19
+ ; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64
20
+ ; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]]
21
+ ; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1
22
+ ; OPT-NEXT: ret i32 [[TMP0]]
23
+ ; Expected llc output follows: the byval struct is copied from param space into a 16-byte local depot before the select.
24
+ ; PTX-LABEL: non_kernel_function(
25
+ ; PTX: {
26
+ ; PTX-NEXT: .local .align 16 .b8 __local_depot0[16];
27
+ ; PTX-NEXT: .reg .b64 %SP;
28
+ ; PTX-NEXT: .reg .b64 %SPL;
29
+ ; PTX-NEXT: .reg .pred %p<2>;
30
+ ; PTX-NEXT: .reg .b16 %rs<3>;
31
+ ; PTX-NEXT: .reg .b32 %r<11>;
32
+ ; PTX-NEXT: .reg .b64 %rd<10>;
33
+ ; PTX-EMPTY:
34
+ ; PTX-NEXT: // %bb.0: // %entry
35
+ ; PTX-NEXT: mov.u64 %SPL, __local_depot0;
36
+ ; PTX-NEXT: cvta.local.u64 %SP, %SPL;
37
+ ; PTX-NEXT: ld.param.u8 %rs1, [non_kernel_function_param_1];
38
+ ; PTX-NEXT: and.b16 %rs2, %rs1, 1;
39
+ ; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1;
40
+ ; PTX-NEXT: ld.param.s32 %rd1, [non_kernel_function_param_2];
41
+ ; PTX-NEXT: add.u64 %rd2, %SP, 0;
42
+ ; PTX-NEXT: or.b64 %rd3, %rd2, 8;
43
+ ; PTX-NEXT: ld.param.u64 %rd4, [non_kernel_function_param_0+8];
44
+ ; PTX-NEXT: st.u64 [%rd3], %rd4;
45
+ ; PTX-NEXT: ld.param.u64 %rd5, [non_kernel_function_param_0];
46
+ ; PTX-NEXT: st.u64 [%SP+0], %rd5;
47
+ ; PTX-NEXT: mov.u64 %rd6, gi;
48
+ ; PTX-NEXT: cvta.global.u64 %rd7, %rd6;
49
+ ; PTX-NEXT: selp.b64 %rd8, %rd2, %rd7, %p1;
50
+ ; PTX-NEXT: add.s64 %rd9, %rd8, %rd1;
51
+ ; PTX-NEXT: ld.u8 %r1, [%rd9];
52
+ ; PTX-NEXT: ld.u8 %r2, [%rd9+1];
53
+ ; PTX-NEXT: shl.b32 %r3, %r2, 8;
54
+ ; PTX-NEXT: or.b32 %r4, %r3, %r1;
55
+ ; PTX-NEXT: ld.u8 %r5, [%rd9+2];
56
+ ; PTX-NEXT: shl.b32 %r6, %r5, 16;
57
+ ; PTX-NEXT: ld.u8 %r7, [%rd9+3];
58
+ ; PTX-NEXT: shl.b32 %r8, %r7, 24;
59
+ ; PTX-NEXT: or.b32 %r9, %r8, %r6;
60
+ ; PTX-NEXT: or.b32 %r10, %r9, %r4;
61
+ ; PTX-NEXT: st.param.b32 [func_retval0+0], %r10;
62
+ ; PTX-NEXT: ret;
63
+ entry:
64
+ %a. = select i1 %b , ptr %a , ptr addrspacecast (ptr addrspace (1 ) @gi to ptr ), !dbg !17 ; use the byval argument when %b is true, otherwise the generic-address view of global @gi
65
+ %idx.ext = sext i32 %c to i64 , !dbg !18
66
+ %add.ptr = getelementptr inbounds i8 , ptr %a. , i64 %idx.ext , !dbg !18 ; byte offset %c into whichever pointer was selected
67
+ %0 = load i32 , ptr %add.ptr , align 1 , !dbg !19 ; align 1: the expected llc output above assembles this i32 from four byte loads
68
+ ret i32 %0 , !dbg !23
69
+ }
70
+
5
71
define void @grid_const_int (ptr byval (i32 ) align 4 %input1 , i32 %input2 , ptr %out , i32 %n ) {
6
72
; PTX-LABEL: grid_const_int(
7
73
; PTX: {
@@ -17,7 +83,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
17
83
; PTX-NEXT: st.global.u32 [%rd2], %r3;
18
84
; PTX-NEXT: ret;
19
85
; OPT-LABEL: define void @grid_const_int(
20
- ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+ ]] {
86
+ ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
21
87
; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
22
88
; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
23
89
; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
@@ -106,14 +172,14 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
106
172
define void @multiple_grid_const_escape (ptr byval (%struct.s ) align 4 %input , i32 %a , ptr byval (i32 ) align 4 %b ) {
107
173
; PTX-LABEL: multiple_grid_const_escape(
108
174
; PTX: {
109
- ; PTX-NEXT: .local .align 4 .b8 __local_depot3 [4];
175
+ ; PTX-NEXT: .local .align 4 .b8 __local_depot4 [4];
110
176
; PTX-NEXT: .reg .b64 %SP;
111
177
; PTX-NEXT: .reg .b64 %SPL;
112
178
; PTX-NEXT: .reg .b32 %r<4>;
113
179
; PTX-NEXT: .reg .b64 %rd<10>;
114
180
; PTX-EMPTY:
115
181
; PTX-NEXT: // %bb.0:
116
- ; PTX-NEXT: mov.u64 %SPL, __local_depot3 ;
182
+ ; PTX-NEXT: mov.u64 %SPL, __local_depot4 ;
117
183
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
118
184
; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0;
119
185
; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2;
@@ -342,10 +408,10 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
342
408
; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
343
409
; PTX-NEXT: ld.global.u32 %r1, [%rd1];
344
410
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
345
- ; PTX-NEXT: @%p1 bra $L__BB8_2 ;
411
+ ; PTX-NEXT: @%p1 bra $L__BB9_2 ;
346
412
; PTX-NEXT: // %bb.1: // %second
347
413
; PTX-NEXT: add.s64 %rd8, %rd8, 4;
348
- ; PTX-NEXT: $L__BB8_2 : // %merge
414
+ ; PTX-NEXT: $L__BB9_2 : // %merge
349
415
; PTX-NEXT: ld.u32 %r2, [%rd8];
350
416
; PTX-NEXT: st.global.u32 [%rd1], %r2;
351
417
; PTX-NEXT: ret;
@@ -402,13 +468,13 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(
402
468
; PTX-NEXT: cvta.param.u64 %rd11, %rd10;
403
469
; PTX-NEXT: ld.global.u32 %r1, [%rd1];
404
470
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
405
- ; PTX-NEXT: @%p1 bra $L__BB9_2 ;
471
+ ; PTX-NEXT: @%p1 bra $L__BB10_2 ;
406
472
; PTX-NEXT: // %bb.1: // %second
407
473
; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1;
408
474
; PTX-NEXT: mov.u64 %rd9, %rd8;
409
475
; PTX-NEXT: cvta.param.u64 %rd2, %rd9;
410
476
; PTX-NEXT: add.s64 %rd11, %rd2, 4;
411
- ; PTX-NEXT: $L__BB9_2 : // %merge
477
+ ; PTX-NEXT: $L__BB10_2 : // %merge
412
478
; PTX-NEXT: ld.u32 %r2, [%rd11];
413
479
; PTX-NEXT: st.global.u32 [%rd1], %r2;
414
480
; PTX-NEXT: ret;
@@ -567,3 +633,5 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
567
633
568
634
!22 = !{ptr @grid_const_ptrtoint , !"kernel" , i32 1 , !"grid_constant" , !23 }
569
635
!23 = !{i32 1 }
636
+
637
+
0 commit comments