4
4
target datalayout = "A5"
5
5
6
6
; addrspace(3) arrays referenced by the kernels in this file:
; @all_lds is 16384 x i32 (65536 bytes), @some_lds is 32 x i32 (128 bytes).
@all_lds = internal unnamed_addr addrspace (3 ) global [16384 x i32 ] undef , align 4
7
+ @some_lds = internal unnamed_addr addrspace (3 ) global [32 x i32 ] undef , align 4
8
+
; addrspace(1) globals whose initializers take the address of each
; addrspace(3) array through a ptrtoint constant expression.
9
+ @initializer_user_some = addrspace (1 ) global i32 ptrtoint ([32 x i32 ] addrspace (3 )* @some_lds to i32 ), align 4
10
+ @initializer_user_all = addrspace (1 ) global i32 ptrtoint ([16384 x i32 ] addrspace (3 )* @all_lds to i32 ), align 4
7
11
8
12
; The alloca in this function cannot be promoted to LDS because of the
9
13
; size of the LDS global used through a constant expression in the function;
10
14
; such uses were previously not detected.
11
- ; IR-LABEL: @constant_expression_uses_lds (
15
+ ; IR-LABEL: @constant_expression_uses_all_lds (
12
16
; IR: alloca
13
17
14
- ; ASM-LABEL: constant_expression_uses_lds :
15
- ; ASM: .group_segment_fixed_size: 65536
16
- define amdgpu_kernel void @constant_expression_uses_lds (i32 addrspace (1 )* nocapture %out , i32 %idx ) #0 {
18
+ ; ASM-LABEL: constant_expression_uses_all_lds :
19
+ ; ASM: .amdhsa_group_segment_fixed_size 65536
20
+ define amdgpu_kernel void @constant_expression_uses_all_lds (i32 addrspace (1 )* nocapture %out , i32 %idx ) #0 {
17
21
entry:
18
22
%stack = alloca [4 x i32 ], align 4 , addrspace (5 )
19
23
%gep0 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 0
@@ -32,4 +36,130 @@ entry:
32
36
ret void
33
37
}
34
38
35
- attributes #0 = { "amdgpu-waves-per-eu" ="1,5" }
39
+ ; Uses an LDS global through a single level of constant expression,
40
+ ; but the global is too small to block promotion of the alloca
41
+
42
+ ; IR-LABEL: @constant_expression_uses_some_lds(
43
+ ; IR-NOT: alloca
44
+
45
+ ; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
46
+ ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
; Kernel under test: fills a 4-element addrspace(5) alloca with constants,
; reloads the element selected by %idx into %out, then volatile-stores the
; address of @some_lds written as a ptrtoint constant expression.
; The checks above expect no alloca to remain and a group segment size of 4224.
; NOTE(review): 4224 is presumably 256 work-items x 16-byte alloca (4096)
; plus the 128 bytes of @some_lds -- confirm against the pass.
47
+ define amdgpu_kernel void @constant_expression_uses_some_lds (i32 addrspace (1 )* nocapture %out , i32 %idx ) #0 {
48
+ entry:
49
+ %stack = alloca [4 x i32 ], align 4 , addrspace (5 )
50
+ %gep0 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 0
51
+ %gep1 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 1
52
+ %gep2 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 2
53
+ %gep3 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 3
54
+ store i32 9 , i32 addrspace (5 )* %gep0
55
+ store i32 10 , i32 addrspace (5 )* %gep1
56
+ store i32 99 , i32 addrspace (5 )* %gep2
57
+ store i32 43 , i32 addrspace (5 )* %gep3
; Dynamic index into the alloca, so the array is addressed with a
; non-constant offset.
58
+ %arrayidx = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 %idx
59
+ %load = load i32 , i32 addrspace (5 )* %arrayidx , align 4
60
+ store i32 %load , i32 addrspace (1 )* %out
; Single-level constant expression use of @some_lds as the stored value.
61
+ store volatile i32 ptrtoint ([32 x i32 ] addrspace (3 )* @some_lds to i32 ), i32 addrspace (1 )* undef
62
+ ret void
63
+ }
64
+
65
; External function taking an i8*; the multi-level kernels in this file call
; it with a pointer built from nested constant expressions.
+ declare void @callee (i8* )
66
+
67
+ ; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
68
+ ; IR: alloca
69
+
70
+ ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
71
+ ; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
; Kernel under test: same alloca fill / dynamic reload pattern as the other
; kernels in this file, followed by a call that passes a pointer derived from
; @all_lds through nested constant expressions
; (getelementptr -> bitcast -> addrspacecast).
; The checks above expect the alloca to remain and a group segment size of
; 65536, which equals the size of @all_lds (16384 x 4 bytes).
72
+ define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level (i32 addrspace (1 )* nocapture %out , i32 %idx ) #0 {
73
+ entry:
74
+ %stack = alloca [4 x i32 ], align 4 , addrspace (5 )
75
+ %gep0 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 0
76
+ %gep1 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 1
77
+ %gep2 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 2
78
+ %gep3 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 3
79
+ store i32 9 , i32 addrspace (5 )* %gep0
80
+ store i32 10 , i32 addrspace (5 )* %gep1
81
+ store i32 99 , i32 addrspace (5 )* %gep2
82
+ store i32 43 , i32 addrspace (5 )* %gep3
83
+ %arrayidx = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 %idx
84
+ %load = load i32 , i32 addrspace (5 )* %arrayidx , align 4
85
+ store i32 %load , i32 addrspace (1 )* %out
; Multi-level constant expression use of @all_lds as the call argument.
86
+ call void @callee (i8* addrspacecast (i8 addrspace (3 )* bitcast (i32 addrspace (3 )* getelementptr inbounds ([16384 x i32 ], [16384 x i32 ] addrspace (3 )* @all_lds , i32 0 , i32 8 ) to i8 addrspace (3 )*) to i8* ))
87
+ ret void
88
+ }
89
+
90
+ ; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
90
+ ; IR-NOT: alloca
91
+ ; IR: llvm.amdgcn.workitem.id
92
+
93
+ ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
94
+ ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
; Kernel under test: same alloca fill / dynamic reload pattern as the other
; kernels in this file, followed by a call that passes a pointer derived from
; @some_lds through nested constant expressions
; (getelementptr -> bitcast -> addrspacecast).
; The checks above expect no alloca to remain, a workitem.id intrinsic to
; appear, and a group segment size of 4224.
; NOTE(review): 4224 is presumably 4096 bytes for the promoted alloca plus
; the 128 bytes of @some_lds -- confirm against the pass.
95
+ define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level (i32 addrspace (1 )* nocapture %out , i32 %idx ) #0 {
96
+ entry:
97
+ %stack = alloca [4 x i32 ], align 4 , addrspace (5 )
98
+ %gep0 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 0
99
+ %gep1 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 1
100
+ %gep2 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 2
101
+ %gep3 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 3
102
+ store i32 9 , i32 addrspace (5 )* %gep0
103
+ store i32 10 , i32 addrspace (5 )* %gep1
104
+ store i32 99 , i32 addrspace (5 )* %gep2
105
+ store i32 43 , i32 addrspace (5 )* %gep3
106
+ %arrayidx = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 %idx
107
+ %load = load i32 , i32 addrspace (5 )* %arrayidx , align 4
108
+ store i32 %load , i32 addrspace (1 )* %out
; Multi-level constant expression use of @some_lds as the call argument.
109
+ call void @callee (i8* addrspacecast (i8 addrspace (3 )* bitcast (i32 addrspace (3 )* getelementptr inbounds ([32 x i32 ], [32 x i32 ] addrspace (3 )* @some_lds , i32 0 , i32 8 ) to i8 addrspace (3 )*) to i8* ))
110
+ ret void
111
+ }
113
+
114
+ ; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
115
+ ; IR-NOT: alloca
116
+ ; IR: llvm.amdgcn.workitem.id
117
+
118
+ ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
119
+ ; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
; Kernel under test: same alloca fill / dynamic reload pattern as the other
; kernels in this file. It never names @some_lds directly; it only takes the
; address of @initializer_user_some, whose initializer refers to @some_lds.
; The checks above expect no alloca to remain and a group segment size of 4096.
; NOTE(review): 4096 is 128 bytes less than the 4224 expected by the kernels
; that reference @some_lds directly, so the emitted size presumably excludes
; @some_lds here -- confirm this is the intended behavior.
120
+ define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer (i32 addrspace (1 )* nocapture %out , i32 %idx ) #0 {
121
+ entry:
122
+ %stack = alloca [4 x i32 ], align 4 , addrspace (5 )
123
+ %gep0 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 0
124
+ %gep1 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 1
125
+ %gep2 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 2
126
+ %gep3 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 3
127
+ store i32 9 , i32 addrspace (5 )* %gep0
128
+ store i32 10 , i32 addrspace (5 )* %gep1
129
+ store i32 99 , i32 addrspace (5 )* %gep2
130
+ store i32 43 , i32 addrspace (5 )* %gep3
131
+ %arrayidx = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 %idx
132
+ %load = load i32 , i32 addrspace (5 )* %arrayidx , align 4
133
+ store i32 %load , i32 addrspace (1 )* %out
134
+
; Indirect use: the stored value is the address of the addrspace(1) global,
; not of the addrspace(3) array itself.
135
+ store volatile i32 ptrtoint (i32 addrspace (1 )* @initializer_user_some to i32 ), i32 addrspace (1 )* undef
136
+ ret void
137
+ }
138
+
139
+ ; We can't actually handle LDS addresses in global initializers,
140
+ ; but this should still count as LDS usage.
141
+
142
+ ; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
143
+ ; IR: alloca
144
+
145
+ ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
146
+ ; ASM: .group_segment_fixed_size: 65536
; Kernel under test: same alloca fill / dynamic reload pattern as the other
; kernels in this file. It never names @all_lds directly; it only takes the
; address of @initializer_user_all, whose initializer refers to @all_lds.
; The checks above expect the alloca to remain.
; NOTE(review): the size check above uses the `.group_segment_fixed_size:`
; spelling with no {{$}} anchor, whereas the other kernels in this file check
; `.amdhsa_group_segment_fixed_size <n>{{$}}`. Confirm this difference is
; intentional and that the line it matches belongs to this kernel.
147
+ define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer (i32 addrspace (1 )* nocapture %out , i32 %idx ) #0 {
148
+ entry:
149
+ %stack = alloca [4 x i32 ], align 4 , addrspace (5 )
150
+ %gep0 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 0
151
+ %gep1 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 1
152
+ %gep2 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 2
153
+ %gep3 = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 3
154
+ store i32 9 , i32 addrspace (5 )* %gep0
155
+ store i32 10 , i32 addrspace (5 )* %gep1
156
+ store i32 99 , i32 addrspace (5 )* %gep2
157
+ store i32 43 , i32 addrspace (5 )* %gep3
158
+ %arrayidx = getelementptr inbounds [4 x i32 ], [4 x i32 ] addrspace (5 )* %stack , i32 0 , i32 %idx
159
+ %load = load i32 , i32 addrspace (5 )* %arrayidx , align 4
160
+ store i32 %load , i32 addrspace (1 )* %out
; Indirect use: the stored value is the address of the addrspace(1) global,
; not of the addrspace(3) array itself.
161
+ store volatile i32 ptrtoint (i32 addrspace (1 )* @initializer_user_all to i32 ), i32 addrspace (1 )* undef
162
+ ret void
163
+ }
164
+
165
; Attribute group #0, applied to every kernel definition visible in this file:
; waves-per-eu is "1,5" and the flat work-group size is pinned to "256,256".
+ attributes #0 = { "amdgpu-waves-per-eu" ="1,5" "amdgpu-flat-work-group-size" ="256,256" }
0 commit comments