- ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s
- ; RUN: FileCheck --enable-var-scope --check-prefixes=CHECK,DBG %s < %t
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+ ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX9 %s
+ ; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
+ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX10 %s
+ ; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
; REQUIRES: asserts

; FIXME: Verifier error with xnack enabled.

- ; CHECK-LABEL: {{^}}cluster_load_cluster_store:
- define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
- bb:
+ ; DBG-LABEL: cluster_load_cluster_store:

; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]])
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
- ; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
- ; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
- ; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
- ; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+
+ ; DBG-NOT: Cluster ld/st
+
+ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+ ; GFX9-LABEL: cluster_load_cluster_store:
+ ; GFX9: ; %bb.0: ; %bb
+ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s2
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s3
+ ; GFX9-NEXT: flat_load_dword v2, v[0:1]
+ ; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8
+ ; GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16
+ ; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s1
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+ ; GFX9-NEXT: flat_store_dword v[0:1], v2
+ ; GFX9-NEXT: flat_store_dword v[0:1], v3 offset:8
+ ; GFX9-NEXT: flat_store_dword v[0:1], v4 offset:16
+ ; GFX9-NEXT: flat_store_dword v[0:1], v5 offset:24
+ ; GFX9-NEXT: s_endpgm
+ ;
+ ; GFX10-LABEL: cluster_load_cluster_store:
+ ; GFX10: ; %bb.0: ; %bb
+ ; GFX10-NEXT: s_clause 0x1
+ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+ ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+ ; GFX10-NEXT: s_add_u32 s0, s2, 8
+ ; GFX10-NEXT: s_addc_u32 s1, s3, 0
+ ; GFX10-NEXT: s_add_u32 s6, s2, 16
+ ; GFX10-NEXT: v_mov_b32_e32 v3, s1
+ ; GFX10-NEXT: s_addc_u32 s7, s3, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v0, s2
+ ; GFX10-NEXT: v_mov_b32_e32 v2, s0
+ ; GFX10-NEXT: s_add_u32 s0, s2, 24
+ ; GFX10-NEXT: s_addc_u32 s1, s3, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s3
+ ; GFX10-NEXT: v_mov_b32_e32 v4, s6
+ ; GFX10-NEXT: v_mov_b32_e32 v7, s1
+ ; GFX10-NEXT: v_mov_b32_e32 v5, s7
+ ; GFX10-NEXT: v_mov_b32_e32 v6, s0
+ ; GFX10-NEXT: s_add_u32 s0, s4, 8
+ ; GFX10-NEXT: s_clause 0x3
+ ; GFX10-NEXT: flat_load_dword v8, v[0:1]
+ ; GFX10-NEXT: flat_load_dword v9, v[2:3]
+ ; GFX10-NEXT: flat_load_dword v10, v[4:5]
+ ; GFX10-NEXT: flat_load_dword v11, v[6:7]
+ ; GFX10-NEXT: s_addc_u32 s1, s5, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v0, s4
+ ; GFX10-NEXT: v_mov_b32_e32 v3, s1
+ ; GFX10-NEXT: v_mov_b32_e32 v2, s0
+ ; GFX10-NEXT: s_add_u32 s0, s4, 16
+ ; GFX10-NEXT: s_addc_u32 s1, s5, 0
+ ; GFX10-NEXT: s_add_u32 s2, s4, 24
+ ; GFX10-NEXT: s_addc_u32 s3, s5, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s5
+ ; GFX10-NEXT: v_mov_b32_e32 v5, s1
+ ; GFX10-NEXT: v_mov_b32_e32 v7, s3
+ ; GFX10-NEXT: v_mov_b32_e32 v4, s0
+ ; GFX10-NEXT: v_mov_b32_e32 v6, s2
+ ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+ ; GFX10-NEXT: flat_store_dword v[0:1], v8
+ ; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
+ ; GFX10-NEXT: flat_store_dword v[2:3], v9
+ ; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
+ ; GFX10-NEXT: flat_store_dword v[4:5], v10
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
+ ; GFX10-NEXT: flat_store_dword v[6:7], v11
+ ; GFX10-NEXT: s_endpgm
+ bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
%ld0 = load i32, i32* %la0
%la1 = getelementptr inbounds i32, i32* %lb, i32 2
%la3 = getelementptr inbounds i32, i32* %lb, i32 6
%ld3 = load i32, i32* %la3

- ; DBG-NOT: Cluster ld/st
- ; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
- ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8
- ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
- ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
%sa0 = getelementptr inbounds i32, i32* %sb, i32 0
store i32 %ld0, i32* %sa0
%sa1 = getelementptr inbounds i32, i32* %sb, i32 2
ret void
}

- ; CHECK-LABEL: {{^}}cluster_load_valu_cluster_store:
- define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
- bb:
+ ; DBG-LABEL: cluster_load_valu_cluster_store:
+
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4
; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]])
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
- ; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
- ; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
- ; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
- ; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+
+ ; DBG-NOT: Cluster ld/st
+
+ define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+ ; GFX9-LABEL: cluster_load_valu_cluster_store:
+ ; GFX9: ; %bb.0: ; %bb
+ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s2
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s3
+ ; GFX9-NEXT: flat_load_dword v2, v[0:1]
+ ; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8
+ ; GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16
+ ; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s1
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+ ; GFX9-NEXT: flat_store_dword v[0:1], v2
+ ; GFX9-NEXT: v_add_u32_e32 v2, 1, v3
+ ; GFX9-NEXT: flat_store_dword v[0:1], v4 offset:16
+ ; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:8
+ ; GFX9-NEXT: flat_store_dword v[0:1], v5 offset:24
+ ; GFX9-NEXT: s_endpgm
+ ;
+ ; GFX10-LABEL: cluster_load_valu_cluster_store:
+ ; GFX10: ; %bb.0: ; %bb
+ ; GFX10-NEXT: s_clause 0x1
+ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+ ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+ ; GFX10-NEXT: s_add_u32 s0, s2, 8
+ ; GFX10-NEXT: s_addc_u32 s1, s3, 0
+ ; GFX10-NEXT: s_add_u32 s6, s2, 16
+ ; GFX10-NEXT: v_mov_b32_e32 v3, s1
+ ; GFX10-NEXT: v_mov_b32_e32 v2, s0
+ ; GFX10-NEXT: s_addc_u32 s7, s3, 0
+ ; GFX10-NEXT: s_add_u32 s0, s2, 24
+ ; GFX10-NEXT: v_mov_b32_e32 v0, s2
+ ; GFX10-NEXT: s_addc_u32 s1, s3, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v4, s6
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s3
+ ; GFX10-NEXT: flat_load_dword v6, v[2:3]
+ ; GFX10-NEXT: v_mov_b32_e32 v3, s1
+ ; GFX10-NEXT: v_mov_b32_e32 v5, s7
+ ; GFX10-NEXT: v_mov_b32_e32 v2, s0
+ ; GFX10-NEXT: s_add_u32 s0, s4, 8
+ ; GFX10-NEXT: s_addc_u32 s1, s5, 0
+ ; GFX10-NEXT: s_clause 0x2
+ ; GFX10-NEXT: flat_load_dword v8, v[0:1]
+ ; GFX10-NEXT: flat_load_dword v9, v[4:5]
+ ; GFX10-NEXT: flat_load_dword v10, v[2:3]
+ ; GFX10-NEXT: s_add_u32 s2, s4, 16
+ ; GFX10-NEXT: s_addc_u32 s3, s5, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v3, s1
+ ; GFX10-NEXT: v_mov_b32_e32 v0, s4
+ ; GFX10-NEXT: v_mov_b32_e32 v2, s0
+ ; GFX10-NEXT: s_add_u32 s0, s4, 24
+ ; GFX10-NEXT: v_mov_b32_e32 v5, s3
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s5
+ ; GFX10-NEXT: s_addc_u32 s1, s5, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v4, s2
+ ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+ ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6
+ ; GFX10-NEXT: v_mov_b32_e32 v7, s1
+ ; GFX10-NEXT: v_mov_b32_e32 v6, s0
+ ; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+ ; GFX10-NEXT: flat_store_dword v[0:1], v8
+ ; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+ ; GFX10-NEXT: flat_store_dword v[4:5], v9
+ ; GFX10-NEXT: flat_store_dword v[2:3], v11
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
+ ; GFX10-NEXT: flat_store_dword v[6:7], v10
+ ; GFX10-NEXT: s_endpgm
+ bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
%ld0 = load i32, i32* %la0
%la1 = getelementptr inbounds i32, i32* %lb, i32 2
%la3 = getelementptr inbounds i32, i32* %lb, i32 6
%ld3 = load i32, i32* %la3

- ; DBG-NOT: Cluster ld/st
- ; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
- ; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
- ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
- ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
- ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
%sa0 = getelementptr inbounds i32, i32* %sb, i32 0
store i32 %ld0, i32* %sa0
%sa1 = getelementptr inbounds i32, i32* %sb, i32 2
}

; Cluster loads from the same texture with different coordinates
- ; CHECK-LABEL: {{^}}cluster_image_load:
+ ; DBG-LABEL: cluster_image_load:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: {{^}}Cluster ld/st [[SU1:SU\([0-9]+\)]] - [[SU2:SU\([0-9]+\)]]
; DBG: {{^}}[[SU1]]: {{.*}} IMAGE_LOAD
; DBG: {{^}}[[SU2]]: {{.*}} IMAGE_LOAD
- ; GCN: image_load v
- ; GCN-NEXT: image_load v
define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg %dst, i32 %x, i32 %y) {
+ ; GFX9-LABEL: cluster_image_load:
+ ; GFX9: ; %bb.0: ; %entry
+ ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+ ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1
+ ; GFX9-NEXT: v_add_u32_e32 v6, 2, v0
+ ; GFX9-NEXT: v_add_u32_e32 v7, 2, v1
+ ; GFX9-NEXT: image_load v[2:5], v[2:3], s[0:7] dmask:0xf unorm
+ ; GFX9-NEXT: image_load v[6:9], v[6:7], s[0:7] dmask:0xf unorm
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
+ ; GFX9-NEXT: v_add_f32_e32 v5, v5, v9
+ ; GFX9-NEXT: v_add_f32_e32 v4, v4, v8
+ ; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
+ ; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
+ ; GFX9-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf unorm
+ ; GFX9-NEXT: s_endpgm
+ ;
+ ; GFX10-LABEL: cluster_image_load:
+ ; GFX10: ; %bb.0: ; %entry
+ ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v0
+ ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v1
+ ; GFX10-NEXT: v_add_nc_u32_e32 v12, 2, v0
+ ; GFX10-NEXT: v_add_nc_u32_e32 v13, 2, v1
+ ; GFX10-NEXT: s_clause 0x1
+ ; GFX10-NEXT: image_load v[2:5], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+ ; GFX10-NEXT: image_load v[6:9], v[12:13], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
+ ; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
+ ; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
+ ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
+ ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+ ; GFX10-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+ ; GFX10-NEXT: s_endpgm
entry:
%x1 = add i32 %x, 1
%y1 = add i32 %y, 1
@@ -120,11 +279,34 @@ entry:
}

; Don't cluster loads from different textures
- ; CHECK-LABEL: {{^}}no_cluster_image_load:
+ ; DBG-LABEL: no_cluster_image_load:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG-NOT: {{^}}Cluster ld/st
define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) {
+ ; GFX9-LABEL: no_cluster_image_load:
+ ; GFX9: ; %bb.0: ; %entry
+ ; GFX9-NEXT: image_load v[2:5], v[0:1], s[0:7] dmask:0xf unorm
+ ; GFX9-NEXT: image_load v[6:9], v[0:1], s[8:15] dmask:0xf unorm
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
+ ; GFX9-NEXT: v_add_f32_e32 v5, v5, v9
+ ; GFX9-NEXT: v_add_f32_e32 v4, v4, v8
+ ; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
+ ; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
+ ; GFX9-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf unorm
+ ; GFX9-NEXT: s_endpgm
+ ;
+ ; GFX10-LABEL: no_cluster_image_load:
+ ; GFX10: ; %bb.0: ; %entry
+ ; GFX10-NEXT: image_load v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+ ; GFX10-NEXT: image_load v[6:9], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
+ ; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
+ ; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
+ ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
+ ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+ ; GFX10-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+ ; GFX10-NEXT: s_endpgm
entry:
%val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
%val2 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src2, i32 0, i32 0)
@@ -134,15 +316,59 @@ entry:
}

; Cluster loads from the same texture and sampler with different coordinates
- ; CHECK-LABEL: {{^}}cluster_image_sample:
+ ; DBG-LABEL: cluster_image_sample:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: {{^}}Cluster ld/st [[SU1:SU\([0-9]+\)]] - [[SU2:SU\([0-9]+\)]]
; DBG: {{^}}[[SU1]]: {{.*}} IMAGE_SAMPLE
; DBG: {{^}}[[SU2]]: {{.*}} IMAGE_SAMPLE
- ; GCN: image_sample_d
- ; GCN-NEXT: image_sample_d
define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inreg %smp, <8 x i32> inreg %dst, i32 %x, i32 %y) {
+ ; GFX9-LABEL: cluster_image_sample:
+ ; GFX9: ; %bb.0: ; %entry
+ ; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v0
+ ; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v1
+ ; GFX9-NEXT: v_mov_b32_e32 v4, 0
+ ; GFX9-NEXT: v_mov_b32_e32 v10, 1.0
+ ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v8
+ ; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v9
+ ; GFX9-NEXT: v_mov_b32_e32 v5, v4
+ ; GFX9-NEXT: v_mov_b32_e32 v6, v4
+ ; GFX9-NEXT: v_mov_b32_e32 v7, v4
+ ; GFX9-NEXT: v_add_f32_e32 v8, 2.0, v8
+ ; GFX9-NEXT: v_add_f32_e32 v9, 2.0, v9
+ ; GFX9-NEXT: v_mov_b32_e32 v11, v10
+ ; GFX9-NEXT: v_mov_b32_e32 v12, v10
+ ; GFX9-NEXT: v_mov_b32_e32 v13, v10
+ ; GFX9-NEXT: image_sample_d v[2:5], v[2:9], s[0:7], s[8:11] dmask:0xf
+ ; GFX9-NEXT: image_sample_d v[6:9], v[8:15], s[0:7], s[8:11] dmask:0xf
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
+ ; GFX9-NEXT: v_add_f32_e32 v5, v5, v9
+ ; GFX9-NEXT: v_add_f32_e32 v4, v4, v8
+ ; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
+ ; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
+ ; GFX9-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf unorm
+ ; GFX9-NEXT: s_endpgm
+ ;
+ ; GFX10-LABEL: cluster_image_sample:
+ ; GFX10: ; %bb.0: ; %entry
+ ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v0
+ ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v1
+ ; GFX10-NEXT: v_mov_b32_e32 v13, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v10, 1.0
+ ; GFX10-NEXT: v_add_f32_e32 v11, 1.0, v2
+ ; GFX10-NEXT: v_add_f32_e32 v12, 1.0, v3
+ ; GFX10-NEXT: v_add_f32_e32 v14, 2.0, v2
+ ; GFX10-NEXT: v_add_f32_e32 v15, 2.0, v3
+ ; GFX10-NEXT: s_clause 0x1
+ ; GFX10-NEXT: image_sample_d v[2:5], [v11, v12, v13, v13, v13, v13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+ ; GFX10-NEXT: image_sample_d v[6:9], [v14, v15, v10, v10, v10, v10], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
+ ; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
+ ; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
+ ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
+ ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+ ; GFX10-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+ ; GFX10-NEXT: s_endpgm
entry:
%s = sitofp i32 %x to float
%t = sitofp i32 %y to float