1
- ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
2
- ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
3
- ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
4
- ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
1
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2
+ ; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
3
+ ; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
4
+ ; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
5
+ ; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
5
6
6
7
7
8
declare i32 @llvm.nvvm.rotate.b32 (i32 , i32 )
@@ -11,50 +12,338 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
11
12
; SM20: rotate32
12
13
; SM35: rotate32
13
14
; rotate32: returns the result of @llvm.nvvm.rotate.b32 applied to %a and %b.
; The sm_20 checks expect an expanded shl.b32 / sub.s32 (32 - amount) /
; shr.b32 / add.u32 sequence; the sm_35 checks expect a single shf.l.wrap.b32
; with %a supplied as both data operands.
define i32 @rotate32 (i32 %a , i32 %b ) {
14
- ; SM20: shl.b32
15
- ; SM20: sub.s32
16
- ; SM20: shr.b32
17
- ; SM20: add.u32
18
- ; SM35: shf.l.wrap.b32
15
+ ; SM20-LABEL: rotate32(
16
+ ; SM20: {
17
+ ; SM20-NEXT: .reg .b32 %r<4>;
18
+ ; SM20-EMPTY:
19
+ ; SM20-NEXT: // %bb.0:
20
+ ; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
21
+ ; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
22
+ ; SM20-NEXT: {
23
+ ; SM20-NEXT: .reg .b32 %lhs;
24
+ ; SM20-NEXT: .reg .b32 %rhs;
25
+ ; SM20-NEXT: .reg .b32 %amt2;
26
+ ; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
27
+ ; SM20-NEXT: sub.s32 %amt2, 32, %r2;
28
+ ; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
29
+ ; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
30
+ ; SM20-NEXT: }
31
+ ; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
32
+ ; SM20-NEXT: ret;
33
+ ;
34
+ ; SM35-LABEL: rotate32(
35
+ ; SM35: {
36
+ ; SM35-NEXT: .reg .b32 %r<4>;
37
+ ; SM35-EMPTY:
38
+ ; SM35-NEXT: // %bb.0:
39
+ ; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0];
40
+ ; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1];
41
+ ; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
42
+ ; SM35-NEXT: st.param.b32 [func_retval0+0], %r3;
43
+ ; SM35-NEXT: ret;
19
44
%val = tail call i32 @llvm.nvvm.rotate.b32 (i32 %a , i32 %b )
20
45
ret i32 %val
21
46
}
22
47
23
48
; SM20: rotate64
24
49
; SM35: rotate64
25
50
; rotate64: returns the result of @llvm.nvvm.rotate.b64 applied to the i64 %a
; with the i32 amount %b.
; The sm_20 checks expect the amount to be masked with 63, followed by
; shl.b64 / sub.u32 (64 - amount) / shr.b64 / add.u64.
; The sm_35 checks expect %a to be unpacked into two 32-bit registers with
; mov.b64, two shf.l.wrap.b32 instructions, and a mov.b64 that repacks the
; two results into the 64-bit return value.
define i64 @rotate64 (i64 %a , i32 %b ) {
26
- ; SM20: shl.b64
27
- ; SM20: sub.u32
28
- ; SM20: shr.b64
29
- ; SM20: add.u64
30
- ; SM35: shf.l.wrap.b32
31
- ; SM35: shf.l.wrap.b32
51
+ ; SM20-LABEL: rotate64(
52
+ ; SM20: {
53
+ ; SM20-NEXT: .reg .b32 %r<2>;
54
+ ; SM20-NEXT: .reg .b64 %rd<3>;
55
+ ; SM20-EMPTY:
56
+ ; SM20-NEXT: // %bb.0:
57
+ ; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
58
+ ; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
59
+ ; SM20-NEXT: {
60
+ ; SM20-NEXT: .reg .b64 %lhs;
61
+ ; SM20-NEXT: .reg .b64 %rhs;
62
+ ; SM20-NEXT: .reg .u32 %amt2;
63
+ ; SM20-NEXT: and.b32 %amt2, %r1, 63;
64
+ ; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
65
+ ; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
66
+ ; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
67
+ ; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
68
+ ; SM20-NEXT: }
69
+ ; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
70
+ ; SM20-NEXT: ret;
71
+ ;
72
+ ; SM35-LABEL: rotate64(
73
+ ; SM35: {
74
+ ; SM35-NEXT: .reg .b32 %r<6>;
75
+ ; SM35-NEXT: .reg .b64 %rd<3>;
76
+ ; SM35-EMPTY:
77
+ ; SM35-NEXT: // %bb.0:
78
+ ; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
79
+ ; SM35-NEXT: {
80
+ ; SM35-NEXT: .reg .b32 %dummy;
81
+ ; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
82
+ ; SM35-NEXT: }
83
+ ; SM35-NEXT: {
84
+ ; SM35-NEXT: .reg .b32 %dummy;
85
+ ; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
86
+ ; SM35-NEXT: }
87
+ ; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1];
88
+ ; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
89
+ ; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3;
90
+ ; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
91
+ ; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
92
+ ; SM35-NEXT: ret;
32
93
%val = tail call i64 @llvm.nvvm.rotate.b64 (i64 %a , i32 %b )
33
94
ret i64 %val
34
95
}
35
96
36
97
; SM20: rotateright64
37
98
; SM35: rotateright64
38
99
; rotateright64: returns the result of @llvm.nvvm.rotate.right.b64 applied to
; the i64 %a with the i32 amount %b.
; The sm_20 checks expect the amount to be masked with 63, followed by
; shr.b64 / sub.u32 (64 - amount) / shl.b64 / add.u64.
; The sm_35 checks expect %a to be unpacked into two 32-bit registers with
; mov.b64, two shf.r.wrap.b32 instructions, and a mov.b64 that repacks the
; two results into the 64-bit return value.
define i64 @rotateright64 (i64 %a , i32 %b ) {
39
- ; SM20: shr.b64
40
- ; SM20: sub.u32
41
- ; SM20: shl.b64
42
- ; SM20: add.u64
43
- ; SM35: shf.r.wrap.b32
44
- ; SM35: shf.r.wrap.b32
100
+ ; SM20-LABEL: rotateright64(
101
+ ; SM20: {
102
+ ; SM20-NEXT: .reg .b32 %r<2>;
103
+ ; SM20-NEXT: .reg .b64 %rd<3>;
104
+ ; SM20-EMPTY:
105
+ ; SM20-NEXT: // %bb.0:
106
+ ; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
107
+ ; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
108
+ ; SM20-NEXT: {
109
+ ; SM20-NEXT: .reg .b64 %lhs;
110
+ ; SM20-NEXT: .reg .b64 %rhs;
111
+ ; SM20-NEXT: .reg .u32 %amt2;
112
+ ; SM20-NEXT: and.b32 %amt2, %r1, 63;
113
+ ; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
114
+ ; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
115
+ ; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
116
+ ; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
117
+ ; SM20-NEXT: }
118
+ ; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
119
+ ; SM20-NEXT: ret;
120
+ ;
121
+ ; SM35-LABEL: rotateright64(
122
+ ; SM35: {
123
+ ; SM35-NEXT: .reg .b32 %r<6>;
124
+ ; SM35-NEXT: .reg .b64 %rd<3>;
125
+ ; SM35-EMPTY:
126
+ ; SM35-NEXT: // %bb.0:
127
+ ; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
128
+ ; SM35-NEXT: {
129
+ ; SM35-NEXT: .reg .b32 %dummy;
130
+ ; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1;
131
+ ; SM35-NEXT: }
132
+ ; SM35-NEXT: {
133
+ ; SM35-NEXT: .reg .b32 %dummy;
134
+ ; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1;
135
+ ; SM35-NEXT: }
136
+ ; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1];
137
+ ; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
138
+ ; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3;
139
+ ; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
140
+ ; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
141
+ ; SM35-NEXT: ret;
45
142
%val = tail call i64 @llvm.nvvm.rotate.right.b64 (i64 %a , i32 %b )
46
143
ret i64 %val
47
144
}
48
145
49
146
; SM20: rotl0
50
147
; SM35: rotl0
51
148
; rotl0: builds the result from plain IR instead of an intrinsic:
; (%x shl 8) or (%x lshr 24), where 8 + 24 equals the 32-bit width.
; The sm_20 checks expect shl.b32 by 8, shr.b32 by 24 and add.u32; the sm_35
; checks expect the pattern to be selected as one shf.l.wrap.b32 with the
; immediate amount 8.
define i32 @rotl0 (i32 %x ) {
52
- ; SM20: shl.b32
53
- ; SM20: shr.b32
54
- ; SM20: add.u32
55
- ; SM35: shf.l.wrap.b32
149
+ ; SM20-LABEL: rotl0(
150
+ ; SM20: {
151
+ ; SM20-NEXT: .reg .b32 %r<3>;
152
+ ; SM20-EMPTY:
153
+ ; SM20-NEXT: // %bb.0:
154
+ ; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0];
155
+ ; SM20-NEXT: {
156
+ ; SM20-NEXT: .reg .b32 %lhs;
157
+ ; SM20-NEXT: .reg .b32 %rhs;
158
+ ; SM20-NEXT: shl.b32 %lhs, %r1, 8;
159
+ ; SM20-NEXT: shr.b32 %rhs, %r1, 24;
160
+ ; SM20-NEXT: add.u32 %r2, %lhs, %rhs;
161
+ ; SM20-NEXT: }
162
+ ; SM20-NEXT: st.param.b32 [func_retval0+0], %r2;
163
+ ; SM20-NEXT: ret;
164
+ ;
165
+ ; SM35-LABEL: rotl0(
166
+ ; SM35: {
167
+ ; SM35-NEXT: .reg .b32 %r<3>;
168
+ ; SM35-EMPTY:
169
+ ; SM35-NEXT: // %bb.0:
170
+ ; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0];
171
+ ; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8;
172
+ ; SM35-NEXT: st.param.b32 [func_retval0+0], %r2;
173
+ ; SM35-NEXT: ret;
56
174
%t0 = shl i32 %x , 8
57
175
%t1 = lshr i32 %x , 24
58
176
%t2 = or i32 %t0 , %t1
59
177
ret i32 %t2
60
178
}
179
+
180
+ declare i64 @llvm.fshl.i64 (i64 , i64 , i64 )
181
+ declare i64 @llvm.fshr.i64 (i64 , i64 , i64 )
182
+
183
+ ; SM35: rotl64
184
; rotl64: returns @llvm.fshl.i64 with %a passed as both data operands and the
; i64 amount %n.
; Both the sm_20 and the sm_35 checks expect the same expansion: the amount is
; loaded with ld.param.u32, masked with 63, then shl.b64 / sub.u32
; (64 - amount) / shr.b64 / add.u64. Unlike rotate64 above, the sm_35 checks
; contain no shf.l.wrap.b32.
+ define i64 @rotl64 (i64 %a , i64 %n ) {
185
+ ; SM20-LABEL: rotl64(
186
+ ; SM20: {
187
+ ; SM20-NEXT: .reg .b32 %r<2>;
188
+ ; SM20-NEXT: .reg .b64 %rd<3>;
189
+ ; SM20-EMPTY:
190
+ ; SM20-NEXT: // %bb.0:
191
+ ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
192
+ ; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1];
193
+ ; SM20-NEXT: {
194
+ ; SM20-NEXT: .reg .b64 %lhs;
195
+ ; SM20-NEXT: .reg .b64 %rhs;
196
+ ; SM20-NEXT: .reg .u32 %amt2;
197
+ ; SM20-NEXT: and.b32 %amt2, %r1, 63;
198
+ ; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
199
+ ; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
200
+ ; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
201
+ ; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
202
+ ; SM20-NEXT: }
203
+ ; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
204
+ ; SM20-NEXT: ret;
205
+ ;
206
+ ; SM35-LABEL: rotl64(
207
+ ; SM35: {
208
+ ; SM35-NEXT: .reg .b32 %r<2>;
209
+ ; SM35-NEXT: .reg .b64 %rd<3>;
210
+ ; SM35-EMPTY:
211
+ ; SM35-NEXT: // %bb.0:
212
+ ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
213
+ ; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1];
214
+ ; SM35-NEXT: {
215
+ ; SM35-NEXT: .reg .b64 %lhs;
216
+ ; SM35-NEXT: .reg .b64 %rhs;
217
+ ; SM35-NEXT: .reg .u32 %amt2;
218
+ ; SM35-NEXT: and.b32 %amt2, %r1, 63;
219
+ ; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2;
220
+ ; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
221
+ ; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2;
222
+ ; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
223
+ ; SM35-NEXT: }
224
+ ; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
225
+ ; SM35-NEXT: ret;
226
+ %val = tail call i64 @llvm.fshl.i64 (i64 %a , i64 %a , i64 %n )
227
+ ret i64 %val
228
+ }
229
+
230
+ ; SM35: rotl64_imm
231
; rotl64_imm: returns @llvm.fshl.i64 with %a passed as both data operands and
; the constant amount 66, which is larger than the 64-bit width.
; Both the sm_20 and the sm_35 checks expect the constant to be folded into
; immediate shifts: shl.b64 by 2, shr.b64 by 62, then add.u64.
+ define i64 @rotl64_imm (i64 %a ) {
232
+ ; SM20-LABEL: rotl64_imm(
233
+ ; SM20: {
234
+ ; SM20-NEXT: .reg .b64 %rd<3>;
235
+ ; SM20-EMPTY:
236
+ ; SM20-NEXT: // %bb.0:
237
+ ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
238
+ ; SM20-NEXT: {
239
+ ; SM20-NEXT: .reg .b64 %lhs;
240
+ ; SM20-NEXT: .reg .b64 %rhs;
241
+ ; SM20-NEXT: shl.b64 %lhs, %rd1, 2;
242
+ ; SM20-NEXT: shr.b64 %rhs, %rd1, 62;
243
+ ; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
244
+ ; SM20-NEXT: }
245
+ ; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
246
+ ; SM20-NEXT: ret;
247
+ ;
248
+ ; SM35-LABEL: rotl64_imm(
249
+ ; SM35: {
250
+ ; SM35-NEXT: .reg .b64 %rd<3>;
251
+ ; SM35-EMPTY:
252
+ ; SM35-NEXT: // %bb.0:
253
+ ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
254
+ ; SM35-NEXT: {
255
+ ; SM35-NEXT: .reg .b64 %lhs;
256
+ ; SM35-NEXT: .reg .b64 %rhs;
257
+ ; SM35-NEXT: shl.b64 %lhs, %rd1, 2;
258
+ ; SM35-NEXT: shr.b64 %rhs, %rd1, 62;
259
+ ; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
260
+ ; SM35-NEXT: }
261
+ ; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
262
+ ; SM35-NEXT: ret;
263
+ %val = tail call i64 @llvm.fshl.i64 (i64 %a , i64 %a , i64 66 )
264
+ ret i64 %val
265
+ }
266
+
267
+ ; SM35: rotr64
268
; rotr64: returns @llvm.fshr.i64 with %a passed as both data operands and the
; i64 amount %n.
; Both the sm_20 and the sm_35 checks expect the same expansion: the amount is
; loaded with ld.param.u32, masked with 63, then shr.b64 / sub.u32
; (64 - amount) / shl.b64 / add.u64. Unlike rotateright64 above, the sm_35
; checks contain no shf.r.wrap.b32.
+ define i64 @rotr64 (i64 %a , i64 %n ) {
269
+ ; SM20-LABEL: rotr64(
270
+ ; SM20: {
271
+ ; SM20-NEXT: .reg .b32 %r<2>;
272
+ ; SM20-NEXT: .reg .b64 %rd<3>;
273
+ ; SM20-EMPTY:
274
+ ; SM20-NEXT: // %bb.0:
275
+ ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
276
+ ; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1];
277
+ ; SM20-NEXT: {
278
+ ; SM20-NEXT: .reg .b64 %lhs;
279
+ ; SM20-NEXT: .reg .b64 %rhs;
280
+ ; SM20-NEXT: .reg .u32 %amt2;
281
+ ; SM20-NEXT: and.b32 %amt2, %r1, 63;
282
+ ; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
283
+ ; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
284
+ ; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
285
+ ; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
286
+ ; SM20-NEXT: }
287
+ ; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
288
+ ; SM20-NEXT: ret;
289
+ ;
290
+ ; SM35-LABEL: rotr64(
291
+ ; SM35: {
292
+ ; SM35-NEXT: .reg .b32 %r<2>;
293
+ ; SM35-NEXT: .reg .b64 %rd<3>;
294
+ ; SM35-EMPTY:
295
+ ; SM35-NEXT: // %bb.0:
296
+ ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
297
+ ; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1];
298
+ ; SM35-NEXT: {
299
+ ; SM35-NEXT: .reg .b64 %lhs;
300
+ ; SM35-NEXT: .reg .b64 %rhs;
301
+ ; SM35-NEXT: .reg .u32 %amt2;
302
+ ; SM35-NEXT: and.b32 %amt2, %r1, 63;
303
+ ; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2;
304
+ ; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
305
+ ; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2;
306
+ ; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
307
+ ; SM35-NEXT: }
308
+ ; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
309
+ ; SM35-NEXT: ret;
310
+ %val = tail call i64 @llvm.fshr.i64 (i64 %a , i64 %a , i64 %n )
311
+ ret i64 %val
312
+ }
313
+
314
+ ; SM35: rotr64_imm
315
; rotr64_imm: returns @llvm.fshr.i64 with %a passed as both data operands and
; the constant amount 66, which is larger than the 64-bit width.
; Both the sm_20 and the sm_35 checks expect the constant to be folded into
; immediate shifts: shl.b64 by 62, shr.b64 by 2, then add.u64.
+ define i64 @rotr64_imm (i64 %a ) {
316
+ ; SM20-LABEL: rotr64_imm(
317
+ ; SM20: {
318
+ ; SM20-NEXT: .reg .b64 %rd<3>;
319
+ ; SM20-EMPTY:
320
+ ; SM20-NEXT: // %bb.0:
321
+ ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
322
+ ; SM20-NEXT: {
323
+ ; SM20-NEXT: .reg .b64 %lhs;
324
+ ; SM20-NEXT: .reg .b64 %rhs;
325
+ ; SM20-NEXT: shl.b64 %lhs, %rd1, 62;
326
+ ; SM20-NEXT: shr.b64 %rhs, %rd1, 2;
327
+ ; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
328
+ ; SM20-NEXT: }
329
+ ; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
330
+ ; SM20-NEXT: ret;
331
+ ;
332
+ ; SM35-LABEL: rotr64_imm(
333
+ ; SM35: {
334
+ ; SM35-NEXT: .reg .b64 %rd<3>;
335
+ ; SM35-EMPTY:
336
+ ; SM35-NEXT: // %bb.0:
337
+ ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
338
+ ; SM35-NEXT: {
339
+ ; SM35-NEXT: .reg .b64 %lhs;
340
+ ; SM35-NEXT: .reg .b64 %rhs;
341
+ ; SM35-NEXT: shl.b64 %lhs, %rd1, 62;
342
+ ; SM35-NEXT: shr.b64 %rhs, %rd1, 2;
343
+ ; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
344
+ ; SM35-NEXT: }
345
+ ; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
346
+ ; SM35-NEXT: ret;
347
+ %val = tail call i64 @llvm.fshr.i64 (i64 %a , i64 %a , i64 66 )
348
+ ret i64 %val
349
+ }
0 commit comments