Skip to content

Commit 7396ab1

Browse files
authored
[NVPTX] Fix 64-bit rotations with large shift values (#89399)
ROTL and ROTR can take a shift amount larger than the element size, in which case the effective shift amount should be the shift amount modulo the element size. This patch adds the modulo step when the shift amount isn't known at compile time. Without it, the existing implementation would end up shifting beyond the type size and giving incorrect results.
1 parent cf2f32c commit 7396ab1

File tree

2 files changed

+320
-29
lines changed

2 files changed

+320
-29
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
17521752
".reg .b64 %lhs;\n\t"
17531753
".reg .b64 %rhs;\n\t"
17541754
".reg .u32 %amt2;\n\t"
1755-
"shl.b64 \t%lhs, $src, $amt;\n\t"
1756-
"sub.u32 \t%amt2, 64, $amt;\n\t"
1755+
"and.b32 \t%amt2, $amt, 63;\n\t"
1756+
"shl.b64 \t%lhs, $src, %amt2;\n\t"
1757+
"sub.u32 \t%amt2, 64, %amt2;\n\t"
17571758
"shr.b64 \t%rhs, $src, %amt2;\n\t"
17581759
"add.u64 \t$dst, %lhs, %rhs;\n\t"
17591760
"}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
17651766
".reg .b64 %lhs;\n\t"
17661767
".reg .b64 %rhs;\n\t"
17671768
".reg .u32 %amt2;\n\t"
1768-
"shr.b64 \t%lhs, $src, $amt;\n\t"
1769-
"sub.u32 \t%amt2, 64, $amt;\n\t"
1769+
"and.b32 \t%amt2, $amt, 63;\n\t"
1770+
"shr.b64 \t%lhs, $src, %amt2;\n\t"
1771+
"sub.u32 \t%amt2, 64, %amt2;\n\t"
17701772
"shl.b64 \t%rhs, $src, %amt2;\n\t"
17711773
"add.u64 \t$dst, %lhs, %rhs;\n\t"
17721774
"}}",

llvm/test/CodeGen/NVPTX/rotate.ll

Lines changed: 314 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
2-
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
3-
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
4-
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
3+
; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
4+
; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
5+
; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
56

67

78
declare i32 @llvm.nvvm.rotate.b32(i32, i32)
@@ -11,50 +12,338 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
1112
; SM20: rotate32
1213
; SM35: rotate32
1314
define i32 @rotate32(i32 %a, i32 %b) {
14-
; SM20: shl.b32
15-
; SM20: sub.s32
16-
; SM20: shr.b32
17-
; SM20: add.u32
18-
; SM35: shf.l.wrap.b32
15+
; SM20-LABEL: rotate32(
16+
; SM20: {
17+
; SM20-NEXT: .reg .b32 %r<4>;
18+
; SM20-EMPTY:
19+
; SM20-NEXT: // %bb.0:
20+
; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
21+
; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
22+
; SM20-NEXT: {
23+
; SM20-NEXT: .reg .b32 %lhs;
24+
; SM20-NEXT: .reg .b32 %rhs;
25+
; SM20-NEXT: .reg .b32 %amt2;
26+
; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
27+
; SM20-NEXT: sub.s32 %amt2, 32, %r2;
28+
; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
29+
; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
30+
; SM20-NEXT: }
31+
; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
32+
; SM20-NEXT: ret;
33+
;
34+
; SM35-LABEL: rotate32(
35+
; SM35: {
36+
; SM35-NEXT: .reg .b32 %r<4>;
37+
; SM35-EMPTY:
38+
; SM35-NEXT: // %bb.0:
39+
; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0];
40+
; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1];
41+
; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
42+
; SM35-NEXT: st.param.b32 [func_retval0+0], %r3;
43+
; SM35-NEXT: ret;
1944
%val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
2045
ret i32 %val
2146
}
2247

2348
; SM20: rotate64
2449
; SM35: rotate64
2550
define i64 @rotate64(i64 %a, i32 %b) {
26-
; SM20: shl.b64
27-
; SM20: sub.u32
28-
; SM20: shr.b64
29-
; SM20: add.u64
30-
; SM35: shf.l.wrap.b32
31-
; SM35: shf.l.wrap.b32
51+
; SM20-LABEL: rotate64(
52+
; SM20: {
53+
; SM20-NEXT: .reg .b32 %r<2>;
54+
; SM20-NEXT: .reg .b64 %rd<3>;
55+
; SM20-EMPTY:
56+
; SM20-NEXT: // %bb.0:
57+
; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
58+
; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
59+
; SM20-NEXT: {
60+
; SM20-NEXT: .reg .b64 %lhs;
61+
; SM20-NEXT: .reg .b64 %rhs;
62+
; SM20-NEXT: .reg .u32 %amt2;
63+
; SM20-NEXT: and.b32 %amt2, %r1, 63;
64+
; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
65+
; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
66+
; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
67+
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
68+
; SM20-NEXT: }
69+
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
70+
; SM20-NEXT: ret;
71+
;
72+
; SM35-LABEL: rotate64(
73+
; SM35: {
74+
; SM35-NEXT: .reg .b32 %r<6>;
75+
; SM35-NEXT: .reg .b64 %rd<3>;
76+
; SM35-EMPTY:
77+
; SM35-NEXT: // %bb.0:
78+
; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
79+
; SM35-NEXT: {
80+
; SM35-NEXT: .reg .b32 %dummy;
81+
; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
82+
; SM35-NEXT: }
83+
; SM35-NEXT: {
84+
; SM35-NEXT: .reg .b32 %dummy;
85+
; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
86+
; SM35-NEXT: }
87+
; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1];
88+
; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
89+
; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3;
90+
; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
91+
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
92+
; SM35-NEXT: ret;
3293
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
3394
ret i64 %val
3495
}
3596

3697
; SM20: rotateright64
3798
; SM35: rotateright64
3899
define i64 @rotateright64(i64 %a, i32 %b) {
39-
; SM20: shr.b64
40-
; SM20: sub.u32
41-
; SM20: shl.b64
42-
; SM20: add.u64
43-
; SM35: shf.r.wrap.b32
44-
; SM35: shf.r.wrap.b32
100+
; SM20-LABEL: rotateright64(
101+
; SM20: {
102+
; SM20-NEXT: .reg .b32 %r<2>;
103+
; SM20-NEXT: .reg .b64 %rd<3>;
104+
; SM20-EMPTY:
105+
; SM20-NEXT: // %bb.0:
106+
; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
107+
; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
108+
; SM20-NEXT: {
109+
; SM20-NEXT: .reg .b64 %lhs;
110+
; SM20-NEXT: .reg .b64 %rhs;
111+
; SM20-NEXT: .reg .u32 %amt2;
112+
; SM20-NEXT: and.b32 %amt2, %r1, 63;
113+
; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
114+
; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
115+
; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
116+
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
117+
; SM20-NEXT: }
118+
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
119+
; SM20-NEXT: ret;
120+
;
121+
; SM35-LABEL: rotateright64(
122+
; SM35: {
123+
; SM35-NEXT: .reg .b32 %r<6>;
124+
; SM35-NEXT: .reg .b64 %rd<3>;
125+
; SM35-EMPTY:
126+
; SM35-NEXT: // %bb.0:
127+
; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
128+
; SM35-NEXT: {
129+
; SM35-NEXT: .reg .b32 %dummy;
130+
; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1;
131+
; SM35-NEXT: }
132+
; SM35-NEXT: {
133+
; SM35-NEXT: .reg .b32 %dummy;
134+
; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1;
135+
; SM35-NEXT: }
136+
; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1];
137+
; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
138+
; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3;
139+
; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
140+
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
141+
; SM35-NEXT: ret;
45142
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
46143
ret i64 %val
47144
}
48145

49146
; SM20: rotl0
50147
; SM35: rotl0
51148
define i32 @rotl0(i32 %x) {
52-
; SM20: shl.b32
53-
; SM20: shr.b32
54-
; SM20: add.u32
55-
; SM35: shf.l.wrap.b32
149+
; SM20-LABEL: rotl0(
150+
; SM20: {
151+
; SM20-NEXT: .reg .b32 %r<3>;
152+
; SM20-EMPTY:
153+
; SM20-NEXT: // %bb.0:
154+
; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0];
155+
; SM20-NEXT: {
156+
; SM20-NEXT: .reg .b32 %lhs;
157+
; SM20-NEXT: .reg .b32 %rhs;
158+
; SM20-NEXT: shl.b32 %lhs, %r1, 8;
159+
; SM20-NEXT: shr.b32 %rhs, %r1, 24;
160+
; SM20-NEXT: add.u32 %r2, %lhs, %rhs;
161+
; SM20-NEXT: }
162+
; SM20-NEXT: st.param.b32 [func_retval0+0], %r2;
163+
; SM20-NEXT: ret;
164+
;
165+
; SM35-LABEL: rotl0(
166+
; SM35: {
167+
; SM35-NEXT: .reg .b32 %r<3>;
168+
; SM35-EMPTY:
169+
; SM35-NEXT: // %bb.0:
170+
; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0];
171+
; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8;
172+
; SM35-NEXT: st.param.b32 [func_retval0+0], %r2;
173+
; SM35-NEXT: ret;
56174
%t0 = shl i32 %x, 8
57175
%t1 = lshr i32 %x, 24
58176
%t2 = or i32 %t0, %t1
59177
ret i32 %t2
60178
}
179+
180+
declare i64 @llvm.fshl.i64(i64, i64, i64)
181+
declare i64 @llvm.fshr.i64(i64, i64, i64)
182+
183+
; SM35: rotl64
184+
define i64 @rotl64(i64 %a, i64 %n) {
185+
; SM20-LABEL: rotl64(
186+
; SM20: {
187+
; SM20-NEXT: .reg .b32 %r<2>;
188+
; SM20-NEXT: .reg .b64 %rd<3>;
189+
; SM20-EMPTY:
190+
; SM20-NEXT: // %bb.0:
191+
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
192+
; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1];
193+
; SM20-NEXT: {
194+
; SM20-NEXT: .reg .b64 %lhs;
195+
; SM20-NEXT: .reg .b64 %rhs;
196+
; SM20-NEXT: .reg .u32 %amt2;
197+
; SM20-NEXT: and.b32 %amt2, %r1, 63;
198+
; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
199+
; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
200+
; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
201+
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
202+
; SM20-NEXT: }
203+
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
204+
; SM20-NEXT: ret;
205+
;
206+
; SM35-LABEL: rotl64(
207+
; SM35: {
208+
; SM35-NEXT: .reg .b32 %r<2>;
209+
; SM35-NEXT: .reg .b64 %rd<3>;
210+
; SM35-EMPTY:
211+
; SM35-NEXT: // %bb.0:
212+
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
213+
; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1];
214+
; SM35-NEXT: {
215+
; SM35-NEXT: .reg .b64 %lhs;
216+
; SM35-NEXT: .reg .b64 %rhs;
217+
; SM35-NEXT: .reg .u32 %amt2;
218+
; SM35-NEXT: and.b32 %amt2, %r1, 63;
219+
; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2;
220+
; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
221+
; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2;
222+
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
223+
; SM35-NEXT: }
224+
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
225+
; SM35-NEXT: ret;
226+
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
227+
ret i64 %val
228+
}
229+
230+
; SM35: rotl64_imm
231+
define i64 @rotl64_imm(i64 %a) {
232+
; SM20-LABEL: rotl64_imm(
233+
; SM20: {
234+
; SM20-NEXT: .reg .b64 %rd<3>;
235+
; SM20-EMPTY:
236+
; SM20-NEXT: // %bb.0:
237+
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
238+
; SM20-NEXT: {
239+
; SM20-NEXT: .reg .b64 %lhs;
240+
; SM20-NEXT: .reg .b64 %rhs;
241+
; SM20-NEXT: shl.b64 %lhs, %rd1, 2;
242+
; SM20-NEXT: shr.b64 %rhs, %rd1, 62;
243+
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
244+
; SM20-NEXT: }
245+
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
246+
; SM20-NEXT: ret;
247+
;
248+
; SM35-LABEL: rotl64_imm(
249+
; SM35: {
250+
; SM35-NEXT: .reg .b64 %rd<3>;
251+
; SM35-EMPTY:
252+
; SM35-NEXT: // %bb.0:
253+
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
254+
; SM35-NEXT: {
255+
; SM35-NEXT: .reg .b64 %lhs;
256+
; SM35-NEXT: .reg .b64 %rhs;
257+
; SM35-NEXT: shl.b64 %lhs, %rd1, 2;
258+
; SM35-NEXT: shr.b64 %rhs, %rd1, 62;
259+
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
260+
; SM35-NEXT: }
261+
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
262+
; SM35-NEXT: ret;
263+
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
264+
ret i64 %val
265+
}
266+
267+
; SM35: rotr64
268+
define i64 @rotr64(i64 %a, i64 %n) {
269+
; SM20-LABEL: rotr64(
270+
; SM20: {
271+
; SM20-NEXT: .reg .b32 %r<2>;
272+
; SM20-NEXT: .reg .b64 %rd<3>;
273+
; SM20-EMPTY:
274+
; SM20-NEXT: // %bb.0:
275+
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
276+
; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1];
277+
; SM20-NEXT: {
278+
; SM20-NEXT: .reg .b64 %lhs;
279+
; SM20-NEXT: .reg .b64 %rhs;
280+
; SM20-NEXT: .reg .u32 %amt2;
281+
; SM20-NEXT: and.b32 %amt2, %r1, 63;
282+
; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
283+
; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
284+
; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
285+
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
286+
; SM20-NEXT: }
287+
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
288+
; SM20-NEXT: ret;
289+
;
290+
; SM35-LABEL: rotr64(
291+
; SM35: {
292+
; SM35-NEXT: .reg .b32 %r<2>;
293+
; SM35-NEXT: .reg .b64 %rd<3>;
294+
; SM35-EMPTY:
295+
; SM35-NEXT: // %bb.0:
296+
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
297+
; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1];
298+
; SM35-NEXT: {
299+
; SM35-NEXT: .reg .b64 %lhs;
300+
; SM35-NEXT: .reg .b64 %rhs;
301+
; SM35-NEXT: .reg .u32 %amt2;
302+
; SM35-NEXT: and.b32 %amt2, %r1, 63;
303+
; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2;
304+
; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
305+
; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2;
306+
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
307+
; SM35-NEXT: }
308+
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
309+
; SM35-NEXT: ret;
310+
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
311+
ret i64 %val
312+
}
313+
314+
; SM35: rotr64_imm
315+
define i64 @rotr64_imm(i64 %a) {
316+
; SM20-LABEL: rotr64_imm(
317+
; SM20: {
318+
; SM20-NEXT: .reg .b64 %rd<3>;
319+
; SM20-EMPTY:
320+
; SM20-NEXT: // %bb.0:
321+
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
322+
; SM20-NEXT: {
323+
; SM20-NEXT: .reg .b64 %lhs;
324+
; SM20-NEXT: .reg .b64 %rhs;
325+
; SM20-NEXT: shl.b64 %lhs, %rd1, 62;
326+
; SM20-NEXT: shr.b64 %rhs, %rd1, 2;
327+
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
328+
; SM20-NEXT: }
329+
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
330+
; SM20-NEXT: ret;
331+
;
332+
; SM35-LABEL: rotr64_imm(
333+
; SM35: {
334+
; SM35-NEXT: .reg .b64 %rd<3>;
335+
; SM35-EMPTY:
336+
; SM35-NEXT: // %bb.0:
337+
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
338+
; SM35-NEXT: {
339+
; SM35-NEXT: .reg .b64 %lhs;
340+
; SM35-NEXT: .reg .b64 %rhs;
341+
; SM35-NEXT: shl.b64 %lhs, %rd1, 62;
342+
; SM35-NEXT: shr.b64 %rhs, %rd1, 2;
343+
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
344+
; SM35-NEXT: }
345+
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
346+
; SM35-NEXT: ret;
347+
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
348+
ret i64 %val
349+
}

0 commit comments

Comments
 (0)