Commit 138f196

[NVPTX] Fix 64 bits rotations with large shift values
ROTL and ROTR can take a shift amount larger than the element size, in which case the effective shift amount should be the shift amount modulo the element size. This patch adds the modulo step when the shift amount isn't known at compile time. Without it, the existing implementation would end up shifting beyond the type size and give incorrect results.
1 parent 6f02120 commit 138f196
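
For reference, a minimal C sketch of the masked software-rotate sequence that the patched ROTL64reg_sw emits (my own illustration, not part of the commit; the helper name is made up, and the explicit guard is needed because C, unlike PTX, does not define shifts by 64 or more):

#include <stdint.h>

/* Rotate-left of a 64-bit value by a runtime amount, mirroring the patched
 * PTX sequence: mask the amount down to 0..63 first, then shift both ways. */
static uint64_t rotl64_sw(uint64_t x, uint32_t amt) {
    amt &= 63;                                 /* and.b32 %amt2, $amt, 63   */
    uint64_t lhs = x << amt;                   /* shl.b64 %lhs, $src, %amt2 */
    /* PTX clamps oversized shift amounts, so shr.b64 by 64 simply gives 0;
     * portable C needs the amt == 0 guard instead.                         */
    uint64_t rhs = amt ? x >> (64 - amt) : 0;  /* sub.u32 + shr.b64         */
    return lhs + rhs;                          /* add.u64 $dst, %lhs, %rhs  */
}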

File tree: 2 files changed (+49, -4 lines)
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 6 additions & 4 deletions

@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
     ".reg .b64 %lhs;\n\t"
     ".reg .b64 %rhs;\n\t"
     ".reg .u32 %amt2;\n\t"
-    "shl.b64 \t%lhs, $src, $amt;\n\t"
-    "sub.u32 \t%amt2, 64, $amt;\n\t"
+    "and.b32 \t%amt2, $amt, 63;\n\t"
+    "shl.b64 \t%lhs, $src, %amt2;\n\t"
+    "sub.u32 \t%amt2, 64, %amt2;\n\t"
     "shr.b64 \t%rhs, $src, %amt2;\n\t"
     "add.u64 \t$dst, %lhs, %rhs;\n\t"
     "}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
     ".reg .b64 %lhs;\n\t"
     ".reg .b64 %rhs;\n\t"
     ".reg .u32 %amt2;\n\t"
-    "shr.b64 \t%lhs, $src, $amt;\n\t"
-    "sub.u32 \t%amt2, 64, $amt;\n\t"
+    "and.b32 \t%amt2, $amt, 63;\n\t"
+    "shr.b64 \t%lhs, $src, %amt2;\n\t"
+    "sub.u32 \t%amt2, 64, %amt2;\n\t"
     "shl.b64 \t%rhs, $src, %amt2;\n\t"
     "add.u64 \t$dst, %lhs, %rhs;\n\t"
     "}}",

llvm/test/CodeGen/NVPTX/rotate.ll

Lines changed: 43 additions & 0 deletions

@@ -58,3 +58,46 @@ define i32 @rotl0(i32 %x) {
   %t2 = or i32 %t0, %t1
   ret i32 %t2
 }
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shl.b64
+; SM35: sub.u32
+; SM35: shr.b64
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 2;
+; SM35: shr.b64 {{.*}}, 62;
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shr.b64
+; SM35: sub.u32
+; SM35: shl.b64
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 62;
+; SM35: shr.b64 {{.*}}, 2;
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
