Commit 138f196

[NVPTX] Fix 64 bits rotations with large shift values
ROTL and ROTR can take a shift amount larger than the element size, in which case the effective shift amount should be the shift amount modulo the element size. This patch adds the modulo step when the shift amount isn't known at compile time. Without it, the existing implementation would end up shifting beyond the type size and give incorrect results.
1 parent 6f02120 commit 138f196
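
For reference, a minimal C sketch of the masked software-rotate sequence that the patched ROTL64reg_sw emits (my own illustration, not part of the commit; the helper name is made up, and the explicit guard is needed because C, unlike PTX, does not define shifts by 64 or more):

#include <stdint.h>

/* Rotate-left of a 64-bit value by a runtime amount, mirroring the patched
 * PTX sequence: mask the amount down to 0..63 first, then shift both ways. */
static uint64_t rotl64_sw(uint64_t x, uint32_t amt) {
    amt &= 63;                                 /* and.b32 %amt2, $amt, 63   */
    uint64_t lhs = x << amt;                   /* shl.b64 %lhs, $src, %amt2 */
    /* PTX clamps oversized shift amounts, so shr.b64 by 64 simply gives 0;
     * portable C needs the amt == 0 guard instead.                         */
    uint64_t rhs = amt ? x >> (64 - amt) : 0;  /* sub.u32 + shr.b64         */
    return lhs + rhs;                          /* add.u64 $dst, %lhs, %rhs  */
}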

File tree: 2 files changed (+49, -4 lines)
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 6 additions & 4 deletions

@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
     ".reg .b64 %lhs;\n\t"
     ".reg .b64 %rhs;\n\t"
     ".reg .u32 %amt2;\n\t"
-    "shl.b64 \t%lhs, $src, $amt;\n\t"
-    "sub.u32 \t%amt2, 64, $amt;\n\t"
+    "and.b32 \t%amt2, $amt, 63;\n\t"
+    "shl.b64 \t%lhs, $src, %amt2;\n\t"
+    "sub.u32 \t%amt2, 64, %amt2;\n\t"
     "shr.b64 \t%rhs, $src, %amt2;\n\t"
     "add.u64 \t$dst, %lhs, %rhs;\n\t"
     "}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
     ".reg .b64 %lhs;\n\t"
     ".reg .b64 %rhs;\n\t"
     ".reg .u32 %amt2;\n\t"
-    "shr.b64 \t%lhs, $src, $amt;\n\t"
-    "sub.u32 \t%amt2, 64, $amt;\n\t"
+    "and.b32 \t%amt2, $amt, 63;\n\t"
+    "shr.b64 \t%lhs, $src, %amt2;\n\t"
+    "sub.u32 \t%amt2, 64, %amt2;\n\t"
     "shl.b64 \t%rhs, $src, %amt2;\n\t"
     "add.u64 \t$dst, %lhs, %rhs;\n\t"
     "}}",

llvm/test/CodeGen/NVPTX/rotate.ll

Lines changed: 43 additions & 0 deletions

@@ -58,3 +58,46 @@ define i32 @rotl0(i32 %x) {
   %t2 = or i32 %t0, %t1
   ret i32 %t2
 }
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shl.b64
+; SM35: sub.u32
+; SM35: shr.b64
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 2;
+; SM35: shr.b64 {{.*}}, 62;
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shr.b64
+; SM35: sub.u32
+; SM35: shl.b64
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 62;
+; SM35: shr.b64 {{.*}}, 2;
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
