
[NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling #107655


Merged

Conversation

AlexMaclean
Member

This change deprecates the following intrinsics, which can be trivially converted to LLVM funnel-shift intrinsics (see the sketch after this list):

  • @llvm.nvvm.rotate.b32
  • @llvm.nvvm.rotate.right.b64
  • @llvm.nvvm.rotate.b64
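
For reference, the upgrade is mechanical. Below is a minimal LLVM IR sketch distilled from the AutoUpgrade.cpp changes in this patch (function names here are illustrative, not part of the patch):

  declare i32 @llvm.fshl.i32(i32, i32, i32)
  declare i64 @llvm.fshl.i64(i64, i64, i64)
  declare i64 @llvm.fshr.i64(i64, i64, i64)

  ; was: @llvm.nvvm.rotate.b32(i32 %x, i32 %n) -- rotate-left is a
  ; funnel shift of a value with itself
  define i32 @rot32(i32 %x, i32 %n) {
    %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %n)
    ret i32 %r
  }

  ; was: @llvm.nvvm.rotate.b64(i64 %x, i32 %n) -- the i32 amount is
  ; zero-extended to match the value type
  define i64 @rotl64(i64 %x, i32 %n) {
    %n64 = zext i32 %n to i64
    %r = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %n64)
    ret i64 %r
  }

  ; was: @llvm.nvvm.rotate.right.b64(i64 %x, i32 %n)
  define i64 @rotr64(i64 %x, i32 %n) {
    %n64 = zext i32 %n to i64
    %r = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %n64)
    ret i64 %r
  }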

@llvmbot
Member

llvmbot commented Sep 6, 2024

@llvm/pr-subscribers-backend-nvptx

@llvm/pr-subscribers-llvm-ir

Author: Alex MacLean (AlexMaclean)

Changes

This change deprecates the following intrinsics, which can be trivially converted to LLVM funnel-shift intrinsics:

  • @llvm.nvvm.rotate.b32
  • @llvm.nvvm.rotate.right.b64
  • @llvm.nvvm.rotate.b64

Patch is 39.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/107655.diff

6 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (-16)
  • (modified) llvm/lib/IR/AutoUpgrade.cpp (+103-75)
  • (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+218-161)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+2-127)
  • (modified) llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll (+16)
  • (modified) llvm/test/CodeGen/NVPTX/rotate.ll (+36-60)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 39685c920d948d..1b94c56ca9b828 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4489,22 +4489,6 @@ def int_nvvm_sust_p_3d_v4i32_trap
               "llvm.nvvm.sust.p.3d.v4i32.trap">,
     ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
 
-
-def int_nvvm_rotate_b32
-  : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-              [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b32">,
-              ClangBuiltin<"__nvvm_rotate_b32">;
-
-def int_nvvm_rotate_b64
-  : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
-             [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b64">,
-             ClangBuiltin<"__nvvm_rotate_b64">;
-
-def int_nvvm_rotate_right_b64
-  : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
-              [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.right.b64">,
-              ClangBuiltin<"__nvvm_rotate_right_b64">;
-
 def int_nvvm_swap_lo_hi_b64
   : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty],
               [IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 69dae5e32dbbe8..2b8e93dff46684 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1268,6 +1268,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       else if (Name.consume_front("atomic.load.add."))
         // nvvm.atomic.load.add.{f32.p,f64.p}
         Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p");
+      else if (Name.consume_front("rotate."))
+        // nvvm.rotate.{b32,b64,right.b64}
+        Expand = Name == "b32" || Name == "b64" || Name == "right.b64";
       else
         Expand = false;
 
@@ -2254,6 +2257,104 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) {
   }
 }
 
+static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
+                                       Function *F, IRBuilder<> &Builder) {
+  Value *Rep = nullptr;
+
+  if (Name == "abs.i" || Name == "abs.ll") {
+    Value *Arg = CI->getArgOperand(0);
+    Value *Neg = Builder.CreateNeg(Arg, "neg");
+    Value *Cmp = Builder.CreateICmpSGE(
+        Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
+    Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
+  } else if (Name.starts_with("atomic.load.add.f32.p") ||
+             Name.starts_with("atomic.load.add.f64.p")) {
+    Value *Ptr = CI->getArgOperand(0);
+    Value *Val = CI->getArgOperand(1);
+    Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
+                                  AtomicOrdering::SequentiallyConsistent);
+  } else if (Name.consume_front("max.") &&
+             (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
+              Name == "ui" || Name == "ull")) {
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+    Value *Cmp = Name.starts_with("u")
+                     ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
+                     : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
+    Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
+  } else if (Name.consume_front("min.") &&
+             (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
+              Name == "ui" || Name == "ull")) {
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+    Value *Cmp = Name.starts_with("u")
+                     ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
+                     : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
+    Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
+  } else if (Name == "clz.ll") {
+    // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
+    Value *Arg = CI->getArgOperand(0);
+    Value *Ctlz = Builder.CreateCall(
+        Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+                                  {Arg->getType()}),
+        {Arg, Builder.getFalse()}, "ctlz");
+    Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
+  } else if (Name == "popc.ll") {
+    // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
+    // i64.
+    Value *Arg = CI->getArgOperand(0);
+    Value *Popc = Builder.CreateCall(
+        Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
+                                  {Arg->getType()}),
+        Arg, "ctpop");
+    Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
+  } else if (Name == "h2f") {
+    Rep = Builder.CreateCall(
+        Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16,
+                                  {Builder.getFloatTy()}),
+        CI->getArgOperand(0), "h2f");
+  } else if (Name == "rotate.b32") {
+    Value *Arg = CI->getOperand(0);
+    Value *ShiftAmt = CI->getOperand(1);
+    Rep = Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::fshl,
+                                  {Arg, Arg, ShiftAmt});
+  } else if (Name == "rotate.b64") {
+    Type *Int64Ty = Builder.getInt64Ty();
+    Value *Arg = CI->getOperand(0);
+    Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
+    Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshl,
+                                  {Arg, Arg, ZExtShiftAmt});
+  } else if (Name == "rotate.right.b64") {
+    Type *Int64Ty = Builder.getInt64Ty();
+    Value *Arg = CI->getOperand(0);
+    Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
+    Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr,
+                                  {Arg, Arg, ZExtShiftAmt});
+  } else {
+    Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
+    if (IID != Intrinsic::not_intrinsic &&
+        !F->getReturnType()->getScalarType()->isBFloatTy()) {
+      rename(F);
+      Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+      SmallVector<Value *, 2> Args;
+      for (size_t I = 0; I < NewFn->arg_size(); ++I) {
+        Value *Arg = CI->getArgOperand(I);
+        Type *OldType = Arg->getType();
+        Type *NewType = NewFn->getArg(I)->getType();
+        Args.push_back(
+            (OldType->isIntegerTy() && NewType->getScalarType()->isBFloatTy())
+                ? Builder.CreateBitCast(Arg, NewType)
+                : Arg);
+      }
+      Rep = Builder.CreateCall(NewFn, Args);
+      if (F->getReturnType()->isIntegerTy())
+        Rep = Builder.CreateBitCast(Rep, F->getReturnType());
+    }
+  }
+
+  return Rep;
+}
+
 static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
                                       IRBuilder<> &Builder) {
   LLVMContext &C = F->getContext();
@@ -4204,81 +4305,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
 
     if (!IsX86 && Name == "stackprotectorcheck") {
       Rep = nullptr;
-    } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
-      Value *Arg = CI->getArgOperand(0);
-      Value *Neg = Builder.CreateNeg(Arg, "neg");
-      Value *Cmp = Builder.CreateICmpSGE(
-          Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
-      Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
-    } else if (IsNVVM && (Name.starts_with("atomic.load.add.f32.p") ||
-                          Name.starts_with("atomic.load.add.f64.p"))) {
-      Value *Ptr = CI->getArgOperand(0);
-      Value *Val = CI->getArgOperand(1);
-      Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
-                                    AtomicOrdering::SequentiallyConsistent);
-    } else if (IsNVVM && Name.consume_front("max.") &&
-               (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
-                Name == "ui" || Name == "ull")) {
-      Value *Arg0 = CI->getArgOperand(0);
-      Value *Arg1 = CI->getArgOperand(1);
-      Value *Cmp = Name.starts_with("u")
-                       ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
-                       : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
-      Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
-    } else if (IsNVVM && Name.consume_front("min.") &&
-               (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
-                Name == "ui" || Name == "ull")) {
-      Value *Arg0 = CI->getArgOperand(0);
-      Value *Arg1 = CI->getArgOperand(1);
-      Value *Cmp = Name.starts_with("u")
-                       ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
-                       : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
-      Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
-    } else if (IsNVVM && Name == "clz.ll") {
-      // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
-      Value *Arg = CI->getArgOperand(0);
-      Value *Ctlz = Builder.CreateCall(
-          Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
-                                    {Arg->getType()}),
-          {Arg, Builder.getFalse()}, "ctlz");
-      Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
-    } else if (IsNVVM && Name == "popc.ll") {
-      // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
-      // i64.
-      Value *Arg = CI->getArgOperand(0);
-      Value *Popc = Builder.CreateCall(
-          Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
-                                    {Arg->getType()}),
-          Arg, "ctpop");
-      Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
-    } else if (IsNVVM) {
-      if (Name == "h2f") {
-        Rep =
-            Builder.CreateCall(Intrinsic::getDeclaration(
-                                   F->getParent(), Intrinsic::convert_from_fp16,
-                                   {Builder.getFloatTy()}),
-                               CI->getArgOperand(0), "h2f");
-      } else {
-        Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
-        if (IID != Intrinsic::not_intrinsic &&
-            !F->getReturnType()->getScalarType()->isBFloatTy()) {
-          rename(F);
-          NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
-          SmallVector<Value *, 2> Args;
-          for (size_t I = 0; I < NewFn->arg_size(); ++I) {
-            Value *Arg = CI->getArgOperand(I);
-            Type *OldType = Arg->getType();
-            Type *NewType = NewFn->getArg(I)->getType();
-            Args.push_back((OldType->isIntegerTy() &&
-                            NewType->getScalarType()->isBFloatTy())
-                               ? Builder.CreateBitCast(Arg, NewType)
-                               : Arg);
-          }
-          Rep = Builder.CreateCall(NewFn, Args);
-          if (F->getReturnType()->isIntegerTy())
-            Rep = Builder.CreateBitCast(Rep, F->getReturnType());
-        }
-      }
+    } else if (IsNVVM) {
+      Rep = upgradeNVVMIntrinsicCall(Name, CI, F, Builder);
     } else if (IsX86) {
       Rep = upgradeX86IntrinsicCall(Name, CI, F, Builder);
     } else if (IsARM) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b7e210805db904..0d7e2d4a98d88b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1665,167 +1665,6 @@ def BREV64 :
              "brev.b64 \t$dst, $a;",
              [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
 
-//
-// Rotate: Use ptx shf instruction if available.
-//
-
-// 32 bit r2 = rotl r1, n
-//    =>
-//        r2 = shf.l r1, r1, n
-def ROTL32imm_hw :
-  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
-            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
-            [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
-           Requires<[hasHWROT32]>;
-
-def ROTL32reg_hw :
-  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
-            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
-            [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
-           Requires<[hasHWROT32]>;
-
-// 32 bit r2 = rotr r1, n
-//    =>
-//        r2 = shf.r r1, r1, n
-def ROTR32imm_hw :
-  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
-            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
-            [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
-           Requires<[hasHWROT32]>;
-
-def ROTR32reg_hw :
-  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
-            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
-            [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
-           Requires<[hasHWROT32]>;
-
-// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
-def ROT32imm_sw :
-  NVPTXInst<(outs Int32Regs:$dst),
-            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
-            "{{\n\t"
-            ".reg .b32 %lhs;\n\t"
-            ".reg .b32 %rhs;\n\t"
-            "shl.b32 \t%lhs, $src, $amt1;\n\t"
-            "shr.b32 \t%rhs, $src, $amt2;\n\t"
-            "add.u32 \t$dst, %lhs, %rhs;\n\t"
-            "}}",
-            []>;
-
-def SUB_FRM_32 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
-          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
-      Requires<[noHWROT32]>;
-def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
-          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
-      Requires<[noHWROT32]>;
-
-// 32-bit software rotate left by register.
-def ROTL32reg_sw :
-  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
-            "{{\n\t"
-            ".reg .b32 %lhs;\n\t"
-            ".reg .b32 %rhs;\n\t"
-            ".reg .b32 %amt2;\n\t"
-            "shl.b32 \t%lhs, $src, $amt;\n\t"
-            "sub.s32 \t%amt2, 32, $amt;\n\t"
-            "shr.b32 \t%rhs, $src, %amt2;\n\t"
-            "add.u32 \t$dst, %lhs, %rhs;\n\t"
-            "}}",
-            [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
-           Requires<[noHWROT32]>;
-
-// 32-bit software rotate right by register.
-def ROTR32reg_sw :
-  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
-            "{{\n\t"
-            ".reg .b32 %lhs;\n\t"
-            ".reg .b32 %rhs;\n\t"
-            ".reg .b32 %amt2;\n\t"
-            "shr.b32 \t%lhs, $src, $amt;\n\t"
-            "sub.s32 \t%amt2, 32, $amt;\n\t"
-            "shl.b32 \t%rhs, $src, %amt2;\n\t"
-            "add.u32 \t$dst, %lhs, %rhs;\n\t"
-            "}}",
-            [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
-           Requires<[noHWROT32]>;
-
-// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
-def ROT64imm_sw :
-  NVPTXInst<(outs Int64Regs:$dst),
-            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
-            "{{\n\t"
-            ".reg .b64 %lhs;\n\t"
-            ".reg .b64 %rhs;\n\t"
-            "shl.b64 \t%lhs, $src, $amt1;\n\t"
-            "shr.b64 \t%rhs, $src, $amt2;\n\t"
-            "add.u64 \t$dst, %lhs, %rhs;\n\t"
-            "}}",
-            []>;
-
-def SUB_FRM_64 : SDNodeXForm<imm, [{
-    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
-          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
-def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
-          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
-
-// 64-bit software rotate left by register.
-def ROTL64reg_sw :
-  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
-            "{{\n\t"
-            ".reg .b64 %lhs;\n\t"
-            ".reg .b64 %rhs;\n\t"
-            ".reg .u32 %amt2;\n\t"
-            "and.b32 \t%amt2, $amt, 63;\n\t"
-            "shl.b64 \t%lhs, $src, %amt2;\n\t"
-            "sub.u32 \t%amt2, 64, %amt2;\n\t"
-            "shr.b64 \t%rhs, $src, %amt2;\n\t"
-            "add.u64 \t$dst, %lhs, %rhs;\n\t"
-            "}}",
-            [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
-
-def ROTR64reg_sw :
-  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
-            "{{\n\t"
-            ".reg .b64 %lhs;\n\t"
-            ".reg .b64 %rhs;\n\t"
-            ".reg .u32 %amt2;\n\t"
-            "and.b32 \t%amt2, $amt, 63;\n\t"
-            "shr.b64 \t%lhs, $src, %amt2;\n\t"
-            "sub.u32 \t%amt2, 64, %amt2;\n\t"
-            "shl.b64 \t%rhs, $src, %amt2;\n\t"
-            "add.u64 \t$dst, %lhs, %rhs;\n\t"
-            "}}",
-            [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
-
-//
-// Funnnel shift in clamp mode
-//
-
-// Create SDNodes so they can be used in the DAG code, e.g.
-// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
-def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
-
-def FUNSHFLCLAMP :
-  NVPTXInst<(outs Int32Regs:$dst),
-            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
-            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
-            [(set Int32Regs:$dst,
-              (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
-
-def FUNSHFRCLAMP :
-  NVPTXInst<(outs Int32Regs:$dst),
-            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
-            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
-            [(set Int32Regs:$dst,
-             (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
 
 //
 // BFE - bit-field extract
@@ -3651,6 +3490,224 @@ def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
 def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
 
+//
+// Rotate: Use ptx shf instruction if available.
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+// Funnel shift, requires >= sm_32.  Does not trap if amt is out of range, so
+// no side effects.
+let hasSideEffects = false in {
+
+  def SHF_L_CLAMP_B32_REG :
+    NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+              "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+              [(set Int32Regs:$dst,
+                (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>,
+    Requires<[hasHWROT32]>;
+
+  def SHF_R_CLAMP_B32_REG :
+    NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+              "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+              [(set Int32Regs:$dst,
+              (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>,
+    Requires<[hasHWROT32]>;
+
+  def SHF_L_WRAP_B32_IMM
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+                "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+
+  def SHF_L_WRAP_B32_REG
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+
+  def SHF_R_WRAP_B32_IMM
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+                "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+
+  def SHF_R_WRAP_B32_REG
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+}
+
+// 32 bit r2 = rotl r1, n
+//    =>
+//        r2 = shf.l r1, r1, n
+def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
+          (SHF_L_WRAP_B32_IMM Int32Regs:$src, Int32Regs:$src, imm:$amt)>,
+   ...
[truncated]


github-actions bot commented Sep 6, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@AlexMaclean force-pushed the dev/amaclean/upstream-remove-rotate branch 2 times, most recently from c91d6ff to 88bc7ef on September 12, 2024 16:58
@AlexMaclean
Member Author

@Artem-B / @jlebar ping

@jlebar
Member

jlebar commented Sep 12, 2024 via email

@AlexMaclean
Member Author

@Artem-B ping

@Artem-B
Member

Typo in the patch title: handeling -> handling.

LGTM otherwise.

@AlexMaclean changed the title from [NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handeling to [NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling on Sep 20, 2024
@AlexMaclean force-pushed the dev/amaclean/upstream-remove-rotate branch from 88bc7ef to 9a0201c on September 21, 2024 19:22
@AlexMaclean
Member Author

@Artem-B, I took a deeper look at this and realized the way we were lowering 64-bit rotation isn't actually correct.

Here's an Alive2 proof in LLVM IR that shows the expansion is incorrect: https://alive2.llvm.org/ce/z/RQVDDG

I've moved to just using the SelectionDAG expansion for all rotate instructions; this will properly convert to a 32-bit funnel shift when appropriate and expand to a valid multi-instruction sequence otherwise.

Could you please take another look?

@Artem-B
Member

Artem-B commented Sep 23, 2024

the way we were lowering 64-bit rotation isn't actually correct.

Here's an Alive2 proof in LLVM IR that shows the expansion is incorrect: https://alive2.llvm.org/ce/z/RQVDDG

Are you saying that your patch produces these invalid results? Or that LLVM without your patch miscompiles?

I've moved to just using the SelectionDAG expansion for all rotate instructions; this will properly convert to a 32-bit funnel shift when appropriate and expand to a valid multi-instruction sequence otherwise.
Could you please take another look?

I do not see any further changes on the pull request since I stamped it last week. Is the issue fixed in this pull request? Or do you want me to help figure out where things still go wrong?

@AlexMaclean
Member Author

Are you saying that your patch produces these invalid results? Or that LLVM without your patch miscompiles?

Prior to this patch, LLVM would miscompile the @llvm.nvvm.rotate.right.b64 and @llvm.nvvm.rotate.b64 intrinsics. The earlier version of this patch expanded the miscompilation to also impact the rotl and rotr nodes produced from funnel-shift intrinsics. The latest version of the patch fixes all miscompilations.

I do not see any further changes on the pull request since I stamped it last week. Is the issue fixed in this pull request? Or do you want me to help figure out where things still go wrong?

The further changes are in 9a0201c. After this commit, the issues are fixed.

@AlexMaclean force-pushed the dev/amaclean/upstream-remove-rotate branch from 9a0201c to 979aa53 on September 23, 2024 19:22
@Artem-B
Member

Looks like i64 ops do not take advantage of funnel shifts.

Comment on lines +79 to +84
; SM35-NEXT: and.b32 %r2, %r1, 63;
; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
; SM35-NEXT: neg.s32 %r3, %r1;
; SM35-NEXT: and.b32 %r4, %r3, 63;
; SM35-NEXT: shr.u64 %rd3, %rd1, %r4;
; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
@Artem-B
Member

This looks like a regression. Previously we'd generate a couple of 32-bit funnel shifts.

@AlexMaclean
Member Author

It actually is not correct to lower these 64-bit rotates with 32-bit funnel shifts. This is the correctness issue I fixed.

https://alive2.llvm.org/ce/z/RQVDDG shows counter-examples for this lowering.

I think it would be safe to lower a 64-bit rotate with a constant rotation operand to two 32-bit funnel shifts, but I don't know if it is worth handling this edge case.
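
As a concrete illustration, here is a minimal reconstruction in LLVM IR (not necessarily the exact IR behind the Alive2 link) of why the pair-of-32-bit-funnel-shifts expansion is wrong for amounts of 32 or more:

  declare i64 @llvm.fshl.i64(i64, i64, i64)
  declare i32 @llvm.fshl.i32(i32, i32, i32)

  ; the correct semantics: a true 64-bit rotate-left
  define i64 @rot64_correct(i64 %x, i64 %n) {
    %r = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %n)
    ret i64 %r
  }

  ; the buggy expansion: split %x into hi:lo halves and funnel-shift each
  define i64 @rot64_via_two_fshl32(i64 %x, i32 %n) {
    %lo = trunc i64 %x to i32
    %hi64 = lshr i64 %x, 32
    %hi = trunc i64 %hi64 to i32
    %newhi = call i32 @llvm.fshl.i32(i32 %hi, i32 %lo, i32 %n)
    %newlo = call i32 @llvm.fshl.i32(i32 %lo, i32 %hi, i32 %n)
    %h = zext i32 %newhi to i64
    %l = zext i32 %newlo to i64
    %hs = shl i64 %h, 32
    %r = or i64 %hs, %l
    ret i64 %r
  }

  ; Counter-example: %x = 1, %n = 33.
  ;   rot64_correct        -> 0x0000000200000000
  ;   rot64_via_two_fshl32 -> 0x0000000000000002
  ; fshl.i32 reduces the amount mod 32, so the halves never swap; the
  ; expansion is only correct when (%n & 63) < 32.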

@Artem-B
Member

Ah, I see. I did not connect the IR->IR transformation in the Alive2 example with the shf.l.wrap we used to generate.

The original code is indeed wrong. A 32-bit funnel shift is not very useful for implementing 64-bit rotations, as it can only shift by up to 32 bits. It should be doable, but is probably not worth it. Given that the bug has been around forever and nobody stumbled onto it before you, it's probably not widely used.

If we were to custom-lower it, one way would be to swap the two i32 halves when we split the i64 (a partial rotation by 32), and then do the remaining rotation by (count - 32) with the funnel shift.
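
In LLVM IR terms, that suggestion might look like the following sketch (a hypothetical lowering, not code from this patch):

  declare i32 @llvm.fshl.i32(i32, i32, i32)

  define i64 @rotl64_via_fshl32(i64 %x, i32 %n) {
    %lo = trunc i64 %x to i32
    %hi64 = lshr i64 %x, 32
    %hi = trunc i64 %hi64 to i32
    ; Rotating by 32 just swaps the words; do that first when bit 5 of %n is set.
    %bit5 = and i32 %n, 32
    %swap = icmp ne i32 %bit5, 0
    %a = select i1 %swap, i32 %lo, i32 %hi   ; high word after the pre-rotation
    %b = select i1 %swap, i32 %hi, i32 %lo   ; low word after the pre-rotation
    %m = and i32 %n, 31                      ; remaining amount, always < 32
    %newhi = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %m)
    %newlo = call i32 @llvm.fshl.i32(i32 %b, i32 %a, i32 %m)
    %h = zext i32 %newhi to i64
    %l = zext i32 %newlo to i64
    %hs = shl i64 %h, 32
    %r = or i64 %hs, %l
    ret i64 %r
  }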

Comment on lines +433 to +442
; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0];
; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2];
; SM35-NEXT: and.b32 %r2, %r1, 63;
; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
; SM35-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1];
; SM35-NEXT: shr.u64 %rd4, %rd3, 1;
; SM35-NEXT: not.b32 %r3, %r1;
; SM35-NEXT: and.b32 %r4, %r3, 63;
; SM35-NEXT: shr.u64 %rd5, %rd4, %r4;
; SM35-NEXT: or.b64 %rd6, %rd2, %rd5;
@Artem-B
Member

This also does not use funnel shifts.

@AlexMaclean
Member Author

See above.


@AlexMaclean merged commit 9ac00b8 into llvm:main on Sep 23, 2024
9 checks passed
@steelannelida

We're seeing numerical errors in trig functions in XLA after this change.

@steelannelida

Reproducer: a Float32 Cos function.
For this IR
module_0006.CosF32s.3.ir-with-opt.ll.txt

We generate PTX without this commit
module_0006.CosF32s.3-reverted.ptx.txt

With this commit
module_0006.CosF32s.3.ptx.txt

For input

-1.9938988e-28, 1.9938988e-28, -1e20f, 1e20f, -2.3564024f, -3.14159f,
3.14159f, -0.0f, 0.0f, -1.570796f, 1.570796f, -0.78539f, 0.78539f,
-2.19993846e+10, -1.70141183e+38, -kInf, kInf, kQNaN

It returns

Expected literal:
f32[18] {1, 1, 0.754259288, 0.754259288, -0.707253754, -1, -1, 1, 1, 3.13916473e-07, 3.13916473e-07, 0.707112551, 0.707112551, -2.01264605e-09, 0.781914651, nan, nan, nan}

Actual literal:
f32[18] {1, 1, 0.631242573, 0.631242573, -0.707253754, -1, -1, 1, 1, 3.13916473e-07, 3.13916473e-07, 0.707112551, 0.707112551, -0.710541368, 0.565063953, nan(0x7fffff), nan(0x7fffff), nan(0x7fffff)}

@AlexMaclean
Member Author

We're seeing numerical errors in trig functions in XLA after this change.

Sorry about this, and thanks for the reproducer! I'm taking a look now.

@steelannelida

Is it ok if we revert upstream for the time being?

@steelannelida

steelannelida commented Sep 25, 2024

  • Unfortunately, we'll also have to revert 3675761 because of the conflicts in AutoUpgrade.cpp.

steelannelida pushed a commit that referenced this pull request Sep 25, 2024
AlexMaclean added a commit to AlexMaclean/llvm-project that referenced this pull request Sep 25, 2024
…ling (llvm#107655)

This change deprecates the following intrinsics which can be trivially
converted to llvm funnel-shift intrinsics:

- @llvm.nvvm.rotate.b32
- @llvm.nvvm.rotate.right.b64
- @llvm.nvvm.rotate.b64
AlexMaclean added a commit to AlexMaclean/llvm-project that referenced this pull request Sep 26, 2024
…ling (llvm#107655)

This change deprecates the following intrinsics which can be trivially
converted to llvm funnel-shift intrinsics:

- @llvm.nvvm.rotate.b32
- @llvm.nvvm.rotate.right.b64
- @llvm.nvvm.rotate.b64
AlexMaclean added a commit that referenced this pull request Sep 27, 2024
…ift handling" (#110025)

This change deprecates the following intrinsics which can be trivially
converted to llvm funnel-shift intrinsics:

- @llvm.nvvm.rotate.b32
- @llvm.nvvm.rotate.right.b64
- @llvm.nvvm.rotate.b64

This fixes a bug in the previous version (#107655) which flipped the
order of the operands to the PTX funnel shift instruction. In LLVM IR
the high bits are the first arg and the low bits are the second arg,
while in PTX this is reversed.
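
To make the operand-order pitfall concrete, here is a minimal sketch (the PTX mapping follows the PTX ISA's description of shf, and is inferred from the commit message above rather than taken from the patch):

  declare i32 @llvm.fshl.i32(i32, i32, i32)

  define i32 @funnel(i32 %hi, i32 %lo, i32 %n) {
    ; LLVM: the first operand supplies the high bits, the second the low bits.
    %r = call i32 @llvm.fshl.i32(i32 %hi, i32 %lo, i32 %n)
    ; Correct PTX:  shf.l.wrap.b32 %r, %lo, %hi, %n;
    ; PTX's shf takes the LOW word as its first source operand, so emitting
    ; the LLVM operand order verbatim funnels the wrong bits.
    ret i32 %r
  }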
Sterling-Augustine pushed a commit to Sterling-Augustine/llvm-project that referenced this pull request Sep 27, 2024
…ift handling" (llvm#110025)

This change deprecates the following intrinsics which can be trivially
converted to llvm funnel-shift intrinsics:

- @llvm.nvvm.rotate.b32
- @llvm.nvvm.rotate.right.b64
- @llvm.nvvm.rotate.b64

This fixes a bug in the previous version (llvm#107655) which flipped the
order of the operands to the PTX funnel shift instruction. In LLVM IR
the high bits are the first arg and the low bits are the second arg,
while in PTX this is reversed.
AlexMaclean added a commit that referenced this pull request Sep 28, 2024
Remove the following intrinsics which can be trivially replaced with an
`addrspacecast`

  * llvm.nvvm.ptr.gen.to.global
  * llvm.nvvm.ptr.gen.to.shared
  * llvm.nvvm.ptr.gen.to.constant
  * llvm.nvvm.ptr.gen.to.local
  * llvm.nvvm.ptr.global.to.gen
  * llvm.nvvm.ptr.shared.to.gen
  * llvm.nvvm.ptr.constant.to.gen
  * llvm.nvvm.ptr.local.to.gen

Also, cleanup the NVPTX lowering of `addrspacecast` making it more
concise.
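
For illustration, the replacement is mechanical. A minimal sketch, assuming opaque pointers (the removed intrinsics' exact manglings varied across releases):

  ; before (removed intrinsic, hypothetical mangling):
  ;   %g = call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr %p)
  ; after:
  define ptr addrspace(1) @to_global(ptr %p) {
    %g = addrspacecast ptr %p to ptr addrspace(1)
    ret ptr addrspace(1) %g
  }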

This was reverted to avoid conflicts while reverting #107655. Re-landing
unchanged.