Skip to content

[NVPTX] Auto-Upgrade llvm.nvvm.swap.lo.hi.b64 to llvm.fshl #132098

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

Conversation

AlexMaclean
Copy link
Member

After 3c8c291, the lowering of 64-bit funnel shifts has improved to the point where this intrinsic is no longer needed. This change removes llvm.nvvm.swap.lo.hi.b64 and auto-upgrades existing uses to llvm.fshl(x, x, 32).

@llvmbot
Copy link
Member

llvmbot commented Mar 19, 2025

@llvm/pr-subscribers-llvm-ir

Author: Alex MacLean (AlexMaclean)

Changes

After 3c8c291, the lowering of 64-bit funnel shifts has improved to the point where this intrinsic is no longer needed. This change removes llvm.nvvm.swap.lo.hi.b64 and auto-upgrades existing uses to llvm.fshl(x, x, 32).


Full diff: https://github.com/llvm/llvm-project/pull/132098.diff

4 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+1-6)
  • (modified) llvm/lib/IR/AutoUpgrade.cpp (+7-1)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (-5)
  • (modified) llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll (+3)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 665db3025903e..80e10f33b770d 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -45,6 +45,7 @@
 //   * llvm.nvvm.ldg.global.i        --> load addrspace(1) !load.invariant
 //   * llvm.nvvm.ldg.global.f        --> ibid.
 //   * llvm.nvvm.ldg.global.p        --> ibid.
+//   * llvm.nvvm.swap.lo.hi.b64      --> llvm.fshl(x, x, 32)
 
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;  // (shared)ptr
@@ -4635,12 +4636,6 @@ def int_nvvm_sust_p_3d_v4i32_trap
               "llvm.nvvm.sust.p.3d.v4i32.trap">,
     ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
 
-def int_nvvm_swap_lo_hi_b64
-  : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty],
-              [IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">,
-              ClangBuiltin<"__nvvm_swap_lo_hi_b64">;
-
-
 // Accessing special registers.
 
 class PTXReadSRegIntrinsicNB_r32
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 7a194219c5cd4..9be307bb071ed 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1292,7 +1292,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       if (Name.consume_front("abs."))
         // nvvm.abs.{i,ii}
         Expand = Name == "i" || Name == "ll";
-      else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f")
+      else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
+               Name == "swap.lo.hi.b64")
         Expand = true;
       else if (Name.consume_front("max.") || Name.consume_front("min."))
         // nvvm.{min,max}.{i,ii,ui,ull}
@@ -2370,6 +2371,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
     Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
     Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr,
                                   {Arg, Arg, ZExtShiftAmt});
+  } else if (Name == "swap.lo.hi.b64") {
+    Type *Int64Ty = Builder.getInt64Ty();
+    Value *Arg = CI->getOperand(0);
+    Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshl,
+                                  {Arg, Arg, Builder.getInt64(32)});
   } else if ((Name.consume_front("ptr.gen.to.") &&
               (Name.starts_with("local") || Name.starts_with("shared") ||
                Name.starts_with("global") || Name.starts_with("constant"))) ||
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 90f56a421b19b..b2e05a567b4fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2947,11 +2947,6 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
 def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
 def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
 
-
-def : Pat<(int_nvvm_swap_lo_hi_b64 i64:$src),
-          (V2I32toI64 (I64toI32H $src),
-                      (I64toI32L $src))> ;
-
 //-----------------------------------
 // Texture Intrinsics
 //-----------------------------------
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 5cc3a30277459..588e79a7428a4 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -34,6 +34,7 @@ declare double @llvm.nvvm.bitcast.ll2d(i64)
 declare i32 @llvm.nvvm.rotate.b32(i32, i32)
 declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
 declare i64 @llvm.nvvm.rotate.b64(i64, i32)
+declare i64 @llvm.nvvm.swap.lo.hi.b64(i64)
 
 declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr)
 declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr)
@@ -166,10 +167,12 @@ define void @rotate(i32 %a, i64 %b) {
 ; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 6)
 ; CHECK: call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 7)
 ; CHECK: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 8)
+; CHECK: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 32)
 ;
   %r1 = call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 6)
   %r2 = call i64 @llvm.nvvm.rotate.right.b64(i64 %b, i32 7)
   %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8)
+  %r4 = call i64 @llvm.nvvm.swap.lo.hi.b64(i64 %b)
   ret void
 }
 

@llvmbot
Copy link
Member

llvmbot commented Mar 19, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

After 3c8c291, the lowering of 64-bit funnel shifts has improved to the point where this intrinsic is no longer needed. This change removes llvm.nvvm.swap.lo.hi.b64 and auto-upgrades existing uses to llvm.fshl(x, x, 32).


Full diff: https://github.com/llvm/llvm-project/pull/132098.diff

4 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+1-6)
  • (modified) llvm/lib/IR/AutoUpgrade.cpp (+7-1)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (-5)
  • (modified) llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll (+3)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 665db3025903e..80e10f33b770d 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -45,6 +45,7 @@
 //   * llvm.nvvm.ldg.global.i        --> load addrspace(1) !load.invariant
 //   * llvm.nvvm.ldg.global.f        --> ibid.
 //   * llvm.nvvm.ldg.global.p        --> ibid.
+//   * llvm.nvvm.swap.lo.hi.b64      --> llvm.fshl(x, x, 32)
 
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;  // (shared)ptr
@@ -4635,12 +4636,6 @@ def int_nvvm_sust_p_3d_v4i32_trap
               "llvm.nvvm.sust.p.3d.v4i32.trap">,
     ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
 
-def int_nvvm_swap_lo_hi_b64
-  : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty],
-              [IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">,
-              ClangBuiltin<"__nvvm_swap_lo_hi_b64">;
-
-
 // Accessing special registers.
 
 class PTXReadSRegIntrinsicNB_r32
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 7a194219c5cd4..9be307bb071ed 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1292,7 +1292,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       if (Name.consume_front("abs."))
         // nvvm.abs.{i,ii}
         Expand = Name == "i" || Name == "ll";
-      else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f")
+      else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
+               Name == "swap.lo.hi.b64")
         Expand = true;
       else if (Name.consume_front("max.") || Name.consume_front("min."))
         // nvvm.{min,max}.{i,ii,ui,ull}
@@ -2370,6 +2371,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
     Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
     Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr,
                                   {Arg, Arg, ZExtShiftAmt});
+  } else if (Name == "swap.lo.hi.b64") {
+    Type *Int64Ty = Builder.getInt64Ty();
+    Value *Arg = CI->getOperand(0);
+    Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshl,
+                                  {Arg, Arg, Builder.getInt64(32)});
   } else if ((Name.consume_front("ptr.gen.to.") &&
               (Name.starts_with("local") || Name.starts_with("shared") ||
                Name.starts_with("global") || Name.starts_with("constant"))) ||
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 90f56a421b19b..b2e05a567b4fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2947,11 +2947,6 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
 def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
 def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
 
-
-def : Pat<(int_nvvm_swap_lo_hi_b64 i64:$src),
-          (V2I32toI64 (I64toI32H $src),
-                      (I64toI32L $src))> ;
-
 //-----------------------------------
 // Texture Intrinsics
 //-----------------------------------
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 5cc3a30277459..588e79a7428a4 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -34,6 +34,7 @@ declare double @llvm.nvvm.bitcast.ll2d(i64)
 declare i32 @llvm.nvvm.rotate.b32(i32, i32)
 declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
 declare i64 @llvm.nvvm.rotate.b64(i64, i32)
+declare i64 @llvm.nvvm.swap.lo.hi.b64(i64)
 
 declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr)
 declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr)
@@ -166,10 +167,12 @@ define void @rotate(i32 %a, i64 %b) {
 ; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 6)
 ; CHECK: call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 7)
 ; CHECK: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 8)
+; CHECK: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 32)
 ;
   %r1 = call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 6)
   %r2 = call i64 @llvm.nvvm.rotate.right.b64(i64 %b, i32 7)
   %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8)
+  %r4 = call i64 @llvm.nvvm.swap.lo.hi.b64(i64 %b)
   ret void
 }
 

@AlexMaclean AlexMaclean merged commit 30ff508 into llvm:main Mar 21, 2025
14 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Mar 21, 2025

LLVM Buildbot has detected a new failure on builder llvm-clang-aarch64-darwin running on doug-worker-4 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/16806

Here is the relevant piece of the build log for reference:
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
RUN: at line 1: /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli -jit-kind=orc-lazy -compile-threads=2 -thread-entry hello /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll | /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli -jit-kind=orc-lazy -compile-threads=2 -thread-entry hello /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
 #0 0x0000000103694984 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100ec0984)
 #1 0x0000000103692a08 llvm::sys::RunSignalHandlers() (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100ebea08)
 #2 0x0000000103695040 SignalHandler(int, __siginfo*, void*) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100ec1040)
 #3 0x0000000189db6584 (/usr/lib/system/libsystem_platform.dylib+0x18047a584)
 #4 0x0000000189d8521c (/usr/lib/system/libsystem_pthread.dylib+0x18044921c)
 #5 0x0000000189cabad0 (/usr/lib/libc++.1.dylib+0x18036fad0)
 #6 0x000000010323df80 void llvm::detail::UniqueFunctionBase<void, llvm::Expected<llvm::DenseMap<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef, llvm::DenseMapInfo<llvm::orc::SymbolStringPtr, void>, llvm::detail::DenseMapPair<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef>>>>::CallImpl<llvm::orc::Platform::lookupInitSymbols(llvm::orc::ExecutionSession&, llvm::DenseMap<llvm::orc::JITDylib*, llvm::orc::SymbolLookupSet, llvm::DenseMapInfo<llvm::orc::JITDylib*, void>, llvm::detail::DenseMapPair<llvm::orc::JITDylib*, llvm::orc::SymbolLookupSet>> const&)::$_45>(void*, llvm::Expected<llvm::DenseMap<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef, llvm::DenseMapInfo<llvm::orc::SymbolStringPtr, void>, llvm::detail::DenseMapPair<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef>>>&) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100a69f80)
 #7 0x0000000103239d70 llvm::orc::AsynchronousSymbolQuery::handleComplete(llvm::orc::ExecutionSession&)::RunQueryCompleteTask::run() (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100a65d70)
 #8 0x00000001032f4e04 void* std::__1::__thread_proxy[abi:un170006]<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, llvm::orc::DynamicThreadPoolTaskDispatcher::dispatch(std::__1::unique_ptr<llvm::orc::Task, std::__1::default_delete<llvm::orc::Task>>)::$_0>>(void*) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100b20e04)
 #9 0x0000000189d85f94 (/usr/lib/system/libsystem_pthread.dylib+0x180449f94)
#10 0x0000000189d80d34 (/usr/lib/system/libsystem_pthread.dylib+0x180444d34)
FileCheck error: '<stdin>' is empty.
FileCheck command line:  /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll

--

********************


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants