[NVPTX] Aggressively try to replace image handles with references #119730

AlexMaclean · 2024-12-12T18:01:51Z

Even in cases where handles are supported, references are still preferable for performance. This is because, a ref uses one
less register and can avoid the handle creating code associated with taking the address of a tex/surf/sampler.

llvmbot · 2024-12-12T18:02:27Z

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Even in cases where handles are supported, references are still preferable for performance. This is because, a ref uses one
less register and can avoid the handle creating code associated with taking the address of a tex/surf/sampler.

Patch is 37.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119730.diff

11 Files Affected:

(modified) llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp (+15-12)
(modified) llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h (+15-7)
(modified) llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp (+2-2)
(modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp (+1-11)
(modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (-2)
(modified) llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp (+1-5)
(modified) llvm/test/CodeGen/NVPTX/surf-read-cuda.ll (+61-19)
(modified) llvm/test/CodeGen/NVPTX/surf-tex.py (+6-18)
(modified) llvm/test/CodeGen/NVPTX/surf-write-cuda.ll (+45-12)
(modified) llvm/test/CodeGen/NVPTX/tex-read-cuda.ll (+110-30)
(modified) llvm/test/CodeGen/NVPTX/texsurf-queries.ll (+161-36)

diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 7cac4d787778f2..cb756246b8d116 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -209,7 +209,7 @@ void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
   TargetMachine &TM = const_cast<TargetMachine &>(MF->getTarget());
   NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine &>(TM);
   const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
-  const char *Sym = MFI->getImageHandleSymbol(Index);
+  StringRef Sym = MFI->getImageHandleSymbol(Index);
   StringRef SymName = nvTM.getStrPool().save(Sym);
   MCOp = GetSymbolRef(OutContext.getOrCreateSymbol(SymName));
 }
@@ -224,16 +224,13 @@ void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
     return;
   }
 
-  const NVPTXSubtarget &STI = MI->getMF()->getSubtarget<NVPTXSubtarget>();
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
 
     MCOperand MCOp;
-    if (!STI.hasImageHandles()) {
-      if (lowerImageHandleOperand(MI, i, MCOp)) {
-        OutMI.addOperand(MCOp);
-        continue;
-      }
+    if (lowerImageHandleOperand(MI, i, MCOp)) {
+      OutMI.addOperand(MCOp);
+      continue;
     }
 
     if (lowerOperand(MO, MCOp))
@@ -1509,13 +1506,14 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
   const AttributeList &PAL = F->getAttributes();
   const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
   const auto *TLI = cast<NVPTXTargetLowering>(STI.getTargetLowering());
+  const NVPTXMachineFunctionInfo *MFI =
+      MF ? MF->getInfo<NVPTXMachineFunctionInfo>() : nullptr;
 
   Function::const_arg_iterator I, E;
   unsigned paramIndex = 0;
   bool first = true;
   bool isKernelFunc = isKernelFunction(*F);
   bool isABI = (STI.getSmVersion() >= 20);
-  bool hasImageHandles = STI.hasImageHandles();
 
   if (F->arg_empty() && !F->isVarArg()) {
     O << "()";
@@ -1533,25 +1531,30 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
     first = false;
 
     // Handle image/sampler parameters
-    if (isKernelFunction(*F)) {
+    if (isKernelFunc) {
       if (isSampler(*I) || isImage(*I)) {
+        std::string ParamSym;
+        raw_string_ostream ParamStr(ParamSym);
+        ParamStr << F->getName() << "_param_" << paramIndex;
+        ParamStr.flush();
+        bool EmitImagePtr = !MFI || !MFI->checkImageHandleSymbol(ParamSym);
         if (isImage(*I)) {
           if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
-            if (hasImageHandles)
+            if (EmitImagePtr)
               O << "\t.param .u64 .ptr .surfref ";
             else
               O << "\t.param .surfref ";
             O << TLI->getParamName(F, paramIndex);
           }
           else { // Default image is read_only
-            if (hasImageHandles)
+            if (EmitImagePtr)
               O << "\t.param .u64 .ptr .texref ";
             else
               O << "\t.param .texref ";
             O << TLI->getParamName(F, paramIndex);
           }
         } else {
-          if (hasImageHandles)
+          if (EmitImagePtr)
             O << "\t.param .u64 .ptr .samplerref ";
           else
             O << "\t.param .samplerref ";
diff --git a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
index 77426f7f6da71e..6670cb296f2160 100644
--- a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -14,13 +14,14 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
 class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
 private:
-  /// Stores a mapping from index to symbol name for removing image handles
-  /// on Fermi.
+  /// Stores a mapping from index to symbol name for image handles that are
+  /// replaced with image references
   SmallVector<std::string, 8> ImageHandleList;
 
 public:
@@ -36,20 +37,27 @@ class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
   /// Returns the index for the symbol \p Symbol. If the symbol was previously,
   /// added, the same index is returned. Otherwise, the symbol is added and the
   /// new index is returned.
-  unsigned getImageHandleSymbolIndex(const char *Symbol) {
+  unsigned getImageHandleSymbolIndex(StringRef Symbol) {
     // Is the symbol already present?
     for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i)
-      if (ImageHandleList[i] == std::string(Symbol))
+      if (ImageHandleList[i] == Symbol)
         return i;
     // Nope, insert it
-    ImageHandleList.push_back(Symbol);
+    ImageHandleList.push_back(Symbol.str());
     return ImageHandleList.size()-1;
   }
 
   /// Returns the symbol name at the given index.
-  const char *getImageHandleSymbol(unsigned Idx) const {
+  StringRef getImageHandleSymbol(unsigned Idx) const {
     assert(ImageHandleList.size() > Idx && "Bad index");
-    return ImageHandleList[Idx].c_str();
+    return ImageHandleList[Idx];
+  }
+
+  /// Check if the symbol has a mapping. Having a mapping means the handle is
+  /// replaced with a reference
+  bool checkImageHandleSymbol(StringRef Symbol) const {
+    return ImageHandleList.end() !=
+           std::find(ImageHandleList.begin(), ImageHandleList.end(), Symbol);
   }
 };
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index f66504b09cb63f..a3e3978cbbfe29 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -1830,7 +1830,7 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
     NewSymStr << MF.getName() << "_param_" << Param;
 
     InstrsToRemove.insert(&TexHandleDef);
-    Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str());
+    Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str());
     return true;
   }
   case NVPTX::texsurf_handles: {
@@ -1839,7 +1839,7 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
     const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
     assert(GV->hasName() && "Global sampler must be named!");
     InstrsToRemove.insert(&TexHandleDef);
-    Idx = MFI->getImageHandleSymbolIndex(GV->getName().data());
+    Idx = MFI->getImageHandleSymbolIndex(GV->getName());
     return true;
   }
   case NVPTX::nvvm_move_i64:
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 0e6b75e622c6ad..abd7070ef0153b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -55,19 +55,9 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS,
                                const NVPTXTargetMachine &TM)
     : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
-      FullSmVersion(200), SmVersion(getSmVersion()), TM(TM),
+      FullSmVersion(200), SmVersion(getSmVersion()),
       TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {}
 
-bool NVPTXSubtarget::hasImageHandles() const {
-  // Enable handles for Kepler+, where CUDA supports indirect surfaces and
-  // textures
-  if (TM.getDrvInterface() == NVPTX::CUDA)
-    return (SmVersion >= 30);
-
-  // Disabled, otherwise
-  return false;
-}
-
 bool NVPTXSubtarget::allowFP16Math() const {
   return hasFP16Math() && NoF16Math == false;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e785bbf830da62..b90ebde8a0ba1d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -43,7 +43,6 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // FullSmVersion.
   unsigned int SmVersion;
 
-  const NVPTXTargetMachine &TM;
   NVPTXInstrInfo InstrInfo;
   NVPTXTargetLowering TLInfo;
   SelectionDAGTargetInfo TSInfo;
@@ -81,7 +80,6 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
   bool hasLDG() const { return SmVersion >= 32; }
   bool hasHWROT32() const { return SmVersion >= 32; }
-  bool hasImageHandles() const;
   bool hasFP16Math() const { return SmVersion >= 53; }
   bool hasBF16Math() const { return SmVersion >= 80; }
   bool allowFP16Math() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index a5c5e9420ee737..b3b2880588cc59 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -404,14 +404,10 @@ void NVPTXPassConfig::addIRPasses() {
 }
 
 bool NVPTXPassConfig::addInstSelector() {
-  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
-
   addPass(createLowerAggrCopies());
   addPass(createAllocaHoisting());
   addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
-
-  if (!ST.hasImageHandles())
-    addPass(createNVPTXReplaceImageHandlesPass());
+  addPass(createNVPTXReplaceImageHandlesPass());
 
   return false;
 }
diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index a4ab7892469163..30c3c0fc17c0a0 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -9,38 +10,79 @@ declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-; SM20-LABEL: .entry foo
-; SM30-LABEL: .entry foo
 define void @foo(i64 %img, ptr %red, i32 %idx) {
-; SM20: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
-; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
-; SM30: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
-; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
+; SM20-LABEL: foo(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<3>;
+; SM20-NEXT:    .reg .f32 %f<2>;
+; SM20-NEXT:    .reg .b64 %rd<4>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [foo_param_0];
+; SM20-NEXT:    ld.param.u64 %rd2, [foo_param_1];
+; SM20-NEXT:    cvta.to.global.u64 %rd3, %rd2;
+; SM20-NEXT:    ld.param.u32 %r1, [foo_param_2];
+; SM20-NEXT:    suld.b.1d.b32.trap {%r2}, [%rd1, {%r1}];
+; SM20-NEXT:    cvt.rn.f32.s32 %f1, %r2;
+; SM20-NEXT:    st.global.f32 [%rd3], %f1;
+; SM20-NEXT:    ret;
+;
+; SM30-LABEL: foo(
+; SM30:       {
+; SM30-NEXT:    .reg .b32 %r<3>;
+; SM30-NEXT:    .reg .f32 %f<2>;
+; SM30-NEXT:    .reg .b64 %rd<4>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [foo_param_0];
+; SM30-NEXT:    ld.param.u64 %rd2, [foo_param_1];
+; SM30-NEXT:    cvta.to.global.u64 %rd3, %rd2;
+; SM30-NEXT:    ld.param.u32 %r1, [foo_param_2];
+; SM30-NEXT:    suld.b.1d.b32.trap {%r2}, [%rd1, {%r1}];
+; SM30-NEXT:    cvt.rn.f32.s32 %f1, %r2;
+; SM30-NEXT:    st.global.f32 [%rd3], %f1;
+; SM30-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
-; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
-; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; SM20: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
-; SM30: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
   store float %ret, ptr %red
   ret void
 }
 
 @surf0 = internal addrspace(1) global i64 0, align 8
 
-; SM20-LABEL: .entry bar
-; SM30-LABEL: .entry bar
 define void @bar(ptr %red, i32 %idx) {
-; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
+; SM20-LABEL: bar(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<3>;
+; SM20-NEXT:    .reg .f32 %f<2>;
+; SM20-NEXT:    .reg .b64 %rd<4>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [bar_param_0];
+; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; SM20-NEXT:    ld.param.u32 %r1, [bar_param_1];
+; SM20-NEXT:    suld.b.1d.b32.trap {%r2}, [surf0, {%r1}];
+; SM20-NEXT:    cvt.rn.f32.s32 %f1, %r2;
+; SM20-NEXT:    st.global.f32 [%rd2], %f1;
+; SM20-NEXT:    ret;
+;
+; SM30-LABEL: bar(
+; SM30:       {
+; SM30-NEXT:    .reg .b32 %r<3>;
+; SM30-NEXT:    .reg .f32 %f<2>;
+; SM30-NEXT:    .reg .b64 %rd<4>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [bar_param_0];
+; SM30-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; SM30-NEXT:    ld.param.u32 %r1, [bar_param_1];
+; SM30-NEXT:    suld.b.1d.b32.trap {%r2}, [surf0, {%r1}];
+; SM30-NEXT:    cvt.rn.f32.s32 %f1, %r2;
+; SM30-NEXT:    st.global.f32 [%rd2], %f1;
+; SM30-NEXT:    ret;
   %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
-; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [surf0, {%r{{[0-9]+}}}]
-; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}]
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx)
-; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
-; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; SM20: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
-; SM30: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
   store float %ret, ptr %red
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
index 7d86696087438b..9607a58856bac8 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -1,12 +1,12 @@
 # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll
-# RUN: llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll --check-prefixes=CHECK,CHECK-CUDA
+# RUN: llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll
 # RUN: %if ptxas %{ llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %}
 
 # We only need to run this second time for texture tests, because
 # there is a difference between unified and non-unified intrinsics.
 #
 # RUN: %python %s --target=nvcl --tests=suld,sust,tex,tld4 --gen-list-append --gen-list=%t.list > %t-nvcl.ll
-# RUN: llc %t-nvcl.ll -verify-machineinstrs -o - | FileCheck %t-nvcl.ll --check-prefixes=CHECK,CHECK-NVCL
+# RUN: llc %t-nvcl.ll -verify-machineinstrs -o - | FileCheck %t-nvcl.ll
 # RUN: %if ptxas %{ llc %t-nvcl.ll -verify-machineinstrs -o - | %ptxas-verify %}
 
 # Verify that all instructions and intrinsics defined in TableGen
@@ -269,9 +269,7 @@ def gen_suld_tests(target, global_surf):
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
-  ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_surf}
-  ; CHECK-CUDA: ${instruction} ${reg_ret}, [[[REG${reg_id}]], ${reg_access}]
-  ; CHECK-NVCL: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}]
+  ; CHECK: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}]
   define void @${test_name}_global(${retty}* %ret, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     %val = tail call ${retty} @${intrinsic}(i64 %gs, ${access})
@@ -314,7 +312,6 @@ def gen_suld_tests(target, global_surf):
             "reg_ret": get_ptx_vec_reg(vec, dtype),
             "reg_surf": get_ptx_surface(target),
             "reg_access": get_ptx_surface_access(geom),
-            "reg_id": get_table_gen_id(),
         }
         gen_test(template, params)
         generated_items.append((params["intrinsic"], params["instruction"]))
@@ -364,9 +361,7 @@ def gen_sust_tests(target, global_surf):
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
-  ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_surf}
-  ; CHECK-CUDA: ${instruction} [[[REG${reg_id}]], ${reg_access}], ${reg_value}
-  ; CHECK-NVCL: ${instruction} [${global_surf}, ${reg_access}], ${reg_value}
+  ; CHECK: ${instruction} [${global_surf}, ${reg_access}], ${reg_value}
   define void @${test_name}_global(${value}, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     tail call void @${intrinsic}(i64 %gs, ${access}, ${value})
@@ -420,7 +415,6 @@ def gen_sust_tests(target, global_surf):
             "reg_value": get_ptx_vec_reg(vec, ctype),
             "reg_surf": get_ptx_surface(target),
             "reg_access": get_ptx_surface_access(geom),
-            "reg_id": get_table_gen_id(),
         }
         gen_test(template, params)
         generated_items.append((params["intrinsic"], params["instruction"]))
@@ -627,9 +621,7 @@ def gen_tex_tests(target, global_tex, global_sampler):
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
-  ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_tex}
-  ; CHECK-CUDA: ${instruction} ${ptx_ret}, [[[REG${reg_id}]], ${ptx_global_sampler} ${ptx_access}]
-  ; CHECK-NVCL: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
+  ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
   define void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
@@ -713,7 +705,6 @@ def gen_tex_tests(target, global_tex, global_sampler):
             "ptx_tex": get_ptx_texture(target),
             "ptx_access": get_ptx_texture_access(geom, ctype),
             "ptx_global_sampler": get_ptx_global_sampler(target, global_sampler),
-            "reg_id": get_table_gen_id(),
         }
         gen_test(template, params)
         generated_items.append((params["intrinsic"], params["instruction"]))
@@ -814,9 +805,7 @@ def gen_tld4_tests(target, global_tex, global_sampler):
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
-  ; CHECK-CUDA: mov.u64 [[REG${reg_id}:%.*]], ${global_tex}
-  ; CHECK-CUDA: ${instruction} ${ptx_ret}, [[[REG${reg_id}]], ${ptx_global_sampler} ${ptx_access}]
-  ; CHECK-NVCL: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
+  ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
   define void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
@@ -862,7 +851,6 @@ def gen_tld4_tests(target, global_tex, global_sampler):
             "ptx_tex": get_ptx_texture(target),
             "ptx_access": get_ptx_tld4_access(geom),
             "ptx_global_sampler": get_ptx_global_sampler(target, global_sampler),
-            "reg_id": get_table_gen_id(),
         }
         gen_test(template, params)
         generated_items.append((params["intrinsic"], params["instruction"]))
diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index 9d840ce24e7af8..d6f3956d68db45 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -9,13 +10,30 @@ declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-; SM20-LABEL: .entry foo
-; SM30-LABEL: .entry foo
 define void @foo(i64 %img, i32 %val, i32 %idx) {
-; SM20: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
-; SM20: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}],...
[truncated]

justinfargnoli

Can you highlight what test changes are a result of your change?

llvm/test/CodeGen/NVPTX/tex-read-cuda.ll

llvm/test/CodeGen/NVPTX/texsurf-queries.ll

justinfargnoli · 2024-12-12T21:46:03Z

Can you highlight what test changes are a result of your change?

This link does that.

justinfargnoli · 2024-12-12T21:56:33Z

avoid the handle creating code associated with taking the address of a tex/surf/sampler.

Is there a test change that illustrates this?

justinfargnoli

Overall LGTM :)

AlexMaclean · 2024-12-12T22:36:29Z

avoid the handle creating code associated with taking the address of a tex/surf/sampler.

Is there a test change that illustrates this?

The existing tests already demonstrate this. Previously there was an additional mov instruction which took the address of the tex/surf and was used to create the handle. This has been removed.

Artem-B · 2024-12-13T19:32:49Z

llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp

-bool NVPTXSubtarget::hasImageHandles() const {
-  // Enable handles for Kepler+, where CUDA supports indirect surfaces and
-  // textures
-  if (TM.getDrvInterface() == NVPTX::CUDA)


Technically we didn't get rid of sm_20 yet, though it's practical usability is likely close to zero these days.
This may be worth mentioning in release notes as one of the first steps on the way to get rid of pre-kepler stuff.

This isn't actually breaking sm_20 (any more than it already is broken). We're just trying to avoid using image handles in all cases after this change (we used to only do this for sm_20), since references are preferable to handles on newer architectures too, even if handles are supported.

On a side note, I agree we should deprecate and remove support for older architectures. For reference, I think the minimum supported by nvcc is sm_50.

I think the minimum supported by nvcc is sm_50.

The bar here is what hardware is still out there in quantities sufficient to keep support for them around. AWS still seems to list P2 instances with K80 https://aws.amazon.com/ec2/instance-types/:

I think it's the current practical low bar on the support.

Sounds good, thanks for the explanation of how we're choosing the minimum supported version.

metaflow · 2024-12-18T10:43:30Z

that did brake llvm/test/CodeGen/NVPTX/nvcl-param-align.ll

//
// Generated by LLVM NVPTX Back-End
//

.version 3.2
.target sm_20, texmode_independent
.address_size 64

        // .globl       foo                     // -- Begin function foo
                                        // @foo
.entry foo(
        .param .u64 .ptr .texref foo_param_0,
        .param .u64 .ptr .samplerref foo_param_1,
        .param .u64 .ptr .align 32 foo_param_2,
        .param .u64 .ptr .align 1 foo_param_3
)
{


// %bb.0:
        ret;
                                        // -- End function
}

ptxas-verify:
line 12; error : Feature 'kernel parameter pointers to opaque types' requires PTX ISA .version 4.3 or later
line 13; error : Feature 'kernel parameter pointers to opaque types' requires PTX ISA .version 4.3 or later

updating to sm_60 in 41c1992 seems to fix the issue. Not sure if that is intended or if test needs a further update.

AlexMaclean self-assigned this Dec 12, 2024

llvmbot added the backend:NVPTX label Dec 12, 2024

AlexMaclean requested review from Artem-B and justinfargnoli December 12, 2024 19:18

justinfargnoli reviewed Dec 12, 2024

View reviewed changes

llvm/test/CodeGen/NVPTX/tex-read-cuda.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/NVPTX/texsurf-queries.ll Outdated Show resolved Hide resolved

justinfargnoli approved these changes Dec 12, 2024

View reviewed changes

Artem-B approved these changes Dec 13, 2024

View reviewed changes

AlexMaclean added 3 commits December 16, 2024 18:06

pre-commit update tests

e013cf5

[NVPTX] Always try to replace image handles with references

8a2d3eb

address comments - unify FileChecks

da39f3a

AlexMaclean force-pushed the dev/amaclean/upstream-always-replace-handles2 branch from 2cd7a6d to da39f3a Compare December 16, 2024 18:09

AlexMaclean merged commit f9c8c01 into llvm:main Dec 16, 2024
8 checks passed

jsji mentioned this pull request Jan 2, 2025

[CUDA] bindless_images tests failed after f9c8c01d38f8fb intel/llvm#16503

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[NVPTX] Aggressively try to replace image handles with references #119730

[NVPTX] Aggressively try to replace image handles with references #119730

Uh oh!

AlexMaclean commented Dec 12, 2024

Uh oh!

llvmbot commented Dec 12, 2024

Uh oh!

justinfargnoli left a comment

Uh oh!

Uh oh!

Uh oh!

justinfargnoli commented Dec 12, 2024

Uh oh!

justinfargnoli commented Dec 12, 2024

Uh oh!

justinfargnoli left a comment

Uh oh!

AlexMaclean commented Dec 12, 2024

Uh oh!

Artem-B Dec 13, 2024

Uh oh!

AlexMaclean Dec 13, 2024

Uh oh!

Artem-B Dec 13, 2024 •

edited

Loading

Uh oh!

AlexMaclean Dec 13, 2024

Uh oh!

Uh oh!

metaflow commented Dec 18, 2024

Uh oh!

Uh oh!

[NVPTX] Aggressively try to replace image handles with references #119730

[NVPTX] Aggressively try to replace image handles with references #119730

Uh oh!

Conversation

AlexMaclean commented Dec 12, 2024

Uh oh!

llvmbot commented Dec 12, 2024

Uh oh!

justinfargnoli left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

justinfargnoli commented Dec 12, 2024

Uh oh!

justinfargnoli commented Dec 12, 2024

Uh oh!

justinfargnoli left a comment

Choose a reason for hiding this comment

Uh oh!

AlexMaclean commented Dec 12, 2024

Uh oh!

Artem-B Dec 13, 2024

Choose a reason for hiding this comment

Uh oh!

AlexMaclean Dec 13, 2024

Choose a reason for hiding this comment

Uh oh!

Artem-B Dec 13, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

AlexMaclean Dec 13, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

metaflow commented Dec 18, 2024

Uh oh!

Uh oh!

Artem-B Dec 13, 2024 •

edited

Loading