Skip to content

Commit 340cc17

Browse files
authored
[LLVM][NVPTX]: Add intrinsic for setmaxnreg (#77289)
This patch adds intrinsics (inc and dec variants) for the setmaxnreg PTX instruction.
* PTX doc link for this instruction: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg
* The i32 argument, an immediate value, specifies the absolute register count for the instruction.
* The `setmaxnreg` instruction is available on SM90a, so this patch adds a 'hasSM90a' predicate for use in the NVPTX backend.
* lit tests are added to verify the lowering of the intrinsics.
* Verifier logic (and tests) are added to check that the register count is within range and is a multiple of 8.
Signed-off-by: Durgadoss R <[email protected]>
1 parent c7c68f1 commit 340cc17

File tree

7 files changed

+69
-0
lines changed

7 files changed

+69
-0
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4710,4 +4710,14 @@ def int_nvvm_is_explicit_cluster
47104710
[IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>],
47114711
"llvm.nvvm.is_explicit_cluster">;
47124712

4713+
// Setmaxnreg inc/dec intrinsics
4714+
def int_nvvm_setmaxnreg_inc_sync_aligned_u32
4715+
: DefaultAttrsIntrinsic<[], [llvm_i32_ty],
4716+
[IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>],
4717+
"llvm.nvvm.setmaxnreg.inc.sync.aligned.u32">;
4718+
def int_nvvm_setmaxnreg_dec_sync_aligned_u32
4719+
: DefaultAttrsIntrinsic<[], [llvm_i32_ty],
4720+
[IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>],
4721+
"llvm.nvvm.setmaxnreg.dec.sync.aligned.u32">;
4722+
47134723
} // let TargetPrefix = "nvvm"

llvm/lib/IR/Verifier.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
#include "llvm/IR/IntrinsicsAArch64.h"
9797
#include "llvm/IR/IntrinsicsAMDGPU.h"
9898
#include "llvm/IR/IntrinsicsARM.h"
99+
#include "llvm/IR/IntrinsicsNVPTX.h"
99100
#include "llvm/IR/IntrinsicsWebAssembly.h"
100101
#include "llvm/IR/LLVMContext.h"
101102
#include "llvm/IR/Metadata.h"
@@ -6031,6 +6032,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
60316032
"Value for inactive lanes must be a VGPR function argument", &Call);
60326033
break;
60336034
}
6035+
case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
6036+
case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
6037+
Value *V = Call.getArgOperand(0);
6038+
unsigned RegCount = cast<ConstantInt>(V)->getZExtValue();
6039+
Check(RegCount % 8 == 0,
6040+
"reg_count argument to nvvm.setmaxnreg must be in multiples of 8");
6041+
Check((RegCount >= 24 && RegCount <= 256),
6042+
"reg_count argument to nvvm.setmaxnreg must be within [24, 256]");
6043+
break;
6044+
}
60346045
case Intrinsic::experimental_convergence_entry:
60356046
LLVM_FALLTHROUGH;
60366047
case Intrinsic::experimental_convergence_anchor:

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,9 @@ def True : Predicate<"true">;
164164
class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
165165
class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
166166

167+
// Explicit records for arch-accelerated SM versions
168+
def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
169+
167170
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
168171
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
169172
"&& Subtarget->getPTXVersion() >= 64)">;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6727,3 +6727,16 @@ def is_explicit_cluster: NVPTXInst<(outs Int1Regs:$d), (ins),
67276727
"mov.pred\t$d, %is_explicit_cluster;",
67286728
[(set Int1Regs:$d, (int_nvvm_is_explicit_cluster))]>,
67296729
Requires<[hasSM<90>, hasPTX<78>]>;
6730+
6731+
// setmaxnreg inc/dec intrinsics
6732+
let isConvergent = true in {
6733+
multiclass SET_MAXNREG<string Action, Intrinsic Intr> {
6734+
def : NVPTXInst<(outs), (ins i32imm:$reg_count),
6735+
"setmaxnreg." # Action # ".sync.aligned.u32 $reg_count;",
6736+
[(Intr timm:$reg_count)]>,
6737+
Requires<[hasSM90a, hasPTX<80>]>;
6738+
}
6739+
6740+
defm INT_SET_MAXNREG_INC : SET_MAXNREG<"inc", int_nvvm_setmaxnreg_inc_sync_aligned_u32>;
6741+
defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_aligned_u32>;
6742+
} // isConvergent

llvm/test/CodeGen/NVPTX/setmaxnreg.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
2+
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80| %ptxas-verify -arch=sm_90a %}
3+
4+
declare void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 %reg_count)
5+
declare void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 %reg_count)
6+
7+
; CHECK-LABEL: test_set_maxn_reg
8+
define void @test_set_maxn_reg() {
9+
; CHECK: setmaxnreg.inc.sync.aligned.u32 96;
10+
call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 96)
11+
12+
; CHECK: setmaxnreg.dec.sync.aligned.u32 64;
13+
call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 64)
14+
15+
ret void
16+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
if not "NVPTX" in config.root.targets:
2+
config.unsupported = True
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
2+
3+
declare void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 %reg_count)
4+
declare void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 %reg_count)
5+
6+
define void @test_set_maxn_reg() {
7+
; CHECK: reg_count argument to nvvm.setmaxnreg must be in multiples of 8
8+
call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 95)
9+
10+
; CHECK: reg_count argument to nvvm.setmaxnreg must be within [24, 256]
11+
call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 16)
12+
13+
ret void
14+
}

0 commit comments

Comments
 (0)