Skip to content

[NVPTX] Add ranges to intrinsic definitions, cleanup NVVMIntrRange #138338

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 80 additions & 20 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
// MISC
//

defvar WARP_SIZE = 32;

// Note: the maximum grid size in the x-dimension is the lower value of 65535
// on sm_20. We conservatively use the larger value here as it is required for
// sm_30+ and is also correct for sm_20.
defvar MAX_GRID_SIZE_X = 0x7fffffff;
defvar MAX_GRID_SIZE_Y = 0xffff;
defvar MAX_GRID_SIZE_Z = 0xffff;

defvar MAX_BLOCK_SIZE_X = 1024;
defvar MAX_BLOCK_SIZE_Y = 1024;
defvar MAX_BLOCK_SIZE_Z = 64;

// Helper class that concatenates list elements with
// a given separator 'sep' and returns the result.
// Handles empty strings.
Expand Down Expand Up @@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap

// Accessing special registers.

class PTXReadSRegIntrinsicNB_r32
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
class PTXReadSRegIntrinsic_r32<string name>
: PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
: DefaultAttrsIntrinsic<[llvm_i32_ty], [],
!listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;

multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
class PTXReadSRegIntrinsic_r32<string name,
list<IntrinsicProperty> properties = []>
: PTXReadSRegIntrinsicNB_r32<properties>,
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;

multiclass PTXReadSRegIntrinsic_v4i32<string regname,
list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a TableGen assert that the properties list is of size 4?

Copy link
Contributor

@jurahul jurahul May 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The outer list in this case, not the inner one

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. I've added these asserts.

assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
// FIXME: Do we need the 128-bit integer type version?
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;

// FIXME: Enable this once v4i32 support is enabled in back-end.
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
foreach suffix = ["_x", "_y", "_z", "_w"] in
def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
defvar suffixes = ["_x", "_y", "_z", "_w"];
foreach i = !range(suffixes) in
def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
}

// Same, but without automatic clang builtins. It will be used for
// registers that require particular GPU or PTX version.
multiclass PTXReadSRegIntrinsicNB_v4i32 {
foreach suffix = ["_x", "_y", "_z", "_w"] in
def suffix : PTXReadSRegIntrinsicNB_r32;
multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
defvar suffixes = ["_x", "_y", "_z", "_w"];
foreach i = !range(suffixes) in
def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
}

class PTXReadSRegIntrinsic_r64<string name>
Expand All @@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
: Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;

defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
defm int_nvvm_read_ptx_sreg_tid
: PTXReadSRegIntrinsic_v4i32<"tid",
[[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
[Range<RetIndex, 0, 1>]]>;

defm int_nvvm_read_ptx_sreg_ntid
: PTXReadSRegIntrinsic_v4i32<"ntid",
[[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
[Range<RetIndex, 0, 1>]]>;

def int_nvvm_read_ptx_sreg_laneid
: PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;

def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;

defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
// Per-component return-value ranges, listed in [x, y, z, w] order, shared by
// the index-style special registers that use it (ctaid, clusterid,
// cluster_ctaid). Each inner list holds the intrinsic properties for one
// component; x/y/z are bounded by the corresponding MAX_GRID_SIZE_* value.
// NOTE(review): Range bounds appear to be half-open [lo, hi), which would pin
// the w component to 0 — confirm against the Range property definition.
defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
[Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
[Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
[Range<RetIndex, 0, 1>]];

// Per-component return-value ranges, in the same [x, y, z, w] order, shared by
// the count-style special registers that use it (nctaid, nclusterid,
// cluster_nctaid). The x/y/z ranges start at 1 and use MAX_GRID_SIZE_* + 1 as
// the upper bound; the w entry is the same as in MAX_GRID_ID_RANGE.
defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
[Range<RetIndex, 0, 1>]];

defm int_nvvm_read_ptx_sreg_ctaid
: PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;

defm int_nvvm_read_ptx_sreg_nctaid
: PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;

def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
Expand All @@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;

def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
def int_nvvm_read_ptx_sreg_warpsize
: PTXReadSRegIntrinsic_r32<"warpsize",
[Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;

// sm90+, PTX7.8+
defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;

// Note: Since clusters are subdivisions of the grid, we conservatively use the
// maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
// practice, the clusterid will likely be much smaller. The CUDA programming
// guide recommends 8 as a maximum portable value and H100s support 16.

defm int_nvvm_read_ptx_sreg_clusterid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
defm int_nvvm_read_ptx_sreg_nclusterid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
defm int_nvvm_read_ptx_sreg_cluster_ctaid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
defm int_nvvm_read_ptx_sreg_cluster_nctaid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;

def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,16 @@ std::optional<uint64_t> getOverallReqNTID(const Function &F) {
return getVectorProduct(ReqNTID);
}

std::optional<uint64_t> getOverallClusterRank(const Function &F) {
// maxclusterrank and cluster_dim are mutually exclusive.
if (const auto ClusterRank = getMaxClusterRank(F))
return ClusterRank;

// Note: The semantics here are a bit strange. See getMaxNTID.
const auto ClusterDim = getClusterDim(F);
return getVectorProduct(ClusterDim);
}

// Looks up the "nvvm.maxclusterrank" function attribute on F through
// getFnAttrParsedInt and returns its result.
// NOTE(review): presumably yields std::nullopt when the attribute is absent —
// confirm against getFnAttrParsedInt.
std::optional<unsigned> getMaxClusterRank(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXUtilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &);

std::optional<uint64_t> getOverallMaxNTID(const Function &);
std::optional<uint64_t> getOverallReqNTID(const Function &);
std::optional<uint64_t> getOverallClusterRank(const Function &);

std::optional<unsigned> getMaxClusterRank(const Function &);
std::optional<unsigned> getMinCTASm(const Function &);
Expand Down
126 changes: 64 additions & 62 deletions llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,87 +58,89 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
}

static bool runNVVMIntrRange(Function &F) {
struct {
unsigned x, y, z;
} MaxBlockSize, MaxGridSize;
struct Vector3 {
unsigned X, Y, Z;
};

const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
// All these annotations are only valid for kernel functions.
if (!isKernelFunction(F))
return false;

MaxBlockSize.x = std::min(1024u, MetadataNTID);
MaxBlockSize.y = std::min(1024u, MetadataNTID);
MaxBlockSize.z = std::min(64u, MetadataNTID);
const auto OverallReqNTID = getOverallReqNTID(F);
const auto OverallMaxNTID = getOverallMaxNTID(F);
const auto OverallClusterRank = getOverallClusterRank(F);

MaxGridSize.x = 0x7fffffff;
MaxGridSize.y = 0xffff;
MaxGridSize.z = 0xffff;
// If this function lacks any range information, do nothing.
if (!(OverallReqNTID || OverallMaxNTID || OverallClusterRank))
return false;

// Go through the calls in this function.
bool Changed = false;
for (Instruction &I : instructions(F)) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;
const unsigned FunctionNTID = OverallReqNTID.value_or(
OverallMaxNTID.value_or(std::numeric_limits<unsigned>::max()));

const unsigned FunctionClusterRank =
OverallClusterRank.value_or(std::numeric_limits<unsigned>::max());

const Vector3 MaxBlockSize{std::min(1024u, FunctionNTID),
std::min(1024u, FunctionNTID),
std::min(64u, FunctionNTID)};

// We conservatively use the maximum grid size as an upper bound for the
// cluster rank.
const Vector3 MaxClusterRank{std::min(0x7fffffffu, FunctionClusterRank),
std::min(0xffffu, FunctionClusterRank),
std::min(0xffffu, FunctionClusterRank)};

const auto ProccessIntrinsic = [&](IntrinsicInst *II) -> bool {
switch (II->getIntrinsicID()) {
// Index within block
case Intrinsic::nvvm_read_ptx_sreg_tid_x:
Changed |= addRangeAttr(0, MaxBlockSize.x, II);
break;
return addRangeAttr(0, MaxBlockSize.X, II);
case Intrinsic::nvvm_read_ptx_sreg_tid_y:
Changed |= addRangeAttr(0, MaxBlockSize.y, II);
break;
return addRangeAttr(0, MaxBlockSize.Y, II);
case Intrinsic::nvvm_read_ptx_sreg_tid_z:
Changed |= addRangeAttr(0, MaxBlockSize.z, II);
break;
return addRangeAttr(0, MaxBlockSize.Z, II);

// Block size
case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
break;
return addRangeAttr(1, MaxBlockSize.X + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
break;
return addRangeAttr(1, MaxBlockSize.Y + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
break;

// Index within grid
case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
Changed |= addRangeAttr(0, MaxGridSize.x, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
Changed |= addRangeAttr(0, MaxGridSize.y, II);
return addRangeAttr(1, MaxBlockSize.Z + 1, II);

// Cluster size
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x:
return addRangeAttr(0, MaxClusterRank.X, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y:
return addRangeAttr(0, MaxClusterRank.Y, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z:
return addRangeAttr(0, MaxClusterRank.Z, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x:
return addRangeAttr(1, MaxClusterRank.X + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y:
return addRangeAttr(1, MaxClusterRank.Y + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z:
return addRangeAttr(1, MaxClusterRank.Z + 1, II);

case Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank:
if (OverallClusterRank)
return addRangeAttr(0, FunctionClusterRank, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
Changed |= addRangeAttr(0, MaxGridSize.z, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank:
if (OverallClusterRank)
return addRangeAttr(1, FunctionClusterRank + 1, II);
break;

// Grid size
case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
break;

// warp size is constant 32.
case Intrinsic::nvvm_read_ptx_sreg_warpsize:
Changed |= addRangeAttr(32, 32 + 1, II);
break;

// Lane ID is [0..warpsize)
case Intrinsic::nvvm_read_ptx_sreg_laneid:
Changed |= addRangeAttr(0, 32, II);
break;

default:
break;
return false;
}
}
return false;
};

// Go through the calls in this function.
bool Changed = false;
for (Instruction &I : instructions(F))
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
Changed |= ProccessIntrinsic(II);

return Changed;
}
Expand Down
Loading