@@ -139,6 +139,16 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
139
139
// MISC
140
140
//
141
141
142
// Limits used below to build Range<RetIndex, ...> properties for the
// special-register read intrinsics.
// WARP_SIZE bounds the laneid register and pins the warpsize register.
+ defvar WARP_SIZE = 32;
143
+
144
// Per-dimension grid limits; consumed by MAX_GRID_ID_RANGE and
// MAX_GRID_NID_RANGE (ctaid/nctaid and the cluster registers).
+ defvar MAX_GRID_SIZE_X = 0x7fffffff;
145
+ defvar MAX_GRID_SIZE_Y = 0xffff;
146
+ defvar MAX_GRID_SIZE_Z = 0xffff;
147
+
148
// Per-dimension block limits; consumed by the tid/ntid definitions.
+ defvar MAX_BLOCK_SIZE_X = 1024;
149
+ defvar MAX_BLOCK_SIZE_Y = 1024;
150
+ defvar MAX_BLOCK_SIZE_Z = 64;
151
+
142
152
// Helper class that concatenates list elements with
143
153
// a given separator 'sep' and returns the result.
144
154
// Handles empty strings.
@@ -4747,26 +4757,33 @@ def int_nvvm_sust_p_3d_v4i32_trap
4747
4757
4748
4758
// Accessing special registers.
4749
4759
4750
// 32-bit special-register read intrinsics. The '-' lines are the previous
// definitions; the '+' lines add an optional 'properties' parameter so that
// extra intrinsic properties (such as Range) can be attached per register.
- class PTXReadSRegIntrinsicNB_r32
4751
- : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
4752
- class PTXReadSRegIntrinsic_r32<string name>
4753
- : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4760
// No-builtin ("NB") variant: returns i32 and takes no operands. 'properties'
// is appended to the fixed attribute list with !listconcat; the default []
// leaves the attribute set exactly as it was before this change.
+ class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
4761
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [],
4762
+ !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
4754
4763
4755
- multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
4764
// Same as the NB class, plus a clang builtin named
// "__nvvm_read_ptx_sreg_" # name. 'properties' is forwarded unchanged.
+ class PTXReadSRegIntrinsic_r32<string name,
4765
+ list<IntrinsicProperty> properties = []>
4766
+ : PTXReadSRegIntrinsicNB_r32<properties>,
4767
+ ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4768
+
4769
// Emits one r32 intrinsic per component suffix (_x, _y, _z, _w).
// 'properties' holds one property list per suffix, in suffix order; it is
// indexed with properties[i], so callers must supply exactly one entry for
// each of the four suffixes (the default is four empty lists).
+ multiclass PTXReadSRegIntrinsic_v4i32<string regname,
4770
+ list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4756
4771
// FIXME: Do we need the 128-bit integer type version?
4757
4772
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
4758
4773
4759
4774
// FIXME: Enable this once v4i32 support is enabled in back-end.
4760
4775
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
4761
- foreach suffix = ["_x", "_y", "_z", "_w"] in
4762
- def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
4776
// Iterate by index rather than by value so the same index selects both the
// suffix and its matching property list.
+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4777
+ foreach i = !range(suffixes) in
4778
+ def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
4763
4779
}
4764
4780
4765
4781
// Same, but without automatic clang builtins. It will be used for
4766
4782
// registers that require particular GPU or PTX version.
4767
// No-builtin counterpart of PTXReadSRegIntrinsic_v4i32: emits _x/_y/_z/_w
// records from PTXReadSRegIntrinsicNB_r32. The '-' lines are the previous
// parameterless form; the '+' lines add 'properties', one property list per
// suffix in suffix order (indexed as properties[i], so four entries are
// required; the default is four empty lists).
- multiclass PTXReadSRegIntrinsicNB_v4i32 {
4768
- foreach suffix = ["_x", "_y", "_z", "_w"] in
4769
- def suffix : PTXReadSRegIntrinsicNB_r32;
4783
+ multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4784
+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4785
+ foreach i = !range(suffixes) in
4786
+ def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
4770
4787
}
4771
4788
4772
4789
class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4799,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
4782
4799
: Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
4783
4800
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4784
4801
4785
// tid / ntid / laneid. The '-' lines are the previous definitions without
// range information; the '+' lines attach a Range property to each result.
// NOTE(review): the bounds below read as half-open [lo, hi) -- the count
// register (ntid) uses !add(MAX, 1) as its upper bound -- confirm against the
// LLVM range attribute documentation.
- defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
4786
- defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
4802
// tid.{x,y,z}: 0 up to (but excluding) the per-dimension block limit.
// tid.w: Range<RetIndex, 0, 1>.
+ defm int_nvvm_read_ptx_sreg_tid
4803
+ : PTXReadSRegIntrinsic_v4i32<"tid",
4804
+ [[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
4805
+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
4806
+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
4807
+ [Range<RetIndex, 0, 1>]]>;
4808
+
4809
// ntid.{x,y,z}: lower bound 1, upper bound MAX_BLOCK_SIZE_* + 1.
// ntid.w: Range<RetIndex, 0, 1>, same as tid.w.
+ defm int_nvvm_read_ptx_sreg_ntid
4810
+ : PTXReadSRegIntrinsic_v4i32<"ntid",
4811
+ [[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
4812
+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
4813
+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
4814
+ [Range<RetIndex, 0, 1>]]>;
4815
+
4816
// laneid: bounded by WARP_SIZE.
+ def int_nvvm_read_ptx_sreg_laneid
4817
+ : PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
4787
4818
4788
- def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
4789
4819
// warpid / nwarpid: unchanged context lines. Only the register name is
// passed, so 'properties' takes its default (no Range property is attached).
def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
4790
4820
def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
4791
4821
4792
// ctaid / nctaid. The '-' lines are the previous definitions without range
// information; the '+' lines introduce two shared per-suffix range lists and
// pass them to the multiclass.
- defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
4793
- defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
4822
// Ranges for index-style registers: lower bound 0, upper bound
// MAX_GRID_SIZE_* for x/y/z; the fourth (_w) entry is Range<RetIndex, 0, 1>.
+ defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
4823
+ [Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
4824
+ [Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
4825
+ [Range<RetIndex, 0, 1>]];
4826
+
4827
// Ranges for count-style registers: lower bound 1, upper bound
// MAX_GRID_SIZE_* + 1 for x/y/z; the fourth (_w) entry is again
// Range<RetIndex, 0, 1>.
+ defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
4828
+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
4829
+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
4830
+ [Range<RetIndex, 0, 1>]];
4831
+
4832
+ defm int_nvvm_read_ptx_sreg_ctaid
4833
+ : PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
4834
+
4835
+ defm int_nvvm_read_ptx_sreg_nctaid
4836
+ : PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
4794
4837
4795
4838
// smid / nsmid: unchanged context lines; declared with the default (empty)
// 'properties', so no Range property is attached.
def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
4796
4839
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4860,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
4817
4860
// pm2 / pm3: unchanged context lines. They use PTXReadNCSRegIntrinsic_r32,
// which is defined outside this excerpt.
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
4818
4861
def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
4819
4862
4820
// warpsize: the '-' line is the previous definition; the '+' lines attach a
// range whose bounds are WARP_SIZE and WARP_SIZE + 1.
// NOTE(review): with a half-open range this admits only the single value
// WARP_SIZE -- confirm against the LLVM range attribute documentation.
- def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
4863
+ def int_nvvm_read_ptx_sreg_warpsize
4864
+ : PTXReadSRegIntrinsic_r32<"warpsize",
4865
+ [Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
4821
4866
4822
4867
// sm90+, PTX7.8+
4823
// Cluster special registers (no clang builtins). The '-' lines are the
// previous definitions without range information; the '+' lines reuse the
// grid-wide range lists MAX_GRID_ID_RANGE / MAX_GRID_NID_RANGE.
- defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
4824
- defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
4825
- defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
4826
- defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
4868
+
4869
+ // Note: Since clusters are subdivisions of the grid, we conservatively use the
4870
+ // maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
4871
+ // practice, the clusterid will likely be much smaller. The CUDA programming
4872
+ // guide recommends 8 as a maximum portable value and H100s support 16.
// NOTE(review): the 8 / 16 figures look like cluster sizes (thread blocks per
// cluster), which would bound cluster_ctaid / cluster_nctaid rather than
// clusterid -- confirm against the CUDA programming guide before tightening.
4873
+
4874
// Index-style registers (clusterid, cluster_ctaid) take MAX_GRID_ID_RANGE;
// count-style registers (nclusterid, cluster_nctaid) take MAX_GRID_NID_RANGE.
+ defm int_nvvm_read_ptx_sreg_clusterid
4875
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4876
+ defm int_nvvm_read_ptx_sreg_nclusterid
4877
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
4878
+ defm int_nvvm_read_ptx_sreg_cluster_ctaid
4879
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4880
+ defm int_nvvm_read_ptx_sreg_cluster_nctaid
4881
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
4827
4882
4828
4883
// cluster_ctarank / cluster_nctarank: unchanged context lines. They use the
// no-builtin r32 class with no arguments, so no Range property is attached.
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
4829
4884
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
0 commit comments