Skip to content

Commit 631c6e8

Browse files
authored
[CUDA] Add support for CUDA-12.3 and sm_90a (#74895)
1 parent ced631e commit 631c6e8

File tree

11 files changed

+66
-17
lines changed

11 files changed

+66
-17
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,9 @@ CUDA/HIP Language Changes
973973
CUDA Support
974974
^^^^^^^^^^^^
975975

976+
- Clang now supports CUDA SDK up to 12.3
977+
- Added support for sm_90a
978+
976979
AIX Support
977980
^^^^^^^^^^^
978981

clang/include/clang/Basic/BuiltinsNVPTX.def

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@
2626
#pragma push_macro("SM_87")
2727
#pragma push_macro("SM_89")
2828
#pragma push_macro("SM_90")
29-
#define SM_90 "sm_90"
29+
#pragma push_macro("SM_90a")
30+
#define SM_90a "sm_90a"
31+
#define SM_90 "sm_90|" SM_90a
3032
#define SM_89 "sm_89|" SM_90
3133
#define SM_87 "sm_87|" SM_89
3234
#define SM_86 "sm_86|" SM_87
@@ -56,7 +58,11 @@
5658
#pragma push_macro("PTX78")
5759
#pragma push_macro("PTX80")
5860
#pragma push_macro("PTX81")
59-
#define PTX81 "ptx81"
61+
#pragma push_macro("PTX82")
62+
#pragma push_macro("PTX83")
63+
#define PTX83 "ptx83"
64+
#define PTX82 "ptx82|" PTX83
65+
#define PTX81 "ptx81|" PTX82
6066
#define PTX80 "ptx80|" PTX81
6167
#define PTX78 "ptx78|" PTX80
6268
#define PTX77 "ptx77|" PTX78
@@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
10551061
#pragma pop_macro("SM_87")
10561062
#pragma pop_macro("SM_89")
10571063
#pragma pop_macro("SM_90")
1064+
#pragma pop_macro("SM_90a")
10581065
#pragma pop_macro("PTX42")
10591066
#pragma pop_macro("PTX60")
10601067
#pragma pop_macro("PTX61")
@@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
10721079
#pragma pop_macro("PTX78")
10731080
#pragma pop_macro("PTX80")
10741081
#pragma pop_macro("PTX81")
1082+
#pragma pop_macro("PTX82")
1083+
#pragma pop_macro("PTX83")

clang/include/clang/Basic/Cuda.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,11 @@ enum class CudaVersion {
3939
CUDA_118,
4040
CUDA_120,
4141
CUDA_121,
42-
FULLY_SUPPORTED = CUDA_118,
42+
CUDA_122,
43+
CUDA_123,
44+
FULLY_SUPPORTED = CUDA_123,
4345
PARTIALLY_SUPPORTED =
44-
CUDA_121, // Partially supported. Proceed with a warning.
46+
CUDA_123, // Partially supported. Proceed with a warning.
4547
NEW = 10000, // Too new. Issue a warning, but allow using it.
4648
};
4749
const char *CudaVersionToString(CudaVersion V);
@@ -71,6 +73,7 @@ enum class CudaArch {
7173
SM_87,
7274
SM_89,
7375
SM_90,
76+
SM_90a,
7477
GFX600,
7578
GFX601,
7679
GFX602,

clang/lib/Basic/Cuda.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
3939
CUDA_ENTRY(11, 8),
4040
CUDA_ENTRY(12, 0),
4141
CUDA_ENTRY(12, 1),
42+
CUDA_ENTRY(12, 2),
43+
CUDA_ENTRY(12, 3),
4244
{"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
4345
{"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
4446
};
@@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = {
9395
SM(87), // Jetson/Drive AGX Orin
9496
SM(89), // Ada Lovelace
9597
SM(90), // Hopper
98+
SM(90a), // Hopper
9699
GFX(600), // gfx600
97100
GFX(601), // gfx601
98101
GFX(602), // gfx602
@@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) {
209212
case CudaArch::SM_89:
210213
case CudaArch::SM_90:
211214
return CudaVersion::CUDA_118;
215+
case CudaArch::SM_90a:
216+
return CudaVersion::CUDA_120;
212217
default:
213218
llvm_unreachable("invalid enum");
214219
}

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
262262
case CudaArch::SM_89:
263263
return "890";
264264
case CudaArch::SM_90:
265+
case CudaArch::SM_90a:
265266
return "900";
266267
}
267268
llvm_unreachable("unhandled CudaArch");
268269
}();
269270
Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
271+
if (GPU == CudaArch::SM_90a)
272+
Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
270273
}
271274
}
272275

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3483,6 +3483,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
34833483
case CudaArch::SM_87:
34843484
case CudaArch::SM_89:
34853485
case CudaArch::SM_90:
3486+
case CudaArch::SM_90a:
34863487
case CudaArch::GFX600:
34873488
case CudaArch::GFX601:
34883489
case CudaArch::GFX602:

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
7878
return CudaVersion::CUDA_120;
7979
if (raw_version < 12020)
8080
return CudaVersion::CUDA_121;
81+
if (raw_version < 12030)
82+
return CudaVersion::CUDA_122;
83+
if (raw_version < 12040)
84+
return CudaVersion::CUDA_123;
8185
return CudaVersion::NEW;
8286
}
8387

@@ -671,6 +675,8 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
671675
case CudaVersion::CUDA_##CUDA_VER: \
672676
PtxFeature = "+ptx" #PTX_VER; \
673677
break;
678+
CASE_CUDA_VERSION(123, 83);
679+
CASE_CUDA_VERSION(122, 82);
674680
CASE_CUDA_VERSION(121, 81);
675681
CASE_CUDA_VERSION(120, 80);
676682
CASE_CUDA_VERSION(118, 78);

clang/test/Misc/target-invalid-cpu-note.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
// RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
3131
// NVPTX: error: unknown target CPU 'not-a-cpu'
32-
// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
32+
// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
3333

3434
// RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
3535
// R600: error: unknown target CPU 'not-a-cpu'

llvm/lib/Target/NVPTX/NVPTX.td

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,24 @@ include "NVPTXInstrInfo.td"
2424
// TableGen in NVPTXGenSubtarget.inc.
2525
//===----------------------------------------------------------------------===//
2626

27-
class FeatureSM<int version>:
28-
SubtargetFeature<"sm_"# version, "SmVersion",
29-
"" # version,
30-
"Target SM " # version>;
31-
def SM90a: FeatureSM<90>;
27+
class FeatureSM<string sm, int value>:
28+
SubtargetFeature<"sm_"# sm, "FullSmVersion",
29+
"" # value,
30+
"Target SM " # sm>;
3231

3332
class FeaturePTX<int version>:
3433
SubtargetFeature<"ptx"# version, "PTXVersion",
3534
"" # version,
3635
"Use PTX version " # version>;
3736

38-
foreach version = [20, 21, 30, 32, 35, 37, 50, 52, 53,
39-
60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
40-
def SM#version: FeatureSM<version>;
37+
foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
38+
60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
39+
def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
40+
41+
def SM90a: FeatureSM<"90a", 901>;
4142

4243
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65,
43-
70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81] in
44+
70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
4445
def PTX#version: FeaturePTX<version>;
4546

4647
//===----------------------------------------------------------------------===//

llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
3636

3737
ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
3838

39+
// Re-map SM version numbers. SmVersion carries the regular SMs which do
40+
// have relative order, while FullSmVersion allows distinguishing sm_90 from
41+
// sm_90a, which would *not* be a subset of sm_91.
42+
SmVersion = getSmVersion();
43+
3944
// Set default to PTX 6.0 (CUDA 9.0)
4045
if (PTXVersion == 0) {
4146
PTXVersion = 60;
@@ -48,7 +53,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
4853
const std::string &FS,
4954
const NVPTXTargetMachine &TM)
5055
: NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
51-
SmVersion(20), TM(TM),
56+
FullSmVersion(200), SmVersion(getSmVersion()), TM(TM),
5257
TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {}
5358

5459
bool NVPTXSubtarget::hasImageHandles() const {

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,12 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
3535
// PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
3636
unsigned PTXVersion;
3737

38-
// SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
38+
// Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
39+
// sm_90a == 901
40+
unsigned int FullSmVersion;
41+
42+
// SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
43+
// FullSmVersion.
3944
unsigned int SmVersion;
4045

4146
const NVPTXTargetMachine &TM;
@@ -80,7 +85,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
8085
bool allowFP16Math() const;
8186
bool hasMaskOperator() const { return PTXVersion >= 71; }
8287
bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
83-
unsigned int getSmVersion() const { return SmVersion; }
88+
unsigned int getFullSmVersion() const { return FullSmVersion; }
89+
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
90+
// GPUs with "a" suffix include architecture-accelerated features that
91+
// are supported on the specified architecture only, hence such targets do not
92+
// follow the onion layer model. hasAAFeatures() allows distinguishing such
93+
// GPU variants from the base GPU architecture.
94+
// - 0 represents base GPU model,
95+
// - non-zero value identifies particular architecture-accelerated variant.
96+
bool hasAAFeatures() const { return getFullSmVersion() % 10; }
8497
std::string getTargetName() const { return TargetName; }
8598

8699
// Get maximum value of required alignments among the supported data types.

0 commit comments

Comments
 (0)