[CUDA] Add support for CUDA-12.3 and sm_90a (#74895)

Artem-B · web-flow · commit 631c6e834cb0 · 2023-12-11T12:18:28.000-08:00
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
@@ -973,6 +973,9 @@ CUDA/HIP Language Changes
 CUDA Support
 ^^^^^^^^^^^^
 
+- Clang now supports CUDA SDK up to 12.3
+- Added support for sm_90a
+
 AIX Support
 ^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -26,7 +26,9 @@
 #pragma push_macro("SM_87")
 #pragma push_macro("SM_89")
 #pragma push_macro("SM_90")
-#define SM_90 "sm_90"
+#pragma push_macro("SM_90a")
+#define SM_90a "sm_90a"
+#define SM_90 "sm_90|" SM_90a
 #define SM_89 "sm_89|" SM_90
 #define SM_87 "sm_87|" SM_89
 #define SM_86 "sm_86|" SM_87
@@ -56,7 +58,11 @@
 #pragma push_macro("PTX78")
 #pragma push_macro("PTX80")
 #pragma push_macro("PTX81")
-#define PTX81 "ptx81"
+#pragma push_macro("PTX82")
+#pragma push_macro("PTX83")
+#define PTX83 "ptx83"
+#define PTX82 "ptx82|" PTX83
+#define PTX81 "ptx81|" PTX82
 #define PTX80 "ptx80|" PTX81
 #define PTX78 "ptx78|" PTX80
 #define PTX77 "ptx77|" PTX78
@@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
 #pragma pop_macro("SM_87")
 #pragma pop_macro("SM_89")
 #pragma pop_macro("SM_90")
+#pragma pop_macro("SM_90a")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
@@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
 #pragma pop_macro("PTX78")
 #pragma pop_macro("PTX80")
 #pragma pop_macro("PTX81")
+#pragma pop_macro("PTX82")
+#pragma pop_macro("PTX83")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
@@ -39,9 +39,11 @@ enum class CudaVersion {
   CUDA_118,
   CUDA_120,
   CUDA_121,
-  FULLY_SUPPORTED = CUDA_118,
+  CUDA_122,
+  CUDA_123,
+  FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-      CUDA_121, // Partially supported. Proceed with a warning.
+      CUDA_123, // Partially supported. Proceed with a warning.
   NEW = 10000,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -71,6 +73,7 @@ enum class CudaArch {
   SM_87,
   SM_89,
   SM_90,
+  SM_90a,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
@@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
     CUDA_ENTRY(11, 8),
     CUDA_ENTRY(12, 0),
     CUDA_ENTRY(12, 1),
+    CUDA_ENTRY(12, 2),
+    CUDA_ENTRY(12, 3),
     {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
     {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = {
     SM(87),                          // Jetson/Drive AGX Orin
     SM(89),                          // Ada Lovelace
     SM(90),                          // Hopper
+    SM(90a),                         // Hopper
     GFX(600),  // gfx600
     GFX(601),  // gfx601
     GFX(602),  // gfx602
@@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) {
   case CudaArch::SM_89:
   case CudaArch::SM_90:
     return CudaVersion::CUDA_118;
+  case CudaArch::SM_90a:
+    return CudaVersion::CUDA_120;
   default:
     llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case CudaArch::SM_89:
         return "890";
       case CudaArch::SM_90:
+      case CudaArch::SM_90a:
         return "900";
       }
       llvm_unreachable("unhandled CudaArch");
     }();
     Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
+    if (GPU == CudaArch::SM_90a)
+      Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
   }
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -3483,6 +3483,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
       case CudaArch::SM_87:
       case CudaArch::SM_89:
       case CudaArch::SM_90:
+      case CudaArch::SM_90a:
       case CudaArch::GFX600:
       case CudaArch::GFX601:
       case CudaArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -78,6 +78,10 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
     return CudaVersion::CUDA_120;
   if (raw_version < 12020)
     return CudaVersion::CUDA_121;
+  if (raw_version < 12030)
+    return CudaVersion::CUDA_122;
+  if (raw_version < 12040)
+    return CudaVersion::CUDA_123;
   return CudaVersion::NEW;
 }
 
@@ -671,6 +675,8 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   case CudaVersion::CUDA_##CUDA_VER:                                           \
     PtxFeature = "+ptx" #PTX_VER;                                              \
     break;
+    CASE_CUDA_VERSION(123, 83);
+    CASE_CUDA_VERSION(122, 82);
     CASE_CUDA_VERSION(121, 81);
     CASE_CUDA_VERSION(120, 80);
     CASE_CUDA_VERSION(118, 78);
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
@@ -29,7 +29,7 @@
 
 // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
 // NVPTX: error: unknown target CPU 'not-a-cpu'
-// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
+// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
 
 // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
 // R600: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -24,23 +24,24 @@ include "NVPTXInstrInfo.td"
 //   TableGen in NVPTXGenSubtarget.inc.
 //===----------------------------------------------------------------------===//
 
-class FeatureSM<int version>:
-   SubtargetFeature<"sm_"# version, "SmVersion",
-                    "" # version,
-                    "Target SM " # version>;
-def SM90a: FeatureSM<90>;
+class FeatureSM<string sm, int value>:
+   SubtargetFeature<"sm_"# sm, "FullSmVersion",
+                    "" # value,
+                    "Target SM " # sm>;
 
 class FeaturePTX<int version>:
    SubtargetFeature<"ptx"# version, "PTXVersion",
                     "" # version,
                     "Use PTX version " # version>;
 
-foreach version = [20, 21, 30, 32, 35, 37, 50, 52, 53,
-                   60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
-  def SM#version: FeatureSM<version>;
+foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
+              60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
+  def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
+
+def SM90a: FeatureSM<"90a", 901>;
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65,
-                   70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81] in
+                   70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
   def PTX#version: FeaturePTX<version>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -36,6 +36,11 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
 
     ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
 
+    // Re-map SM version numbers, SmVersion carries the regular SMs which do
+    // have relative order, while FullSmVersion allows distinguishing sm_90 from
+    // sm_90a, which would *not* be a subset of sm_91.
+    SmVersion = getSmVersion();
+
     // Set default to PTX 6.0 (CUDA 9.0)
     if (PTXVersion == 0) {
       PTXVersion = 60;
@@ -48,7 +53,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS,
                                const NVPTXTargetMachine &TM)
     : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
-      SmVersion(20), TM(TM),
+      FullSmVersion(200), SmVersion(getSmVersion()), TM(TM),
       TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {}
 
 bool NVPTXSubtarget::hasImageHandles() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -35,7 +35,12 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
   unsigned PTXVersion;
 
-  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
+  // sm_90a == 901
+  unsigned int FullSmVersion;
+
+  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
+  // FullSmVersion.
   unsigned int SmVersion;
 
   const NVPTXTargetMachine &TM;
@@ -80,7 +85,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool allowFP16Math() const;
   bool hasMaskOperator() const { return PTXVersion >= 71; }
   bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
-  unsigned int getSmVersion() const { return SmVersion; }
+  unsigned int getFullSmVersion() const { return FullSmVersion; }
+  unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
+  // GPUs with "a" suffix have include architecture-accelerated features that
+  // are supported on the specified architecture only, hence such targets do not
+  // follow the onion layer model. hasAAFeatures() allows distinguishing such
+  // GPU variants from the base GPU architecture.
+  // - 0 represents base GPU model,
+  // - non-zero value identifies particular architecture-accelerated variant.
+  bool hasAAFeatures() const { return getFullSmVersion() % 10; }
   std::string getTargetName() const { return TargetName; }
 
   // Get maximum value of required alignments among the supported data types.

Original file line number	Diff line number	Diff line change
`@@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,`
`262`	`262`	`case CudaArch::SM_89:`
`263`	`263`	`return "890";`
`264`	`264`	`case CudaArch::SM_90:`
	`265`	`+ case CudaArch::SM_90a:`
`265`	`266`	`return "900";`
`266`	`267`	`}`
`267`	`268`	`llvm_unreachable("unhandled CudaArch");`
`268`	`269`	`}();`
`269`	`270`	`Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);`
	`271`	`+ if (GPU == CudaArch::SM_90a)`
	`272`	`+ Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");`
`270`	`273`	`}`
`271`	`274`	`}`
`272`	`275`