-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) #123398
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang-driver @llvm/pr-subscribers-clang-codegen Author: Sergey Kozub (sergey-kozub) ChangesCUDA 12.8 supports PTX 8.6 which enables architecture "sm100a" (supports Blackwell-specific instructions). Full diff: https://github.com/llvm/llvm-project/pull/123398.diff 7 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 969dd9e41ebfa3..37b4e6ff77fda6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -28,7 +28,9 @@
#pragma push_macro("SM_90")
#pragma push_macro("SM_90a")
#pragma push_macro("SM_100")
-#define SM_100 "sm_100"
+#pragma push_macro("SM_100a")
+#define SM_100a "sm_100a"
+#define SM_100 "sm_100|" SM_100a
#define SM_90a "sm_90a"
#define SM_90 "sm_90|" SM_90a "|" SM_100
#define SM_89 "sm_89|" SM_90
@@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
#pragma pop_macro("SM_90")
#pragma pop_macro("SM_90a")
#pragma pop_macro("SM_100")
+#pragma pop_macro("SM_100a")
#pragma pop_macro("PTX42")
#pragma pop_macro("PTX60")
#pragma pop_macro("PTX61")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index c2a4addf488df1..1cdfc8178db843 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -44,9 +44,12 @@ enum class CudaVersion {
CUDA_124,
CUDA_125,
CUDA_126,
+ CUDA_127,
+ CUDA_128,
+ CUDA_129,
FULLY_SUPPORTED = CUDA_123,
PARTIALLY_SUPPORTED =
- CUDA_126, // Partially supported. Proceed with a warning.
+ CUDA_129, // Partially supported. Proceed with a warning.
NEW = 10000, // Too new. Issue a warning, but allow using it.
};
const char *CudaVersionToString(CudaVersion V);
@@ -80,6 +83,7 @@ enum class OffloadArch {
SM_90,
SM_90a,
SM_100,
+ SM_100a,
GFX600,
GFX601,
GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index d56609a2a8f24a..692ab7c319d8bd 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
CUDA_ENTRY(12, 4),
CUDA_ENTRY(12, 5),
CUDA_ENTRY(12, 6),
+ CUDA_ENTRY(12, 7),
+ CUDA_ENTRY(12, 8),
+ CUDA_ENTRY(12, 9),
{"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
{"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
};
@@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = {
SM(90), // Hopper
SM(90a), // Hopper
SM(100), // Blackwell
+ SM(100a), // Blackwell
GFX(600), // gfx600
GFX(601), // gfx601
GFX(602), // gfx602
@@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
case OffloadArch::SM_90a:
return CudaVersion::CUDA_120;
case OffloadArch::SM_100:
- return CudaVersion::NEW; // TODO: use specific CUDA version once it's
- // public.
+ case OffloadArch::SM_100a:
+ return CudaVersion::CUDA_128;
default:
llvm_unreachable("invalid enum");
}
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index dbc3fec3657610..56efad90cb7c84 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
case OffloadArch::SM_90a:
return "900";
case OffloadArch::SM_100:
+ case OffloadArch::SM_100a:
return "1000";
}
llvm_unreachable("unhandled OffloadArch");
@@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
if (GPU == OffloadArch::SM_90a)
Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
+ if (GPU == OffloadArch::SM_100a)
+ Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1");
}
}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87c3635ed3f70e..c13928f61a7481 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
case OffloadArch::SM_90:
case OffloadArch::SM_90a:
case OffloadArch::SM_100:
+ case OffloadArch::SM_100a:
case OffloadArch::GFX600:
case OffloadArch::GFX601:
case OffloadArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 8967115bcc73d9..8cb82fe4c07d3d 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -89,6 +89,12 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
return CudaVersion::CUDA_125;
if (raw_version < 12070)
return CudaVersion::CUDA_126;
+ if (raw_version < 12080)
+ return CudaVersion::CUDA_127;
+ if (raw_version < 12090)
+ return CudaVersion::CUDA_128;
+ if (raw_version < 12100)
+ return CudaVersion::CUDA_129;
return CudaVersion::NEW;
}
@@ -682,6 +688,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
case CudaVersion::CUDA_##CUDA_VER: \
PtxFeature = "+ptx" #PTX_VER; \
break;
+ CASE_CUDA_VERSION(129, 86);
+ CASE_CUDA_VERSION(128, 86);
+ CASE_CUDA_VERSION(127, 85);
CASE_CUDA_VERSION(126, 85);
CASE_CUDA_VERSION(125, 85);
CASE_CUDA_VERSION(124, 84);
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 9af8715ef52ae7..3ca8b4d294079c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -39,6 +39,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
def SM90a: FeatureSM<"90a", 901>;
+def SM100a: FeatureSM<"100a", 1001>;
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
70, 71, 72, 73, 74, 75, 76, 77, 78,
@@ -74,6 +75,7 @@ def : Proc<"sm_89", [SM89, PTX78]>;
def : Proc<"sm_90", [SM90, PTX78]>;
def : Proc<"sm_90a", [SM90a, PTX80]>;
def : Proc<"sm_100", [SM100, PTX86]>;
+def : Proc<"sm_100a", [SM100a, PTX86]>;
def NVPTXInstrInfo : InstrInfo {
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The updates look good to me.
04313ed
to
2f90900
Compare
It's not clear from the logs why the "builkite" checks have failed. Running "check-all" target locally suceeds. |
2f90900
to
dc43fbf
Compare
This is now resolved by fixing the NVPTX test. |
Remove CUDA_127 and CUDA_129 defines incorrectly added in #123398
Remove CUDA_127 and CUDA_129 defines incorrectly added in llvm/llvm-project#123398
@@ -682,6 +688,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple, | |||
case CudaVersion::CUDA_##CUDA_VER: \ | |||
PtxFeature = "+ptx" #PTX_VER; \ | |||
break; | |||
CASE_CUDA_VERSION(129, 87); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like we forgot about adding PTX87 to the builtins in the clang/include/clang/Basic/BuiltinsNVPTX.def
Right now cc1
ends up running with -target-features +ptx87
, but the target builtins only enumerate up to ptx86, so the compilation fails with
In file included from trunk/include/__clang_cuda_runtime_wrapper.h:473:
trunk/include/__clang_cuda_intrinsics.h:179:40: error: '__nvvm_shfl_sync_down_i32' needs target feature ptx60|ptx61|ptx62|ptx63|ptx64|ptx65|ptx70|ptx71|ptx72|ptx73|ptx74|ptx75|ptx76|ptx77|ptx78|ptx80|ptx81|ptx82|ptx83|ptx84|ptx85|ptx86
179 | __MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
| ^
Fixes build break with CUDA-12.8 introduced in #123398
CUDA 12.8 supports PTX 8.7 which enables architecture "sm100a" (supports Blackwell-specific instructions).