Skip to content

Commit 2a948e8

Browse files
committed
[CUDA] Add a pseudo GPU sm_next which allows overrides for SM/PTX versions.
Sometimes users may need to use an older clang with newer SM/PTX versions that clang does not know anything about yet. --offload-arch=sm_next, combined with --cuda-next-sm=X and --cuda-next-ptx=Y, passes the specified SM and PTX versions through to ptxas, which may be able to make sense of them. Or not; it is up to the user to figure out which values they need to make it work. The feature is intended as a stop-gap workaround for situations where clang has not yet caught up with newer CUDA SDK releases. There are no guarantees that it will work with any given combination of clang/CUDA/SM/PTX versions. YMMV. (Note: in the diff below, the pseudo GPU and its options are spelled sm_custom, --cuda-custom-sm= and --cuda-custom-ptx=.)
1 parent f999b32 commit 2a948e8

File tree

17 files changed

+332
-118
lines changed

17 files changed

+332
-118
lines changed

clang/include/clang/Basic/Cuda.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef LLVM_CLANG_BASIC_CUDA_H
1010
#define LLVM_CLANG_BASIC_CUDA_H
1111

12+
#include "llvm/ADT/StringRef.h"
1213
namespace llvm {
1314
class StringRef;
1415
class Twine;
@@ -52,6 +53,42 @@ const char *CudaVersionToString(CudaVersion V);
5253
// Input is "Major.Minor"
5354
CudaVersion CudaStringToVersion(const llvm::Twine &S);
5455

56+
// Enumerates PTX versions. Each named version's value is its version number
// with the dot removed (PTX_32 == 32, PTX_41 == 41 via implicit increment,
// PTX_85 == 85). PTXVersionToFeature() formats that number directly into the
// feature string, so the values must not be renumbered.
// PTX_LAST aliases the highest version listed here.
enum class PTXVersion {
57+
PTX_UNKNOWN = 0,
58+
PTX_32 = 32,
59+
PTX_40 = 40,
60+
PTX_41,
61+
PTX_42,
62+
PTX_43,
63+
PTX_50 = 50,
64+
PTX_60 = 60,
65+
PTX_61,
66+
PTX_62,
67+
PTX_63,
68+
PTX_64,
69+
PTX_65,
70+
PTX_70 = 70,
71+
PTX_71,
72+
PTX_72,
73+
PTX_73,
74+
PTX_74,
75+
PTX_75,
76+
PTX_76,
77+
PTX_77,
78+
PTX_78,
79+
PTX_80 = 80,
80+
PTX_81,
81+
PTX_82,
82+
PTX_83,
83+
PTX_84,
84+
PTX_85,
85+
PTX_LAST = PTX_85,
86+
PTX_custom = 9999, // placeholder for an unknown future version.
87+
};
88+
89+
const std::string PTXVersionToFeature(PTXVersion V);
90+
PTXVersion GetRequiredPTXVersion(CudaVersion V);
91+
5592
enum class OffloadArch {
5693
UNUSED,
5794
UNKNOWN,
@@ -78,6 +115,7 @@ enum class OffloadArch {
78115
SM_89,
79116
SM_90,
80117
SM_90a,
118+
SM_custom,
81119
GFX600,
82120
GFX601,
83121
GFX602,
@@ -160,6 +198,12 @@ const char *OffloadArchToVirtualArchString(OffloadArch A);
160198
// The input should have the form "sm_20".
161199
OffloadArch StringToOffloadArch(llvm::StringRef S);
162200

201+
// Converts custom SM name to its numeric value to be used in __CUDA_ARCH__
202+
// Custom SM name format: `sm_[ID][suffix]`.
203+
// The function returns `ID`*10 or zero on error.
204+
// An optional trailing `a` suffix is accepted and does not affect the result;
// any other suffix makes parsing fail, so the function returns zero.
205+
unsigned CUDACustomSMToArchID(llvm::StringRef S);
206+
163207
/// Get the earliest CudaVersion that supports the given OffloadArch.
164208
CudaVersion MinVersionForOffloadArch(OffloadArch A);
165209

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -743,6 +743,8 @@ def err_drv_invalid_or_unsupported_offload_target : Error<
743743
"invalid or unsupported offload target: '%0'">;
744744
def err_drv_cuda_offload_only_emit_bc : Error<
745745
"CUDA offload target is supported only along with --emit-llvm">;
746+
// The option spellings in this message must match cuda_custom_sm_EQ and
// cuda_custom_ptx_EQ in Options.td ("--cuda-custom-sm=", "--cuda-custom-ptx=").
def err_drv_sm_custom_args : Error<
  "offload target sm_custom requires both --cuda-custom-sm and --cuda-custom-ptx to be specified">;
746748

747749
def warn_drv_jmc_requires_debuginfo : Warning<
748750
"%0 requires debug info. Use %1 or debug options that enable debugger's "

clang/include/clang/Basic/LangOptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,10 @@ class LangOptions : public LangOptionsBase {
579579
// WebAssembly target.
580580
bool NoWasmOpt = false;
581581

582+
// Overrides for the custom SM/PTX variants for CUDA's sm_custom target.
583+
std::string CUDACustomSM;
584+
unsigned CUDACustomPTX = 0;
585+
582586
LangOptions();
583587

584588
/// Set language defaults for the given input language and

clang/include/clang/Driver/Options.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1464,6 +1464,17 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
14641464
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
14651465
}
14661466

1467+
// Overrides for the sm_custom pseudo GPU. Both options are hidden from --help
// and are only parsed when the cuda key path is enabled.
// --cuda-custom-sm= is marshalled into LangOptions::CUDACustomSM (string).
def cuda_custom_sm_EQ : Joined<["--"], "cuda-custom-sm=">,
  Visibility<[ClangOption, CC1Option]>,
  HelpText<"SM version to use for sm_custom GPU">,
  MarshallingInfoString<LangOpts<"CUDACustomSM">>,
  ShouldParseIf<cuda.KeyPath>, Flags<[HelpHidden]>;
// --cuda-custom-ptx= is marshalled into LangOptions::CUDACustomPTX (integer,
// default 0).
def cuda_custom_ptx_EQ : Joined<["--"], "cuda-custom-ptx=">,
  Visibility<[ClangOption, CC1Option]>,
  HelpText<"PTX version to use for sm_custom GPU">,
  MarshallingInfoInt<LangOpts<"CUDACustomPTX">, "0">,
  ShouldParseIf<cuda.KeyPath>, Flags<[HelpHidden]>;
1477+
14671478
// Clang specific/exclusive options for OpenACC.
14681479
def openacc_macro_override
14691480
: Separate<["-"], "fexperimental-openacc-macro-override">,

clang/include/clang/Driver/ToolChain.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,13 @@ class ToolChain {
677677
virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
678678
llvm::opt::ArgStringList &CC1Args,
679679
Action::OffloadKind DeviceOffloadKind) const;
680+
/// [optional] Some toolchains may need more info and need to pass JobAction.
681+
/// This is only intended to augment the function above.
682+
virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
683+
llvm::opt::ArgStringList &CC1Args,
684+
const JobAction &JC) const {
685+
addClangTargetOptions(DriverArgs, CC1Args, JC.getOffloadingDeviceKind());
686+
}
680687

681688
/// Add options that need to be passed to cc1as for this target.
682689
virtual void

clang/lib/Basic/Cuda.cpp

Lines changed: 65 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "llvm/ADT/StringRef.h"
44
#include "llvm/ADT/Twine.h"
55
#include "llvm/Support/ErrorHandling.h"
6+
#include "llvm/Support/FormatVariadic.h"
67
#include "llvm/Support/VersionTuple.h"
78

89
namespace clang {
@@ -11,40 +12,43 @@ struct CudaVersionMapEntry {
1112
const char *Name;
1213
CudaVersion Version;
1314
llvm::VersionTuple TVersion;
15+
PTXVersion PTX;
1416
};
15-
#define CUDA_ENTRY(major, minor) \
17+
#define CUDA_ENTRY(major, minor, ptx) \
1618
{ \
1719
#major "." #minor, CudaVersion::CUDA_##major##minor, \
18-
llvm::VersionTuple(major, minor) \
20+
llvm::VersionTuple(major, minor), PTXVersion::ptx \
1921
}
2022

2123
static const CudaVersionMapEntry CudaNameVersionMap[] = {
22-
CUDA_ENTRY(7, 0),
23-
CUDA_ENTRY(7, 5),
24-
CUDA_ENTRY(8, 0),
25-
CUDA_ENTRY(9, 0),
26-
CUDA_ENTRY(9, 1),
27-
CUDA_ENTRY(9, 2),
28-
CUDA_ENTRY(10, 0),
29-
CUDA_ENTRY(10, 1),
30-
CUDA_ENTRY(10, 2),
31-
CUDA_ENTRY(11, 0),
32-
CUDA_ENTRY(11, 1),
33-
CUDA_ENTRY(11, 2),
34-
CUDA_ENTRY(11, 3),
35-
CUDA_ENTRY(11, 4),
36-
CUDA_ENTRY(11, 5),
37-
CUDA_ENTRY(11, 6),
38-
CUDA_ENTRY(11, 7),
39-
CUDA_ENTRY(11, 8),
40-
CUDA_ENTRY(12, 0),
41-
CUDA_ENTRY(12, 1),
42-
CUDA_ENTRY(12, 2),
43-
CUDA_ENTRY(12, 3),
44-
CUDA_ENTRY(12, 4),
45-
CUDA_ENTRY(12, 5),
46-
{"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
47-
{"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
24+
CUDA_ENTRY(7, 0, PTX_42),
25+
CUDA_ENTRY(7, 5, PTX_43),
26+
CUDA_ENTRY(8, 0, PTX_50),
27+
CUDA_ENTRY(9, 0, PTX_60),
28+
CUDA_ENTRY(9, 1, PTX_61),
29+
CUDA_ENTRY(9, 2, PTX_62),
30+
CUDA_ENTRY(10, 0, PTX_63),
31+
CUDA_ENTRY(10, 1, PTX_64),
32+
CUDA_ENTRY(10, 2, PTX_65),
33+
CUDA_ENTRY(11, 0, PTX_70),
34+
CUDA_ENTRY(11, 1, PTX_71),
35+
CUDA_ENTRY(11, 2, PTX_72),
36+
CUDA_ENTRY(11, 3, PTX_73),
37+
CUDA_ENTRY(11, 4, PTX_74),
38+
CUDA_ENTRY(11, 5, PTX_75),
39+
CUDA_ENTRY(11, 6, PTX_76),
40+
CUDA_ENTRY(11, 7, PTX_77),
41+
CUDA_ENTRY(11, 8, PTX_78),
42+
CUDA_ENTRY(12, 0, PTX_80),
43+
CUDA_ENTRY(12, 1, PTX_81),
44+
CUDA_ENTRY(12, 2, PTX_82),
45+
CUDA_ENTRY(12, 3, PTX_83),
46+
CUDA_ENTRY(12, 4, PTX_84),
47+
CUDA_ENTRY(12, 5, PTX_85),
48+
{"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max()),
49+
PTXVersion::PTX_LAST},
50+
// End of list tombstone
51+
{"unknown", CudaVersion::UNKNOWN, {}, PTXVersion::PTX_42}
4852
};
4953
#undef CUDA_ENTRY
5054

@@ -71,6 +75,20 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) {
7175
return CudaVersion::UNKNOWN;
7276
}
7377

78+
// Builds the "+ptxNN" feature string for a PTX version, where NN is the
// enumerator's numeric value. Versions outside (PTX_UNKNOWN, PTX_LAST], which
// includes PTX_custom, yield an empty string.
const std::string PTXVersionToFeature(PTXVersion V) {
  const bool IsInKnownRange =
      PTXVersion::PTX_UNKNOWN < V && !(PTXVersion::PTX_LAST < V);
  if (!IsInKnownRange)
    return {};
  return llvm::formatv("+ptx{0}", static_cast<unsigned>(V));
}
83+
84+
// Looks up the PTX version recorded for CUDA version V in CudaNameVersionMap.
// Returns PTXVersion::PTX_UNKNOWN when no table entry carries that version.
PTXVersion GetRequiredPTXVersion(CudaVersion V) {
  for (const CudaVersionMapEntry &Entry : CudaNameVersionMap) {
    if (Entry.Version != V)
      continue;
    return Entry.PTX;
  }
  return PTXVersion::PTX_UNKNOWN;
}
91+
7492
namespace {
7593
struct OffloadArchToStringMap {
7694
OffloadArch arch;
@@ -79,9 +97,11 @@ struct OffloadArchToStringMap {
7997
};
8098
} // namespace
8199

82-
#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca}
100+
#define SM2(sm, ca) \
101+
{ OffloadArch::SM_##sm, "sm_" #sm, ca }
83102
#define SM(sm) SM2(sm, "compute_" #sm)
84-
#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"}
103+
#define GFX(gpu) \
104+
{ OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" }
85105
static const OffloadArchToStringMap arch_names[] = {
86106
// clang-format off
87107
{OffloadArch::UNUSED, "", ""},
@@ -96,6 +116,7 @@ static const OffloadArchToStringMap arch_names[] = {
96116
SM(89), // Ada Lovelace
97117
SM(90), // Hopper
98118
SM(90a), // Hopper
119+
SM(custom), // Placeholder for a new arch.
99120
GFX(600), // gfx600
100121
GFX(601), // gfx601
101122
GFX(602), // gfx602
@@ -181,6 +202,18 @@ OffloadArch StringToOffloadArch(llvm::StringRef S) {
181202
return result->arch;
182203
}
183204

205+
// Converts a custom SM name of the form `sm_[ID][suffix]` into the numeric
// arch ID `ID`*10. Returns zero on any error:
//  - the name does not start with `sm_`;
//  - what remains after stripping one optional trailing `a` is not a base-10
//    unsigned integer (so any other suffix is rejected);
//  - `ID`*10 would not fit in `unsigned` (previously this wrapped around and
//    produced a bogus non-zero value).
unsigned CUDACustomSMToArchID(llvm::StringRef S) {
  if (!S.starts_with("sm_"))
    return 0;
  S = S.drop_front(3); // skip `sm_`
  if (S.ends_with("a"))
    S = S.drop_back(1);
  unsigned ID = 0;
  if (S.getAsInteger(10, ID))
    return 0; // We've failed to parse the SM name
  // Reject IDs whose scaled value would overflow instead of wrapping.
  if (ID > std::numeric_limits<unsigned>::max() / 10)
    return 0;
  return ID * 10;
}
216+
184217
CudaVersion MinVersionForOffloadArch(OffloadArch A) {
185218
if (A == OffloadArch::UNKNOWN)
186219
return CudaVersion::UNKNOWN;
@@ -221,6 +254,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
221254
return CudaVersion::CUDA_118;
222255
case OffloadArch::SM_90a:
223256
return CudaVersion::CUDA_120;
257+
case clang::OffloadArch::SM_custom:
258+
return CudaVersion::UNKNOWN;
224259
default:
225260
llvm_unreachable("invalid enum");
226261
}

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
#include "NVPTX.h"
1414
#include "Targets.h"
1515
#include "clang/Basic/Builtins.h"
16+
#include "clang/Basic/Cuda.h"
1617
#include "clang/Basic/MacroBuilder.h"
1718
#include "clang/Basic/TargetBuiltins.h"
19+
#include "llvm/ADT/StringExtras.h"
1820
#include "llvm/ADT/StringSwitch.h"
1921

2022
using namespace clang;
@@ -180,7 +182,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
180182

181183
if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
182184
// Set __CUDA_ARCH__ for the GPU specified.
183-
std::string CUDAArchCode = [this] {
185+
std::string CUDAArchCode = [&]() -> std::string {
184186
switch (GPU) {
185187
case OffloadArch::GFX600:
186188
case OffloadArch::GFX601:
@@ -281,6 +283,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
281283
case OffloadArch::SM_90:
282284
case OffloadArch::SM_90a:
283285
return "900";
286+
case OffloadArch::SM_custom:
287+
return llvm::itostr(CUDACustomSMToArchID(Opts.CUDACustomSM));
284288
}
285289
llvm_unreachable("unhandled OffloadArch");
286290
}();

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
22772277
case OffloadArch::SM_89:
22782278
case OffloadArch::SM_90:
22792279
case OffloadArch::SM_90a:
2280+
case OffloadArch::SM_custom:
22802281
case OffloadArch::GFX600:
22812282
case OffloadArch::GFX601:
22822283
case OffloadArch::GFX602:

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1670,7 +1670,8 @@ void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
16701670
AddUnalignedAccessWarning(CmdArgs);
16711671
}
16721672

1673-
void Clang::RenderTargetOptions(const llvm::Triple &EffectiveTriple,
1673+
void Clang::RenderTargetOptions(const JobAction &JA,
1674+
const llvm::Triple &EffectiveTriple,
16741675
const ArgList &Args, bool KernelOrKext,
16751676
ArgStringList &CmdArgs) const {
16761677
const ToolChain &TC = getToolChain();
@@ -5378,7 +5379,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
53785379
CmdArgs.push_back("-disable-llvm-passes");
53795380

53805381
// Render target options.
5381-
TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
5382+
TC.addClangTargetOptions(Args, CmdArgs, JA);
53825383

53835384
// reject options that shouldn't be supported in bitcode
53845385
// also reject kernel/kext
@@ -6069,7 +6070,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
60696070
/*ForAS*/ false, /*IsAux*/ true);
60706071
}
60716072

6072-
TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
6073+
TC.addClangTargetOptions(Args, CmdArgs, JA);
60736074

60746075
addMCModel(D, Args, Triple, RelocationModel, CmdArgs);
60756076

@@ -6096,7 +6097,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
60966097
CmdArgs.push_back(Args.MakeArgString(CPU));
60976098
}
60986099

6099-
RenderTargetOptions(Triple, Args, KernelOrKext, CmdArgs);
6100+
RenderTargetOptions(JA, Triple, Args, KernelOrKext, CmdArgs);
61006101

61016102
// Add clang-cl arguments.
61026103
types::ID InputType = Input.getType();

clang/lib/Driver/ToolChains/Clang.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
4545
const InputInfo &Output,
4646
const InputInfoList &Inputs) const;
4747

48-
void RenderTargetOptions(const llvm::Triple &EffectiveTriple,
48+
void RenderTargetOptions(const JobAction &JA,
49+
const llvm::Triple &EffectiveTriple,
4950
const llvm::opt::ArgList &Args, bool KernelOrKext,
5051
llvm::opt::ArgStringList &CmdArgs) const;
5152

@@ -61,6 +62,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
6162
llvm::opt::ArgStringList &CmdArgs) const;
6263
void AddMIPSTargetArgs(const llvm::opt::ArgList &Args,
6364
llvm::opt::ArgStringList &CmdArgs) const;
65+
void AddNVPTXTargetArgs(const JobAction &JA, const llvm::opt::ArgList &Args,
66+
llvm::opt::ArgStringList &CmdArgs) const;
6467
void AddPPCTargetArgs(const llvm::opt::ArgList &Args,
6568
llvm::opt::ArgStringList &CmdArgs) const;
6669
void AddR600TargetArgs(const llvm::opt::ArgList &Args,
@@ -94,8 +97,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
9497

9598
mutable std::unique_ptr<llvm::raw_fd_ostream> CompilationDatabase = nullptr;
9699
void DumpCompilationDatabase(Compilation &C, StringRef Filename,
97-
StringRef Target,
98-
const InputInfo &Output, const InputInfo &Input,
100+
StringRef Target, const InputInfo &Output,
101+
const InputInfo &Input,
99102
const llvm::opt::ArgList &Args) const;
100103

101104
void DumpCompilationDatabaseFragmentToDir(

0 commit comments

Comments
 (0)