Skip to content

Commit f616c3e

Browse files
committed
[OpenMP][DeviceRTL][AMDGPU] Support code object version 5
Update DeviceRTL and the AMDGPU plugin to support code object version 5. Default is code object version 4. CodeGen for __builtin_amdgcn_workgroup_size generates code for both cov4 and cov5 if -mcode-object-version=none is specified. DeviceRTL compilation passes this argument via the -Xclang option to generate ABI-agnostic code. The generated code for the above builtin uses a clang control constant "llvm.amdgcn.abi.version" to branch on the ABI version, which is available during linking of the user's OpenMP code. The load of this constant gets eliminated during linking. The AMDGPU plugin queries the ELF for the code object version and then prepares the various implicitargs accordingly. Differential Revision: https://reviews.llvm.org/D139730 Reviewed By: jhuber6, yaxunl
1 parent 30b6b27 commit f616c3e

File tree

14 files changed

+290
-39
lines changed

14 files changed

+290
-39
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "clang/AST/OSLog.h"
2828
#include "clang/Basic/TargetBuiltins.h"
2929
#include "clang/Basic/TargetInfo.h"
30+
#include "clang/Basic/TargetOptions.h"
3031
#include "clang/CodeGen/CGFunctionInfo.h"
3132
#include "clang/Frontend/FrontendDiagnostic.h"
3233
#include "llvm/ADT/APFloat.h"
@@ -17098,24 +17099,61 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
1709817099
}
1709917100

1710017101
// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
17102+
/// Emit code based on Code Object ABI version.
17103+
/// COV_4 : Emit code to use dispatch ptr
17104+
/// COV_5 : Emit code to use implicitarg ptr
17105+
/// COV_NONE : Emit code to load a global variable "llvm.amdgcn.abi.version"
17106+
/// and use its value for COV_4 or COV_5 approach. It is used for
17107+
/// compiling device libraries in an ABI-agnostic way.
17108+
///
17109+
/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and initialized by
17110+
/// clang during compilation of user code.
1710117111
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
17102-
bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
17103-
clang::TargetOptions::COV_5;
17104-
Constant *Offset;
17105-
Value *DP;
17106-
if (IsCOV_5) {
17112+
llvm::LoadInst *LD;
17113+
17114+
auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
17115+
17116+
if (Cov == clang::TargetOptions::COV_None) {
17117+
auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal(
17118+
"llvm.amdgcn.abi.version", CGF.Int32Ty, LangAS::Default, nullptr,
17119+
CodeGen::NotForDefinition);
17120+
17121+
// This load will be eliminated by the IPSCCP because it is constant
17122+
// weak_odr without externally_initialized. Either changing it to weak or
17123+
// adding externally_initialized will keep the load.
17124+
Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
17125+
CGF.CGM.getIntAlign());
17126+
17127+
Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
17128+
ABIVersion,
17129+
llvm::ConstantInt::get(CGF.Int32Ty, clang::TargetOptions::COV_5));
17130+
1710717131
// Indexing the implicit kernarg segment.
17108-
Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
17109-
DP = EmitAMDGPUImplicitArgPtr(CGF);
17110-
} else {
17132+
Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
17133+
CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
17134+
1711117135
// Indexing the HSA kernel_dispatch_packet struct.
17112-
Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
17113-
DP = EmitAMDGPUDispatchPtr(CGF);
17136+
Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
17137+
CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
17138+
17139+
auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
17140+
LD = CGF.Builder.CreateLoad(
17141+
Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
17142+
} else {
17143+
Value *GEP = nullptr;
17144+
if (Cov == clang::TargetOptions::COV_5) {
17145+
// Indexing the implicit kernarg segment.
17146+
GEP = CGF.Builder.CreateConstGEP1_32(
17147+
CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
17148+
} else {
17149+
// Indexing the HSA kernel_dispatch_packet struct.
17150+
GEP = CGF.Builder.CreateConstGEP1_32(
17151+
CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
17152+
}
17153+
LD = CGF.Builder.CreateLoad(
17154+
Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
1711417155
}
1711517156

17116-
auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
17117-
auto *LD = CGF.Builder.CreateLoad(
17118-
Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
1711917157
llvm::MDBuilder MDHelper(CGF.getLLVMContext());
1712017158
llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
1712117159
APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,6 +1203,8 @@ void CodeGenModule::Release() {
12031203
getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign",
12041204
getContext().getTargetInfo().getMaxTLSAlign());
12051205

1206+
getTargetCodeGenInfo().emitTargetGlobals(*this);
1207+
12061208
getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);
12071209

12081210
EmitBackendOptionsMetadata(getCodeGenOpts());

clang/lib/CodeGen/CodeGenModule.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1571,6 +1571,11 @@ class CodeGenModule : public CodeGenTypeCache {
15711571
void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
15721572
const AMDGPUWavesPerEUAttr *A);
15731573

1574+
llvm::Constant *
1575+
GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
1576+
const VarDecl *D,
1577+
ForDefinition_t IsForDefinition = NotForDefinition);
1578+
15741579
private:
15751580
llvm::Constant *GetOrCreateLLVMFunction(
15761581
StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,
@@ -1593,11 +1598,6 @@ class CodeGenModule : public CodeGenTypeCache {
15931598
void UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD,
15941599
StringRef &CurName);
15951600

1596-
llvm::Constant *
1597-
GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
1598-
const VarDecl *D,
1599-
ForDefinition_t IsForDefinition = NotForDefinition);
1600-
16011601
bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
16021602
llvm::AttrBuilder &AttrBuilder,
16031603
bool SetTargetFeatures = true);

clang/lib/CodeGen/TargetInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ class TargetCodeGenInfo {
8181
CodeGen::CodeGenModule &CGM,
8282
const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}
8383

84+
/// Provides a convenient hook to handle extra target-specific globals.
85+
virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
86+
8487
/// Any further codegen related checks that need to be done on a function call
8588
/// in a target specific manner.
8689
virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "ABIInfoImpl.h"
1010
#include "TargetInfo.h"
11+
#include "clang/Basic/TargetOptions.h"
1112

1213
using namespace clang;
1314
using namespace clang::CodeGen;
@@ -274,6 +275,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
274275
void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
275276
CodeGenModule &CGM) const;
276277

278+
void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
279+
277280
void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
278281
CodeGen::CodeGenModule &M) const override;
279282
unsigned getOpenCLKernelCallingConv() const override;
@@ -354,6 +357,28 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
354357
}
355358
}
356359

360+
/// Emits control constants used to change per-architecture behaviour in the
361+
/// AMDGPU ROCm device libraries.
362+
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
363+
CodeGen::CodeGenModule &CGM) const {
364+
StringRef Name = "llvm.amdgcn.abi.version";
365+
if (CGM.getModule().getNamedGlobal(Name))
366+
return;
367+
368+
auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
369+
llvm::Constant *COV = llvm::ConstantInt::get(
370+
Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
371+
372+
// It needs to be constant weak_odr without externally_initialized so that
373+
// the load instruction can be eliminated by the IPSCCP.
374+
auto *GV = new llvm::GlobalVariable(
375+
CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
376+
nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
377+
CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
378+
GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
379+
GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
380+
}
381+
357382
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
358383
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
359384
if (requiresAMDGPUProtectedVisibility(D, GV)) {

clang/lib/Driver/ToolChain.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1370,7 +1370,10 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOpenMPTargetArgs(
13701370
// matches the current toolchain triple. If it is not present
13711371
// at all, target and host share a toolchain.
13721372
if (A->getOption().matches(options::OPT_m_Group)) {
1373-
if (SameTripleAsHost)
1373+
// Pass code object version to device toolchain
1374+
// to correctly set metadata in intermediate files.
1375+
if (SameTripleAsHost ||
1376+
A->getOption().matches(options::OPT_mcode_object_version_EQ))
13741377
DAL->append(A);
13751378
else
13761379
Modified = true;

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8645,6 +8645,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
86458645
CmdArgs.push_back("--device-debug");
86468646
}
86478647

8648+
// code-object-version=X needs to be passed to clang-linker-wrapper to ensure
8649+
// that it is used by lld.
8650+
if (const Arg *A = Args.getLastArg(options::OPT_mcode_object_version_EQ)) {
8651+
CmdArgs.push_back(Args.MakeArgString("-mllvm"));
8652+
CmdArgs.push_back(Args.MakeArgString(
8653+
Twine("--amdhsa-code-object-version=") + A->getValue()));
8654+
}
8655+
86488656
for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
86498657
CmdArgs.push_back(Args.MakeArgString("--ptxas-arg=" + A));
86508658

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
2+
// RUN: -mcode-object-version=4 -DUSER -x hip -o %t_4.bc %s
3+
4+
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
5+
// RUN: -mcode-object-version=5 -DUSER -x hip -o %t_5.bc %s
6+
7+
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
8+
// RUN: -mcode-object-version=none -DDEVICELIB -x hip -o %t_0.bc %s
9+
10+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -O3 \
11+
// RUN: %t_4.bc -mlink-builtin-bitcode %t_0.bc -o - |\
12+
// RUN: FileCheck -check-prefix=LINKED4 %s
13+
14+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -O3 \
15+
// RUN: %t_5.bc -mlink-builtin-bitcode %t_0.bc -o - |\
16+
// RUN: FileCheck -check-prefix=LINKED5 %s
17+
18+
#include "Inputs/cuda.h"
19+
20+
// LINKED4: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
21+
// LINKED4-LABEL: bar
22+
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
23+
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
24+
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
25+
// LINKED4: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
26+
// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
27+
// LINKED4: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
28+
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
29+
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
30+
31+
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
32+
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
33+
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
34+
// LINKED4: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
35+
// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
36+
// LINKED4: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
37+
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
38+
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
39+
40+
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
41+
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
42+
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
43+
// LINKED4: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
44+
// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
45+
// LINKED4: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
46+
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
47+
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
48+
// LINKED4: "amdgpu_code_object_version", i32 400
49+
50+
// LINKED5: llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
51+
// LINKED5-LABEL: bar
52+
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
53+
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
54+
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
55+
// LINKED5: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
56+
// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
57+
// LINKED5: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
58+
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
59+
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
60+
61+
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
62+
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
63+
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
64+
// LINKED5: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
65+
// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
66+
// LINKED5: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
67+
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
68+
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
69+
70+
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
71+
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
72+
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
73+
// LINKED5: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
74+
// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
75+
// LINKED5: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
76+
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
77+
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
78+
// LINKED5: "amdgpu_code_object_version", i32 500
79+
80+
#ifdef DEVICELIB
81+
__device__ void bar(int *x, int *y, int *z)
82+
{
83+
*x = __builtin_amdgcn_workgroup_size_x();
84+
*y = __builtin_amdgcn_workgroup_size_y();
85+
*z = __builtin_amdgcn_workgroup_size_z();
86+
}
87+
#endif
88+
89+
#ifdef USER
90+
__device__ void bar(int *x, int *y, int *z);
91+
__device__ void foo()
92+
{
93+
int *x, *y, *z;
94+
bar(x, y, z);
95+
}
96+
#endif

clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
// RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
88
// RUN: | FileCheck -check-prefix=COV5 %s
99

10+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
11+
// RUN: -fcuda-is-device -mcode-object-version=none -emit-llvm -o - -x hip %s \
12+
// RUN: | FileCheck -check-prefix=COVNONE %s
13+
1014
#include "Inputs/cuda.h"
1115

1216
// PRECOV5-LABEL: test_get_workgroup_size
@@ -26,6 +30,36 @@
2630
// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
2731
// COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
2832
// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
33+
34+
35+
// COVNONE-LABEL: test_get_workgroup_size
36+
// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
37+
// COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
38+
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
39+
// COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
40+
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
41+
// COVNONE: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
42+
// COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
43+
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
44+
45+
// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
46+
// COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
47+
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
48+
// COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
49+
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
50+
// COVNONE: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
51+
// COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
52+
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
53+
54+
// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
55+
// COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
56+
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
57+
// COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
58+
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
59+
// COVNONE: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
60+
// COVNONE: select i1 [[ABI5_Z]], ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
61+
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
62+
2963
__device__ void test_get_workgroup_size(int d, int *out)
3064
{
3165
switch (d) {

clang/test/CodeGenOpenCL/opencl_types.cl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "spir-unknown-unknown" -emit-llvm -o - -O0 | FileCheck %s --check-prefixes=CHECK-COM,CHECK-SPIR
2-
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -O0 | FileCheck %s --check-prefixes=CHECK-COM,CHECK-AMDGCN
1+
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "spir-unknown-unknown" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-SPIR
2+
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-AMDGCN
33

44
#define CLK_ADDRESS_CLAMP_TO_EDGE 2
55
#define CLK_NORMALIZED_COORDS_TRUE 1
66
#define CLK_FILTER_NEAREST 0x10
77
#define CLK_FILTER_LINEAR 0x20
88

99
constant sampler_t glb_smp = CLK_ADDRESS_CLAMP_TO_EDGE|CLK_NORMALIZED_COORDS_TRUE|CLK_FILTER_NEAREST;
10-
// CHECK-COM-NOT: constant i32
1110

1211
void fnc1(image1d_t img) {}
1312
// CHECK-SPIR: @fnc1(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
403403
llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
404404
}
405405

406+
// Pass on -mllvm options to the clang invocation.
407+
for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
408+
CmdArgs.push_back("-mllvm");
409+
CmdArgs.push_back(Arg->getValue());
410+
}
411+
406412
if (Args.hasArg(OPT_debug))
407413
CmdArgs.push_back("-g");
408414

openmp/libomptarget/DeviceRTL/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ add_custom_target(omptarget.devicertl.nvptx)
288288
add_custom_target(omptarget.devicertl.amdgpu)
289289
foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
290290
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
291-
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa)
291+
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
292292
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
293293
compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
294294
else()

0 commit comments

Comments
 (0)