Skip to content

Commit db81d8f

Browse files
[OpenMP] Lower printf to __llvm_omp_vprintf
Extension of D112504. Lower amdgpu printf to `__llvm_omp_vprintf` which takes the same const char*, void* arguments as cuda vprintf and also passes the size of the void* alloca which will be needed by a non-stub implementation of `__llvm_omp_vprintf` for amdgpu.

This removes the amdgpu link error on any printf in a target region in favour of silently compiling code that doesn't print anything to stdout.

The exact set of changes to check-openmp probably needs revision before commit.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D112680
1 parent 1658980 commit db81d8f

21 files changed

+147
-75
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5106,11 +5106,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
51065106
return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
51075107
}
51085108
case Builtin::BIprintf:
5109-
if (getTarget().getTriple().isNVPTX())
5110-
return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
5111-
if (getTarget().getTriple().getArch() == Triple::amdgcn &&
5112-
getLangOpts().HIP)
5113-
return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue);
5109+
if (getTarget().getTriple().isNVPTX() ||
5110+
getTarget().getTriple().isAMDGCN()) {
5111+
if (getLangOpts().OpenMPIsDevice)
5112+
return EmitOpenMPDevicePrintfCallExpr(E);
5113+
if (getTarget().getTriple().isNVPTX())
5114+
return EmitNVPTXDevicePrintfCallExpr(E);
5115+
if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
5116+
return EmitAMDGPUDevicePrintfCallExpr(E);
5117+
}
5118+
51145119
break;
51155120
case Builtin::BI__builtin_canonicalize:
51165121
case Builtin::BI__builtin_canonicalizef:

clang/lib/CodeGen/CGGPUBuiltin.cpp

Lines changed: 80 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,14 @@
2121
using namespace clang;
2222
using namespace CodeGen;
2323

24-
static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
24+
namespace {
25+
llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
2526
llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
2627
llvm::Type::getInt8PtrTy(M.getContext())};
2728
llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
2829
llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
2930

30-
if (auto* F = M.getFunction("vprintf")) {
31+
if (auto *F = M.getFunction("vprintf")) {
3132
// Our CUDA system header declares vprintf with the right signature, so
3233
// nobody else should have been able to declare vprintf with a bogus
3334
// signature.
@@ -41,6 +42,28 @@ static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
4142
VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
4243
}
4344

45+
llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) {
46+
const char *Name = "__llvm_omp_vprintf";
47+
llvm::Module &M = CGM.getModule();
48+
llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
49+
llvm::Type::getInt8PtrTy(M.getContext()),
50+
llvm::Type::getInt32Ty(M.getContext())};
51+
llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
52+
llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
53+
54+
if (auto *F = M.getFunction(Name)) {
55+
if (F->getFunctionType() != VprintfFuncType) {
56+
CGM.Error(SourceLocation(),
57+
"Invalid type declaration for __llvm_omp_vprintf");
58+
return nullptr;
59+
}
60+
return F;
61+
}
62+
63+
return llvm::Function::Create(
64+
VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
65+
}
66+
4467
// Transforms a call to printf into a call to the NVPTX vprintf syscall (which
4568
// isn't particularly special; it's invoked just like a regular function).
4669
// vprintf takes two args: A format string, and a pointer to a buffer containing
@@ -67,17 +90,17 @@ static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
6790
// Note that by the time this function runs, E's args have already undergone the
6891
// standard C vararg promotion (short -> int, float -> double, etc.).
6992

70-
namespace {
71-
llvm::Value *packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF,
72-
const CallArgList &Args) {
93+
std::pair<llvm::Value *, llvm::TypeSize>
94+
packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, const CallArgList &Args) {
7395
const llvm::DataLayout &DL = CGF->CGM.getDataLayout();
7496
llvm::LLVMContext &Ctx = CGF->CGM.getLLVMContext();
7597
CGBuilderTy &Builder = CGF->Builder;
7698

7799
// Construct and fill the args buffer that we'll pass to vprintf.
78100
if (Args.size() <= 1) {
79-
// If there are no args, pass a null pointer to vprintf.
80-
return llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
101+
// If there are no args, pass a null pointer and size 0
102+
llvm::Value * BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
103+
return {BufferPtr, llvm::TypeSize::Fixed(0)};
81104
} else {
82105
llvm::SmallVector<llvm::Type *, 8> ArgTypes;
83106
for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I)
@@ -96,43 +119,64 @@ llvm::Value *packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF,
96119
llvm::Value *Arg = Args[I].getRValue(*CGF).getScalarVal();
97120
Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
98121
}
99-
return Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
122+
llvm::Value *BufferPtr =
123+
Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
124+
return {BufferPtr, DL.getTypeAllocSize(AllocaTy)};
100125
}
101126
}
102-
} // namespace
103127

104-
RValue
105-
CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
106-
ReturnValueSlot ReturnValue) {
107-
assert(getTarget().getTriple().isNVPTX());
128+
bool containsNonScalarVarargs(CodeGenFunction *CGF, CallArgList Args) {
129+
return llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
130+
return !A.getRValue(*CGF).isScalar();
131+
});
132+
}
133+
134+
RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF,
135+
llvm::Function *Decl, bool WithSizeArg) {
136+
CodeGenModule &CGM = CGF->CGM;
137+
CGBuilderTy &Builder = CGF->Builder;
108138
assert(E->getBuiltinCallee() == Builtin::BIprintf);
109139
assert(E->getNumArgs() >= 1); // printf always has at least one arg.
110140

141+
// Uses the same format as nvptx for the argument packing, but also passes
142+
// an i32 for the total size of the passed pointer
111143
CallArgList Args;
112-
EmitCallArgs(Args,
113-
E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
114-
E->arguments(), E->getDirectCallee(),
115-
/* ParamsToSkip = */ 0);
144+
CGF->EmitCallArgs(Args,
145+
E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
146+
E->arguments(), E->getDirectCallee(),
147+
/* ParamsToSkip = */ 0);
116148

117149
// We don't know how to emit non-scalar varargs.
118-
if (llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
119-
return !A.getRValue(*this).isScalar();
120-
})) {
150+
if (containsNonScalarVarargs(CGF, Args)) {
121151
CGM.ErrorUnsupported(E, "non-scalar arg to printf");
122-
return RValue::get(llvm::ConstantInt::get(IntTy, 0));
152+
return RValue::get(llvm::ConstantInt::get(CGF->IntTy, 0));
123153
}
124154

125-
llvm::Value *BufferPtr = packArgsIntoNVPTXFormatBuffer(this, Args);
155+
auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args);
156+
llvm::Value *BufferPtr = r.first;
157+
158+
llvm::SmallVector<llvm::Value *, 3> Vec = {
159+
Args[0].getRValue(*CGF).getScalarVal(), BufferPtr};
160+
if (WithSizeArg) {
161+
// Passing > 32bit of data as a local alloca doesn't work for nvptx or
162+
// amdgpu
163+
llvm::Constant *Size =
164+
llvm::ConstantInt::get(llvm::Type::getInt32Ty(CGM.getLLVMContext()),
165+
static_cast<uint32_t>(r.second.getFixedSize()));
126166

127-
// Invoke vprintf and return.
128-
llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
129-
return RValue::get(Builder.CreateCall(
130-
VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
167+
Vec.push_back(Size);
168+
}
169+
return RValue::get(Builder.CreateCall(Decl, Vec));
131170
}
171+
} // namespace
132172

133-
RValue
134-
CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
135-
ReturnValueSlot ReturnValue) {
173+
RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) {
174+
assert(getTarget().getTriple().isNVPTX());
175+
return EmitDevicePrintfCallExpr(
176+
E, this, GetVprintfDeclaration(CGM.getModule()), false);
177+
}
178+
179+
RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
136180
assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn);
137181
assert(E->getBuiltinCallee() == Builtin::BIprintf ||
138182
E->getBuiltinCallee() == Builtin::BI__builtin_printf);
@@ -162,3 +206,10 @@ CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
162206
Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
163207
return RValue::get(Printf);
164208
}
209+
210+
RValue CodeGenFunction::EmitOpenMPDevicePrintfCallExpr(const CallExpr *E) {
211+
assert(getTarget().getTriple().isNVPTX() ||
212+
getTarget().getTriple().isAMDGCN());
213+
return EmitDevicePrintfCallExpr(E, this, GetOpenMPVprintfDeclaration(CGM),
214+
true);
215+
}

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4070,10 +4070,9 @@ class CodeGenFunction : public CodeGenTypeCache {
40704070
RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
40714071
ReturnValueSlot ReturnValue);
40724072

4073-
RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
4074-
ReturnValueSlot ReturnValue);
4075-
RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
4076-
ReturnValueSlot ReturnValue);
4073+
RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
4074+
RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
4075+
RValue EmitOpenMPDevicePrintfCallExpr(const CallExpr *E);
40774076

40784077
RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
40794078
const CallExpr *E, ReturnValueSlot ReturnValue);

openmp/libomptarget/DeviceRTL/include/Debug.h

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,23 +34,15 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
3434
///}
3535

3636
/// Print
37-
/// TODO: For now we have to use macros to guard the code because Clang lowers
38-
/// `printf` to different function calls on NVPTX and AMDGCN platforms, and it
39-
/// doesn't work for AMDGCN. After it can work on AMDGCN, we will remove the
40-
/// macro.
37+
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
4138
/// {
4239

43-
#ifndef __AMDGCN__
4440
extern "C" {
4541
int printf(const char *format, ...);
4642
}
4743

48-
#define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__);
44+
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
4945
#define PRINT(str) PRINTF("%s", str)
50-
#else
51-
#define PRINTF(fmt, ...)
52-
#define PRINT(str)
53-
#endif
5446

5547
///}
5648

openmp/libomptarget/DeviceRTL/src/Debug.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,29 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
2929
assertion);
3030
__builtin_trap();
3131
}
32+
33+
#pragma omp begin declare variant match( \
34+
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
35+
int32_t vprintf(const char *, void *);
36+
namespace impl {
37+
static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
38+
return vprintf(Format, Arguments);
39+
}
40+
} // namespace impl
41+
#pragma omp end declare variant
42+
43+
// We do not have a vprintf implementation for AMD GPU yet so we use a stub.
44+
#pragma omp begin declare variant match(device = {arch(amdgcn)})
45+
namespace impl {
46+
static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
47+
return -1;
48+
}
49+
} // namespace impl
50+
#pragma omp end declare variant
51+
52+
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
53+
return impl::omp_vprintf(Format, Arguments, Size);
54+
}
3255
}
3356

3457
/// Current indentation level for the function trace. Only accessed by thread 0.

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,11 @@ __attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
184184
}
185185
__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}
186186

187+
EXTERN
188+
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
189+
return -1;
190+
}
191+
187192
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
188193
lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
189194
hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,15 @@ EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
184184
extern "C" {
185185
void *malloc(size_t);
186186
void free(void *);
187+
int32_t vprintf(const char *, void *);
187188
}
188189

189190
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
190191
EXTERN void __kmpc_impl_free(void *x) { free(x); }
191192

193+
EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
194+
uint32_t) {
195+
return vprintf(Format, Arguments);
196+
}
197+
192198
#pragma omp end declare target

openmp/libomptarget/test/mapping/data_member_ref.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// RUN: %libomptarget-compilexx-run-and-check-generic
22

3-
// amdgcn does not have printf definition
3+
// Wrong results on amdgpu
44
// XFAIL: amdgcn-amd-amdhsa
55
// XFAIL: amdgcn-amd-amdhsa-newRTL
66

openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// RUN: %libomptarget-compilexx-run-and-check-generic
22

3-
// amdgcn does not have printf definition
3+
// Wrong results on amdgpu
44
// XFAIL: amdgcn-amd-amdhsa
55
// XFAIL: amdgcn-amd-amdhsa-newRTL
66

openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// RUN: %libomptarget-compilexx-run-and-check-generic
22

3-
// amdgcn does not have printf definition
3+
// Wrong results on amdgpu
44
// XFAIL: amdgcn-amd-amdhsa
55
// XFAIL: amdgcn-amd-amdhsa-newRTL
66

openmp/libomptarget/test/mapping/lambda_by_value.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// RUN: %libomptarget-compilexx-run-and-check-generic
22

3-
// amdgcn does not have printf definition
3+
// Wrong results on amdgpu
44
// XFAIL: amdgcn-amd-amdhsa
55
// XFAIL: amdgcn-amd-amdhsa-newRTL
66

openmp/libomptarget/test/mapping/ompx_hold/struct.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// RUN: %libomptarget-compile-generic -fopenmp-extensions
22
// RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace
33

4-
// amdgcn does not have printf definition
4+
// Wrong results on amdgpu
55
// XFAIL: amdgcn-amd-amdhsa
66
// XFAIL: amdgcn-amd-amdhsa-newRTL
77

openmp/libomptarget/test/mapping/ptr_and_obj_motion.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
// RUN: %libomptarget-compile-run-and-check-generic
22

3-
// amdgcn does not have printf definition
4-
// XFAIL: amdgcn-amd-amdhsa
5-
// XFAIL: amdgcn-amd-amdhsa-newRTL
6-
73
#include <stdio.h>
84

95
typedef struct {

openmp/libomptarget/test/mapping/reduction_implicit_map.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// RUN: %libomptarget-compilexx-run-and-check-generic
22

3-
// amdgcn does not have printf definition
4-
// UNSUPPORTED: amdgcn-amd-amdhsa
5-
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
3+
// Wrong results on amdgpu
4+
// XFAIL: amdgcn-amd-amdhsa
5+
// XFAIL: amdgcn-amd-amdhsa-newRTL
66

77
#include <stdio.h>
88

openmp/libomptarget/test/offloading/bug49021.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
22

3-
// Wrong results on amdgcn
4-
// UNSUPPORTED: amdgcn-amd-amdhsa
5-
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
3+
// Wrong results on amdgpu
4+
// XFAIL: amdgcn-amd-amdhsa
65

76
#include <iostream>
87

openmp/libomptarget/test/offloading/bug50022.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
// RUN: %libomptarget-compilexx-and-run-generic
22

3-
// UNSUPPORTED: amdgcn-amd-amdhsa
4-
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
5-
63
#include <cassert>
74
#include <iostream>
85
#include <stdexcept>

openmp/libomptarget/test/offloading/host_as_target.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
// RUN: %libomptarget-compile-run-and-check-generic
99

10-
// amdgcn does not have printf definition
10+
// amdgpu does not have a working printf definition
1111
// XFAIL: amdgcn-amd-amdhsa
1212
// XFAIL: amdgcn-amd-amdhsa-newRTL
1313

openmp/libomptarget/test/unified_shared_memory/api.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// XFAIL: nvptx64-nvidia-cuda
33
// XFAIL: nvptx64-nvidia-cuda-newRTL
44

5-
// Fails on amdgcn with error: GPU Memory Error
5+
// Fails on amdgpu with error: GPU Memory Error
66
// XFAIL: amdgcn-amd-amdhsa
77
// XFAIL: amdgcn-amd-amdhsa-newRTL
88

openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// REQUIRES: unified_shared_memory
44
// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
55

6-
// Fails on amdgcn with error: GPU Memory Error
6+
// Fails on amdgpu with error: GPU Memory Error
77
// XFAIL: amdgcn-amd-amdhsa
88
// XFAIL: amdgcn-amd-amdhsa-newRTL
99

0 commit comments

Comments
 (0)