Skip to content

Commit 5b501ee

Browse files
PietroGhguwedolinskyAlexeySachkovsteffenlarsen
authored
[SYCL][NATIVECPU] Emit subhandler as LLVM IR (#10282)
This PR allows the emit the Native CPU subhandler as LLVM-IR, instead of emitting its declaration as C++ code in the Native CPU integration header. This is a first step towards removing the Native CPU integration header, using clang-offload-wrapper and also supporting multiple SYCL targets alongside SYCL Native CPU, which we will implement in following PRs. --------- Co-authored-by: Uwe Dolinsky <[email protected]> Co-authored-by: Alexey Sachkov <[email protected]> Co-authored-by: Steffen Larsen <[email protected]>
1 parent 1aabcf8 commit 5b501ee

File tree

17 files changed

+271
-219
lines changed

17 files changed

+271
-219
lines changed

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2115,10 +2115,6 @@ void CodeGenModule::GenKernelArgMetadata(llvm::Function *Fn,
21152115
Fn->setMetadata("kernel_arg_exclusive_ptr",
21162116
llvm::MDNode::get(VMContext, argSYCLAccessorPtrs));
21172117
}
2118-
if (LangOpts.SYCLIsNativeCPU) {
2119-
Fn->setMetadata("kernel_arg_type",
2120-
llvm::MDNode::get(VMContext, argTypeNames));
2121-
}
21222118
} else {
21232119
if (getLangOpts().OpenCL || getLangOpts().SYCLIsDevice) {
21242120
Fn->setMetadata("kernel_arg_addr_space",

clang/test/CodeGenSYCL/native_cpu_basic.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
// This test checks for some basic Front End features for Native CPU:
22
// * Kernel name mangling
3-
// * kernel_arg_type metadata node
43
// * is-native-cpu module flag
54
// RUN: %clang_cc1 -fsycl-is-device -S -emit-llvm -internal-isystem %S/Inputs -fsycl-is-native-cpu -o %t.ll %s
65
// RUN: FileCheck -input-file=%t.ll %s
@@ -50,11 +49,9 @@ void gen() {
5049
test<float>(q);
5150
}
5251

53-
// Check name mangling and kernel_arg_type metadata
54-
// CHECK-DAG: @_ZTS6init_aIiE_NativeCPUKernel({{.*}}){{.*}}!kernel_arg_type ![[TYPE1:[0-9]*]]
55-
// CHECK-DAG: @_ZTS6init_aIfE_NativeCPUKernel({{.*}}){{.*}}!kernel_arg_type ![[TYPE3:[0-9]*]]
56-
// CHECK-DAG: ![[TYPE1]] = !{!"int*", !"sycl::range<1>", !"sycl::range<1>", !"sycl::id<1>", !"int"}
57-
// CHECK-DAG: ![[TYPE3]] = !{!"float*", !"sycl::range<1>", !"sycl::range<1>", !"sycl::id<1>", !"float"}
52+
// Check name mangling
53+
// CHECK-DAG: @_ZTS6init_aIiE_NativeCPUKernel_NativeCPUKernel({{.*}})
54+
// CHECK-DAG: @_ZTS6init_aIfE_NativeCPUKernel_NativeCPUKernel({{.*}})
5855

5956
// Check Native CPU module flag
6057
// CHECK-DAG: !{{[0-9]*}} = !{i32 1, !"is-native-cpu", i32 1}

llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp

Lines changed: 5 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -33,133 +33,12 @@
3333
using namespace llvm;
3434

3535
namespace {
36-
SmallVector<bool> getArgMask(const Function *F) {
37-
SmallVector<bool> Res;
38-
auto *UsedNode = F->getMetadata("sycl_kernel_omit_args");
39-
if (!UsedNode) {
40-
// the metadata node is not available if -fenable-sycl-dae
41-
// was not set; set everything to true in the mask.
42-
for (unsigned I = 0; I < F->getFunctionType()->getNumParams(); I++) {
43-
Res.push_back(true);
44-
}
45-
return Res;
46-
}
47-
auto NumOperands = UsedNode->getNumOperands();
48-
for (unsigned I = 0; I < NumOperands; I++) {
49-
auto &Op = UsedNode->getOperand(I);
50-
if (auto *CAM = dyn_cast<ConstantAsMetadata>(Op.get())) {
51-
if (auto *Const = dyn_cast<ConstantInt>(CAM->getValue())) {
52-
auto Val = Const->getValue();
53-
Res.push_back(!Val.getBoolValue());
54-
} else {
55-
report_fatal_error("Unable to retrieve constant int from "
56-
"sycl_kernel_omit_args metadata node");
57-
}
58-
} else {
59-
report_fatal_error(
60-
"Error while processing sycl_kernel_omit_args metadata node");
61-
}
62-
}
63-
return Res;
64-
}
6536

66-
SmallVector<StringRef> getArgTypeNames(const Function *F) {
67-
SmallVector<StringRef> Res;
68-
auto *TNNode = F->getMetadata("kernel_arg_type");
69-
assert(TNNode &&
70-
"kernel_arg_type metadata node is required for sycl native CPU");
71-
auto NumOperands = TNNode->getNumOperands();
72-
for (unsigned I = 0; I < NumOperands; I++) {
73-
auto &Op = TNNode->getOperand(I);
74-
auto *MDS = dyn_cast<MDString>(Op.get());
75-
if (!MDS)
76-
report_fatal_error("error while processing kernel_arg_types metadata");
77-
Res.push_back(MDS->getString());
78-
}
79-
return Res;
80-
}
81-
82-
void emitKernelDecl(const Function *F, const SmallVector<bool> &ArgMask,
83-
const SmallVector<StringRef> &ArgTypeNames,
84-
raw_ostream &O) {
85-
auto EmitArgDecl = [&](const Argument *Arg, unsigned Index) {
86-
Type *ArgTy = Arg->getType();
87-
if (isa<PointerType>(ArgTy))
88-
return "void *";
89-
return ArgTypeNames[Index].data();
90-
};
91-
92-
auto NumParams = F->getFunctionType()->getNumParams();
93-
O << "extern \"C\" void " << F->getName() << "(";
94-
95-
unsigned I = 0, UsedI = 0;
96-
for (; I + 1 < ArgMask.size() && UsedI + 1 < NumParams; I++) {
97-
if (!ArgMask[I])
98-
continue;
99-
O << EmitArgDecl(F->getArg(UsedI), I) << ", ";
100-
UsedI++;
101-
}
102-
103-
// parameters may have been removed.
104-
bool NoUsedArgs = true;
105-
for (auto &Entry : ArgMask) {
106-
NoUsedArgs &= !Entry;
107-
}
108-
if (NoUsedArgs) {
109-
O << ");\n";
110-
return;
111-
}
112-
// find the index of the last used arg
113-
while (!ArgMask[I] && I + 1 < ArgMask.size())
114-
I++;
115-
O << EmitArgDecl(F->getArg(UsedI), I) << ", __nativecpu_state *);\n";
116-
}
117-
118-
void emitSubKernelHandler(const Function *F, const SmallVector<bool> &ArgMask,
119-
const SmallVector<StringRef> &ArgTypeNames,
120-
raw_ostream &O) {
121-
SmallVector<unsigned> UsedArgIdx;
122-
auto EmitParamCast = [&](Argument *Arg, unsigned Index) {
123-
std::string Res;
124-
llvm::raw_string_ostream OS(Res);
125-
UsedArgIdx.push_back(Index);
126-
if (isa<PointerType>(Arg->getType())) {
127-
OS << " void* arg" << Index << " = ";
128-
OS << "MArgs[" << Index << "].getPtr();\n";
129-
return OS.str();
130-
}
131-
auto TN = ArgTypeNames[Index].str();
132-
OS << " " << TN << " arg" << Index << " = ";
133-
OS << "*(" << TN << "*)"
134-
<< "MArgs[" << Index << "].getPtr();\n";
135-
return OS.str();
136-
};
137-
138-
O << "\ninline static void " << F->getName() << "subhandler(";
37+
void emitSubKernelHandler(const Function *F, raw_ostream &O) {
38+
O << "\nextern \"C\" void " << F->getName() << "subhandler(";
13939
O << "const sycl::detail::NativeCPUArgDesc *MArgs, "
140-
"__nativecpu_state *state) {\n";
141-
// Retrieve only the args that are used
142-
for (unsigned I = 0, UsedI = 0;
143-
I < ArgMask.size() && UsedI < F->getFunctionType()->getNumParams();
144-
I++) {
145-
if (ArgMask[I]) {
146-
O << EmitParamCast(F->getArg(UsedI), I);
147-
UsedI++;
148-
}
149-
}
150-
// Emit the actual kernel call
151-
O << " " << F->getName() << "(";
152-
if (UsedArgIdx.size() == 0) {
153-
O << ");\n";
154-
} else {
155-
for (unsigned I = 0; I < UsedArgIdx.size() - 1; I++) {
156-
O << "arg" << UsedArgIdx[I] << ", ";
157-
}
158-
if (UsedArgIdx.size() >= 1)
159-
O << "arg" << UsedArgIdx.back();
160-
O << ", state);\n";
161-
}
162-
O << "};\n\n";
40+
"__nativecpu_state *state);\n";
41+
return;
16342
}
16443

16544
// Todo: maybe we could use clang-offload-wrapper for this,
@@ -254,10 +133,7 @@ PreservedAnalyses EmitSYCLNativeCPUHeaderPass::run(Module &M,
254133
O << "extern \"C\" void __sycl_register_lib(pi_device_binaries desc);\n";
255134

256135
for (auto *F : Kernels) {
257-
auto ArgMask = getArgMask(F);
258-
auto ArgTypeNames = getArgTypeNames(F);
259-
emitKernelDecl(F, ArgMask, ArgTypeNames, O);
260-
emitSubKernelHandler(F, ArgMask, ArgTypeNames, O);
136+
emitSubKernelHandler(F, O);
261137
emitSYCLRegisterLib(F, O);
262138
}
263139

llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "llvm/IR/IRBuilder.h"
2424
#include "llvm/IR/Instruction.h"
2525
#include "llvm/IR/Instructions.h"
26+
#include "llvm/IR/LLVMContext.h"
2627
#include "llvm/IR/Operator.h"
2728
#include "llvm/IR/Value.h"
2829
#include "llvm/InitializePasses.h"
@@ -62,7 +63,88 @@ void fixCallingConv(Function *F) {
6263
F->setLinkage(GlobalValue::LinkageTypes::WeakAnyLinkage);
6364
}
6465

65-
// Clone the function and returns a new function with a new argument on type T
66+
// returns the indexes of the used arguments
67+
SmallVector<unsigned> getUsedIndexes(const Function *F) {
68+
SmallVector<unsigned> res;
69+
auto UsedNode = F->getMetadata("sycl_kernel_omit_args");
70+
if (!UsedNode) {
71+
// the metadata node is not available if -fenable-sycl-dae
72+
// was not set; set everything to true
73+
// Exclude one arg because we already added the state ptr
74+
for (unsigned I = 0; I + 1 < F->getFunctionType()->getNumParams(); I++) {
75+
res.push_back(I);
76+
}
77+
return res;
78+
}
79+
auto NumOperands = UsedNode->getNumOperands();
80+
for (unsigned I = 0; I < NumOperands; I++) {
81+
auto &Op = UsedNode->getOperand(I);
82+
if (auto CAM = dyn_cast<ConstantAsMetadata>(Op.get())) {
83+
if (auto Const = dyn_cast<ConstantInt>(CAM->getValue())) {
84+
auto Val = Const->getValue();
85+
if (!Val.getBoolValue()) {
86+
res.push_back(I);
87+
}
88+
} else {
89+
report_fatal_error("Unable to retrieve constant int from "
90+
"sycl_kernel_omit_args metadata node");
91+
}
92+
} else {
93+
report_fatal_error(
94+
"Error while processing sycl_kernel_omit_args metadata node");
95+
}
96+
}
97+
return res;
98+
}
99+
100+
void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType,
101+
Type *StatePtrType) {
102+
LLVMContext &Ctx = F->getContext();
103+
Type *NativeCPUArgDescPtrType = PointerType::getUnqual(NativeCPUArgDescType);
104+
105+
// Create function signature
106+
const std::string SubHandlerName = F->getName().str() + "subhandler";
107+
FunctionType *FTy = FunctionType::get(
108+
Type::getVoidTy(Ctx), {NativeCPUArgDescPtrType, StatePtrType}, false);
109+
auto SubhFCallee = F->getParent()->getOrInsertFunction(SubHandlerName, FTy);
110+
Function *SubhF = cast<Function>(SubhFCallee.getCallee());
111+
112+
// Emit function body, unpack kernel args
113+
auto UsedIndexes = getUsedIndexes(F);
114+
auto *KernelTy = F->getFunctionType();
115+
// assert(UsedIndexes.size() + 1 == KernelTy->getNumParams() && "mismatch
116+
// between number of params and used args");
117+
IRBuilder<> Builder(Ctx);
118+
BasicBlock *Block = BasicBlock::Create(Ctx, "entry", SubhF);
119+
Builder.SetInsertPoint(Block);
120+
unsigned NumArgs = UsedIndexes.size();
121+
auto *BaseNativeCPUArg = SubhF->getArg(0);
122+
SmallVector<Value *, 5> KernelArgs;
123+
for (unsigned I = 0; I < NumArgs; I++) {
124+
auto *Arg = F->getArg(I);
125+
auto UsedI = UsedIndexes[I];
126+
// Load the correct NativeCPUDesc and load the pointer from it
127+
auto *Addr = Builder.CreateGEP(NativeCPUArgDescType, BaseNativeCPUArg,
128+
{Builder.getInt64(UsedI)});
129+
auto *Load = Builder.CreateLoad(PointerType::getUnqual(Ctx), Addr);
130+
if (Arg->getType()->isPointerTy()) {
131+
// If the arg is a pointer, just use it
132+
KernelArgs.push_back(Load);
133+
} else {
134+
// Otherwise, load the scalar value and use that
135+
auto *Scalar = Builder.CreateLoad(Arg->getType(), Load);
136+
KernelArgs.push_back(Scalar);
137+
}
138+
}
139+
140+
// Call the kernel
141+
// Add the nativecpu state as arg
142+
KernelArgs.push_back(SubhF->getArg(1));
143+
Builder.CreateCall(KernelTy, F, KernelArgs);
144+
Builder.CreateRetVoid();
145+
}
146+
147+
// Clones the function and returns a new function with a new argument on type T
66148
// added as last argument
67149
Function *cloneFunctionAndAddParam(Function *OldF, Type *T) {
68150
auto *OldT = OldF->getFunctionType();
@@ -166,6 +248,14 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
166248
ModuleChanged |= true;
167249
}
168250

251+
StructType *NativeCPUArgDescType =
252+
StructType::create({PointerType::getUnqual(M.getContext())});
253+
for (auto &NewK : NewKernels) {
254+
emitSubkernelForKernel(NewK, NativeCPUArgDescType, StatePtrType);
255+
std::string NewName = NewK->getName().str() + "_NativeCPUKernel";
256+
NewK->setName(NewName);
257+
}
258+
169259
// Then we iterate over all the supported builtins, find their uses and
170260
// replace them with calls to our Native CPU functions.
171261
for (auto &Entry : BuiltinNamesMap) {

sycl/doc/design/SYCLNativeCPU.md

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -122,26 +122,35 @@ entry:
122122
ret void
123123
}
124124
```
125-
This pass will also set the correct calling convention for the target, and handle calling convention-related function attributes, allowing to call the kernel from the runtime.
125+
This pass will also set the correct calling convention for the target, and handle calling convention-related function attributes, allowing to call the kernel from the runtime. \\
126+
Additionally, this pass emits the definition for a `subhandler` function, which unpacks the vector of kernel arguments coming from the SYCL runtime, and forwards only the used arguments to the kernel. For our example the `subhandler` IR is:
127+
128+
```llvm
129+
define weak void @_Z6Samplesubhandler(ptr %0, ptr %1) #4 {
130+
entry:
131+
%2 = getelementptr %0, ptr %0, i64 0
132+
%3 = load ptr, ptr %2, align 8
133+
%4 = getelementptr %0, ptr %0, i64 3
134+
%5 = load ptr, ptr %4, align 8
135+
%6 = getelementptr %0, ptr %0, i64 4
136+
%7 = load ptr, ptr %6, align 8
137+
%8 = getelementptr %0, ptr %0, i64 7
138+
%9 = load ptr, ptr %8, align 8
139+
call void @_ZTS10SimpleVaddIiE_NativeCPUKernel(ptr %3, ptr %5, ptr %7, ptr %9, ptr %1)
140+
ret void
141+
}
142+
```
126143

127144
## EmitSYCLNativeCPUHeader pass
128145

129146
This pass emits an additional integration header, that will be compiled by the host compiler during the host compilation step. This header is included by the main integration footer and does not need to be managed manually. Its main purpose is to enable the SYCL runtime to register kernels and to call kernels that had unused parameters removed by the optimizer. The header contains, for each kernel:
130-
* The kernel declaration as a C++ function, all pointer arguments are emitted as `void *`, the scalar arguments maintain their type.
131-
* A `subhandler` definition, which unpacks the vector of kernel arguments coming from the SYCL runtime, and forwards only the used arguments to the kernel.
147+
* The subhandler declaration as a C++ function.
132148
* The definition of `_pi_offload_entry_struct`, `pi_device_binary_struct` and `pi_device_binaries_struct` variables, and a call to `__sycl_register_lib`, which allows to register the kernel to the sycl runtime (the call to `__sycl_register_lib` is performed at program startup via the constructor of a global). The Native CPU integration header is always named `<main-sycl-int-header>.hc`.
133149

134150
The Native CPU integration header for our example is:
135151

136152
```c++
137-
extern "C" void _Z6Sample(void *, void *, void *, nativecpu_state *);
138-
139-
inline static void _Z6Samplesubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, nativecpu_state *state) {
140-
void* arg0 = MArgs[0].getPtr();
141-
void* arg1 = MArgs[1].getPtr();
142-
void* arg2 = MArgs[2].getPtr();
143-
_Z6Sample(arg0, arg1, arg2, state);
144-
};
153+
extern "C" void _Z6Samplesubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, nativecpu_state *state);
145154

146155
static _pi_offload_entry_struct _pi_offload_entry_struct_Z6Sample{(void*)&_Z6Samplesubhandler, const_cast<char*>("_Z6Sample"), 1, 0, 0 };
147156
static pi_device_binary_struct pi_device_binary_struct_Z6Sample{0, 4, 0, __SYCL_PI_DEVICE_BINARY_TARGET_UNKNOWN, nullptr, nullptr, nullptr, nullptr, (unsigned char*)&_Z6Samplesubhandler, (unsigned char*)&_Z6Samplesubhandler + 1, &_pi_offload_entry_struct_Z6Sample, &_pi_offload_entry_struct_Z6Sample+1, nullptr, nullptr };

sycl/include/sycl/detail/native_cpu.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ namespace detail {
1818
struct NativeCPUArgDesc {
1919
void *MPtr;
2020

21-
void *getPtr() const { return MPtr; }
2221
NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){};
2322
};
2423

sycl/plugins/native_cpu/pi_native_cpu.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,6 +1385,17 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer, pi_queue, pi_uint32,
13851385
DIE_NO_IMPLEMENTATION;
13861386
}
13871387

1388+
pi_result piextEnablePeerAccess(pi_device, pi_device) { DIE_NO_IMPLEMENTATION; }
1389+
1390+
pi_result piextDisablePeerAccess(pi_device, pi_device) {
1391+
DIE_NO_IMPLEMENTATION;
1392+
}
1393+
1394+
pi_result piextPeerAccessGetInfo(pi_device, pi_device, pi_peer_attr, size_t,
1395+
void *, size_t *) {
1396+
DIE_NO_IMPLEMENTATION;
1397+
}
1398+
13881399
pi_result piTearDown(void *) {
13891400
// Todo: is it fine as a no-op?
13901401
return PI_SUCCESS;

0 commit comments

Comments
 (0)