Skip to content

Commit 8f289df

Browse files
AndreyPavlenko authored and kurapov-peter committed
[GpuOclRuntime] Add DLTI attributes from the device info
1 parent 8b64109 commit 8f289df

File tree

3 files changed

+151
-31
lines changed

3 files changed

+151
-31
lines changed

lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp

Lines changed: 149 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
1717
#include "llvm/Support/Error.h"
1818

19+
#include "mlir/Dialect/DLTI/DLTI.h"
1920
#include "mlir/Dialect/Func/IR/FuncOps.h"
2021
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
22+
#include "mlir/Interfaces/DataLayoutInterfaces.h"
2123
#include "mlir/Pass/PassManager.h"
2224

2325
namespace mlir::gc::gpu {
@@ -148,10 +150,12 @@ struct Kernel {
148150
}
149151

150152
~Kernel() {
151-
CL_CHECKR(clReleaseKernel(kernel), "Failed to release OpenCL kernel.");
152-
gcLogD("Released OpenCL kernel: ", kernel);
153-
CL_CHECKR(clReleaseProgram(program), "Failed to release OpenCL program.");
154-
gcLogD("Released OpenCL program: ", program);
153+
if (kernel != nullptr) {
154+
CL_CHECKR(clReleaseKernel(kernel), "Failed to release OpenCL kernel.");
155+
gcLogD("Released OpenCL kernel: ", kernel);
156+
CL_CHECKR(clReleaseProgram(program), "Failed to release OpenCL program.");
157+
gcLogD("Released OpenCL program: ", program);
158+
}
155159
}
156160
};
157161

@@ -220,7 +224,14 @@ struct OclRuntime::Exports {
220224
gcLogD("The program has been built: ", program);
221225

222226
auto kernel = clCreateKernel(program, name, &err);
223-
CL_CHECKR(err, "Failed to create OpenCL kernel from program: ", program);
227+
if (err != CL_SUCCESS) {
228+
// This is a special case, handled by OclModuleBuilder::build(), that
229+
// allows rebuilding the kernel with different options in case of failure.
230+
clReleaseProgram(program);
231+
gcLogD("OpenCL error ", err,
232+
": Failed to create OpenCL kernel from program: ", program);
233+
return new Kernel(nullptr, nullptr, gridSize, blockSize, argNum, argSize);
234+
}
224235
gcLogD("Created new OpenCL kernel ", kernel, " from program ", program);
225236

226237
cl_bool enable = CL_TRUE;
@@ -639,8 +650,7 @@ void OclContext::setLastEvent(cl_event event) {
639650
}
640651
}
641652

642-
OclModule::~OclModule() {
643-
assert(engine);
653+
static void destroyKernels(const std::unique_ptr<ExecutionEngine> &engine) {
644654
auto fn = engine->lookup(GPU_OCL_MOD_DESTRUCTOR);
645655
if (fn) {
646656
reinterpret_cast<void (*)()>(fn.get())();
@@ -649,13 +659,19 @@ OclModule::~OclModule() {
649659
}
650660
}
651661

662+
OclModule::~OclModule() {
663+
assert(engine);
664+
destroyKernels(engine);
665+
}
666+
652667
// If all arguments of 'origFunc' are memrefs with static shape, create a new
653668
// function called gcGpuOclStaticMain, that accepts 2 arguments: a pointer to
654669
// OclContext and a pointer to an array, containing pointers to aligned memory
655670
// buffers. The function will call the original function with the context,
656671
// buffers and the offset/shape/strides, statically created from the
657672
// memref descriptor.
658-
StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
673+
StringRef createStaticMain(OpBuilder &builder, ModuleOp &module,
674+
const StringRef &funcName,
659675
const ArrayRef<Type> argTypes) {
660676
auto mainFunc = module.lookupSymbol<LLVM::LLVMFuncOp>(funcName);
661677
if (!mainFunc) {
@@ -670,11 +686,8 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
670686
"' must have an least 3 arguments.");
671687
}
672688

673-
auto ctx = module.getContext();
674-
ctx->getOrLoadDialect<LLVM::LLVMDialect>();
675-
OpBuilder builder(ctx);
676689
auto i64Type = builder.getI64Type();
677-
auto ptrType = LLVM::LLVMPointerType::get(ctx);
690+
auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
678691

679692
if (mainArgTypes[nargs - 3] != ptrType ||
680693
mainArgTypes[nargs - 2] != ptrType ||
@@ -722,7 +735,7 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
722735
auto loc = mainFunc.getLoc();
723736
auto newFuncType = LLVM::LLVMFunctionType::get(
724737
mainFunc.getNumResults() ? mainFunc->getResult(0).getType()
725-
: LLVM::LLVMVoidType::get(ctx),
738+
: LLVM::LLVMVoidType::get(builder.getContext()),
726739
{ptrType, ptrType});
727740
auto newFunc =
728741
OpBuilder::atBlockEnd(module.getBody())
@@ -848,17 +861,58 @@ OclModuleBuilder::build(cl_device_id device, cl_context context) {
848861

849862
llvm::Expected<std::shared_ptr<const OclModule>>
850863
OclModuleBuilder::build(const OclRuntime::Ext &ext) {
851-
auto mod = mlirModule.clone();
852-
PassManager pm{mod.getContext()};
853-
pipeline(pm);
854-
CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
864+
auto ctx = mlirModule.getContext();
865+
ctx->getOrLoadDialect<DLTIDialect>();
866+
ctx->getOrLoadDialect<LLVM::LLVMDialect>();
867+
OpBuilder builder(ctx);
868+
DataLayoutEntryInterface dltiAttrs[6];
855869

856-
auto staticMain = createStaticMain(mod, funcName, argTypes);
870+
{
871+
struct DevInfo {
872+
cl_device_info key;
873+
const char *attrName;
874+
};
875+
DevInfo devInfo[]{
876+
{CL_DEVICE_MAX_COMPUTE_UNITS, "num_exec_units"},
877+
{CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
878+
{CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
879+
// Assuming the cache size is equal to the local mem
880+
{CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
881+
};
857882

858-
if (printIr) {
859-
mod.dump();
860-
}
883+
unsigned i = 0;
884+
for (auto &[key, attrName] : devInfo) {
885+
int64_t value = 0;
886+
CL_CHECK(
887+
clGetDeviceInfo(ext.device, key, sizeof(cl_ulong), &value, nullptr),
888+
"Failed to get the device property ", attrName);
889+
gcLogD("Device property ", attrName, "=", value);
890+
dltiAttrs[i++] =
891+
DataLayoutEntryAttr::get(ctx, builder.getStringAttr(attrName),
892+
builder.getI64IntegerAttr(value));
893+
}
861894

895+
// There is no corresponding property in the OpenCL API, using the
896+
// hardcoded value.
897+
// TODO: Get the real value.
898+
dltiAttrs[i] = DataLayoutEntryAttr::get(
899+
ctx, builder.getStringAttr("max_vector_op_width"),
900+
builder.getI64IntegerAttr(512));
901+
}
902+
903+
OclRuntime rt(ext);
904+
auto expectedQueue = rt.createQueue();
905+
CHECKE(expectedQueue, "Failed to create queue!");
906+
struct OclQueue {
907+
cl_command_queue queue;
908+
~OclQueue() { clReleaseCommandQueue(queue); }
909+
} queue{*expectedQueue};
910+
OclContext oclCtx{rt, queue.queue, false};
911+
912+
ModuleOp mod;
913+
StringRef staticMain;
914+
std::unique_ptr<ExecutionEngine> eng;
915+
auto devStr = builder.getStringAttr("GPU" /* device ID*/);
862916
ExecutionEngineOptions opts;
863917
opts.jitCodeGenOptLevel = llvm::CodeGenOptLevel::Aggressive;
864918
opts.enableObjectDump = enableObjectDump;
@@ -868,18 +922,86 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
868922
opts.enablePerfNotificationListener = false;
869923
#endif
870924

871-
auto eng = ExecutionEngine::create(mod, opts);
872-
CHECKE(eng, "Failed to create ExecutionEngine!");
873-
eng->get()->registerSymbols(OclRuntime::Exports::symbolMap);
925+
// Build the module and check the kernels' workgroup size. If the workgroup
926+
// size is different, rebuild the module with the new size.
927+
for (size_t wgSize = 64, maxSize = std::numeric_limits<size_t>::max();;) {
928+
dltiAttrs[sizeof(dltiAttrs) / sizeof(DataLayoutEntryInterface) - 1] =
929+
DataLayoutEntryAttr::get(
930+
ctx, builder.getStringAttr("max_work_group_size"),
931+
builder.getI64IntegerAttr(static_cast<int64_t>(wgSize)));
932+
TargetDeviceSpecInterface devSpec =
933+
TargetDeviceSpecAttr::get(ctx, dltiAttrs);
934+
auto sysSpec =
935+
TargetSystemSpecAttr::get(ctx, ArrayRef(std::pair(devStr, devSpec)));
936+
mod = mlirModule.clone();
937+
mod.getOperation()->setAttr("#dlti.sys_spec", sysSpec);
938+
PassManager pm{ctx};
939+
pipeline(pm);
940+
CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
941+
staticMain = createStaticMain(builder, mod, funcName, argTypes);
942+
auto expectedEng = ExecutionEngine::create(mod, opts);
943+
CHECKE(expectedEng, "Failed to create ExecutionEngine!");
944+
expectedEng->get()->registerSymbols(OclRuntime::Exports::symbolMap);
945+
946+
// Find all kernels and query the workgroup size
947+
size_t minSize = maxSize;
948+
mod.walk<>([&](LLVM::LLVMFuncOp func) {
949+
auto name = func.getName();
950+
if (!name.starts_with("createGcGpuOclKernel_")) {
951+
return WalkResult::skip();
952+
}
953+
auto fn = expectedEng.get()->lookup(name);
954+
if (!fn) {
955+
gcLogE("Function not found: ", name.data());
956+
return WalkResult::skip();
957+
}
958+
959+
Kernel *kernel =
960+
reinterpret_cast<Kernel *(*)(OclContext *)>(fn.get())(&oclCtx);
961+
962+
if (kernel->kernel == nullptr) {
963+
maxSize = wgSize / 2;
964+
if (maxSize == 0) {
965+
gcReportErr("Failed to build the kernel.");
966+
}
967+
minSize = maxSize;
968+
return WalkResult::interrupt();
969+
}
970+
971+
size_t s = 0;
972+
auto err = clGetKernelWorkGroupInfo(kernel->kernel, ext.device,
973+
CL_KERNEL_WORK_GROUP_SIZE,
974+
sizeof(size_t), &s, nullptr);
975+
if (err == CL_SUCCESS) {
976+
minSize = std::min(minSize, s);
977+
} else {
978+
gcLogE("Failed to get the kernel workgroup size: ", err);
979+
}
980+
return WalkResult::skip();
981+
});
982+
983+
if (minSize == wgSize || minSize == std::numeric_limits<size_t>::max()) {
984+
eng = std::move(*expectedEng);
985+
break;
986+
}
987+
988+
destroyKernels(expectedEng.get());
989+
gcLogD("Changing the workgroup size from ", wgSize, " to ", minSize);
990+
wgSize = minSize;
991+
}
992+
993+
if (printIr) {
994+
mod.dump();
995+
}
874996

875997
OclModule::MainFunc main = {nullptr};
876998

877999
if (staticMain.empty()) {
878-
auto expect = eng.get()->lookupPacked(funcName);
1000+
auto expect = eng->lookupPacked(funcName);
8791001
CHECKE(expect, "Packed function '", funcName.begin(), "' not found!");
8801002
main.wrappedMain = *expect;
8811003
} else {
882-
auto expect = eng.get()->lookup(staticMain);
1004+
auto expect = eng->lookup(staticMain);
8831005
CHECKE(expect, "Compiled function '", staticMain.begin(), "' not found!");
8841006
main.staticMain = reinterpret_cast<OclModule::StaticMainFunc>(*expect);
8851007
}
@@ -889,8 +1011,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
8891011
return it->second;
8901012
}
8911013
std::shared_ptr<const OclModule> ptr(
892-
new OclModule(OclRuntime(ext), !staticMain.empty(), main, argTypes,
893-
std::move(eng.get())));
1014+
new OclModule(rt, !staticMain.empty(), main, argTypes, std::move(eng)));
8941015
return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
8951016
.first->second;
8961017
}

lib/gc/Transforms/GPU/GpuToGpuOcl.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,8 +381,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
381381

382382
auto function = rewriter.create<LLVM::LLVMFuncOp>(
383383
loc, funcName,
384-
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
385-
LLVM::Linkage::Internal);
384+
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}));
386385
rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));
387386

388387
auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));

test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ module @test attributes {gpu.container_module} {
3636
// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
3737
// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr
3838

39-
// CHECK: llvm.func internal @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
39+
// CHECK: llvm.func @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
4040
// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
4141
// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
4242
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr

0 commit comments

Comments
 (0)