16
16
#include " llvm/ExecutionEngine/Orc/LLJIT.h"
17
17
#include " llvm/Support/Error.h"
18
18
19
+ #include " mlir/Dialect/DLTI/DLTI.h"
19
20
#include " mlir/Dialect/Func/IR/FuncOps.h"
20
21
#include " mlir/Dialect/LLVMIR/LLVMDialect.h"
22
+ #include " mlir/Interfaces/DataLayoutInterfaces.h"
21
23
#include " mlir/Pass/PassManager.h"
22
24
23
25
namespace mlir ::gc::gpu {
@@ -126,10 +128,9 @@ struct Kernel {
126
128
127
129
explicit Kernel (cl_program program, cl_kernel kernel, const size_t *gridSize,
128
130
const size_t *blockSize, size_t argNum, const size_t *argSize)
129
- : program(program),
130
- kernel(kernel), globalSize{gridSize[0 ] * blockSize[0 ],
131
- gridSize[1 ] * blockSize[1 ],
132
- gridSize[2 ] * blockSize[2 ]},
131
+ : program(program), kernel(kernel),
132
+ globalSize{gridSize[0 ] * blockSize[0 ], gridSize[1 ] * blockSize[1 ],
133
+ gridSize[2 ] * blockSize[2 ]},
133
134
localSize{blockSize[0 ], blockSize[1 ], blockSize[2 ]},
134
135
argSize (argSize, argSize + argNum) {
135
136
#ifndef NDEBUG
@@ -148,10 +149,12 @@ struct Kernel {
148
149
}
149
150
150
151
// Releases the OpenCL kernel first and then its program, logging each
// release. When `kernel` is null nothing is released (the program is
// skipped as well).
~Kernel() {
  if (kernel == nullptr) {
    return;
  }
  CL_CHECKR(clReleaseKernel(kernel), " Failed to release OpenCL kernel.");
  gcLogD(" Released OpenCL kernel: ", kernel);
  CL_CHECKR(clReleaseProgram(program), " Failed to release OpenCL program.");
  gcLogD(" Released OpenCL program: ", program);
}
156
159
};
157
160
@@ -220,7 +223,14 @@ struct OclRuntime::Exports {
220
223
gcLogD (" The program has been built: " , program);
221
224
222
225
auto kernel = clCreateKernel (program, name, &err);
223
- CL_CHECKR (err, " Failed to create OpenCL kernel from program: " , program);
226
+ if (err != CL_SUCCESS) {
227
+ // This is a special case, handled by OclModuleBuilder::build(), that
228
+ // allows rebuilding the kernel with different options in case of failure.
229
+ clReleaseProgram (program);
230
+ gcLogD (" OpenCL error " , err,
231
+ " : Failed to create OpenCL kernel from program: " , program);
232
+ return new Kernel (nullptr , nullptr , gridSize, blockSize, argNum, argSize);
233
+ }
224
234
gcLogD (" Created new OpenCL kernel " , kernel, " from program " , program);
225
235
226
236
cl_bool enable = CL_TRUE;
@@ -639,8 +649,7 @@ void OclContext::setLastEvent(cl_event event) {
639
649
}
640
650
}
641
651
642
- OclModule::~OclModule () {
643
- assert (engine);
652
+ static void destroyKernels (const std::unique_ptr<ExecutionEngine> &engine) {
644
653
auto fn = engine->lookup (GPU_OCL_MOD_DESTRUCTOR);
645
654
if (fn) {
646
655
reinterpret_cast <void (*)()>(fn.get ())();
@@ -649,13 +658,19 @@ OclModule::~OclModule() {
649
658
}
650
659
}
651
660
661
+ OclModule::~OclModule () {
662
+ assert (engine);
663
+ destroyKernels (engine);
664
+ }
665
+
652
666
// If all arguments of 'origFunc' are memrefs with static shape, create a new
653
667
// function called gcGpuOclStaticMain, that accepts 2 arguments: a pointer to
654
668
// OclContext and a pointer to an array, containing pointers to aligned memory
655
669
// buffers. The function will call the original function with the context,
656
670
// buffers and the offset/shape/strides, statically created from the
657
671
// memref descriptor.
658
- StringRef createStaticMain (ModuleOp &module , const StringRef &funcName,
672
+ StringRef createStaticMain (OpBuilder &builder, ModuleOp &module ,
673
+ const StringRef &funcName,
659
674
const ArrayRef<Type> argTypes) {
660
675
auto mainFunc = module .lookupSymbol <LLVM::LLVMFuncOp>(funcName);
661
676
if (!mainFunc) {
@@ -670,11 +685,8 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
670
685
" ' must have an least 3 arguments." );
671
686
}
672
687
673
- auto ctx = module .getContext ();
674
- ctx->getOrLoadDialect <LLVM::LLVMDialect>();
675
- OpBuilder builder (ctx);
676
688
auto i64Type = builder.getI64Type ();
677
- auto ptrType = LLVM::LLVMPointerType::get (ctx );
689
+ auto ptrType = LLVM::LLVMPointerType::get (builder. getContext () );
678
690
679
691
if (mainArgTypes[nargs - 3 ] != ptrType ||
680
692
mainArgTypes[nargs - 2 ] != ptrType ||
@@ -722,7 +734,7 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
722
734
auto loc = mainFunc.getLoc ();
723
735
auto newFuncType = LLVM::LLVMFunctionType::get (
724
736
mainFunc.getNumResults () ? mainFunc->getResult (0 ).getType ()
725
- : LLVM::LLVMVoidType::get (ctx ),
737
+ : LLVM::LLVMVoidType::get (builder. getContext () ),
726
738
{ptrType, ptrType});
727
739
auto newFunc =
728
740
OpBuilder::atBlockEnd (module .getBody ())
@@ -848,17 +860,58 @@ OclModuleBuilder::build(cl_device_id device, cl_context context) {
848
860
849
861
llvm::Expected<std::shared_ptr<const OclModule>>
850
862
OclModuleBuilder::build (const OclRuntime::Ext &ext) {
851
- auto mod = mlirModule.clone ();
852
- PassManager pm{mod.getContext ()};
853
- pipeline (pm);
854
- CHECK (!pm.run (mod).failed (), " GPU pipeline failed!" );
863
+ auto ctx = mlirModule.getContext ();
864
+ ctx->getOrLoadDialect <DLTIDialect>();
865
+ ctx->getOrLoadDialect <LLVM::LLVMDialect>();
866
+ OpBuilder builder (ctx);
867
+ DataLayoutEntryInterface dltiAttrs[6 ];
855
868
856
- auto staticMain = createStaticMain (mod, funcName, argTypes);
869
+ {
870
+ struct DevInfo {
871
+ cl_device_info key;
872
+ const char *attrName;
873
+ };
874
+ DevInfo devInfo[]{
875
+ {CL_DEVICE_MAX_COMPUTE_UNITS, " num_exec_units" },
876
+ {CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, " num_exec_units_per_slice" },
877
+ {CL_DEVICE_NUM_THREADS_PER_EU_INTEL, " num_threads_per_eu" },
878
+ // Assuming the cache size is equal to the local mem
879
+ {CL_DEVICE_LOCAL_MEM_SIZE, " L1_cache_size_in_bytes" },
880
+ };
857
881
858
- if (printIr) {
859
- mod.dump ();
860
- }
882
+ unsigned i = 0 ;
883
+ for (auto &[key, attrName] : devInfo) {
884
+ int64_t value = 0 ;
885
+ CL_CHECK (
886
+ clGetDeviceInfo (ext.device , key, sizeof (cl_ulong), &value, nullptr ),
887
+ " Failed to get the device property " , attrName);
888
+ gcLogD (" Device property " , attrName, " =" , value);
889
+ dltiAttrs[i++] =
890
+ DataLayoutEntryAttr::get (ctx, builder.getStringAttr (attrName),
891
+ builder.getI64IntegerAttr (value));
892
+ }
861
893
894
+ // There is no corresponding property in the OpenCL API, using the
895
+ // hardcoded value.
896
+ // TODO: Get the real value.
897
+ dltiAttrs[i] = DataLayoutEntryAttr::get (
898
+ ctx, builder.getStringAttr (" max_vector_op_width" ),
899
+ builder.getI64IntegerAttr (512 ));
900
+ }
901
+
902
+ OclRuntime rt (ext);
903
+ auto expectedQueue = rt.createQueue ();
904
+ CHECKE (expectedQueue, " Failed to create queue!" );
905
+ struct OclQueue {
906
+ cl_command_queue queue;
907
+ ~OclQueue () { clReleaseCommandQueue (queue); }
908
+ } queue{*expectedQueue};
909
+ OclContext oclCtx{rt, queue.queue , false };
910
+
911
+ ModuleOp mod;
912
+ StringRef staticMain;
913
+ std::unique_ptr<ExecutionEngine> eng;
914
+ auto devStr = builder.getStringAttr (" GPU" /* device ID*/ );
862
915
ExecutionEngineOptions opts;
863
916
opts.jitCodeGenOptLevel = llvm::CodeGenOptLevel::Aggressive;
864
917
opts.enableObjectDump = enableObjectDump;
@@ -868,18 +921,86 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
868
921
opts.enablePerfNotificationListener = false ;
869
922
#endif
870
923
871
- auto eng = ExecutionEngine::create (mod, opts);
872
- CHECKE (eng, " Failed to create ExecutionEngine!" );
873
- eng->get ()->registerSymbols (OclRuntime::Exports::symbolMap);
924
+ // Build the module and check the kernels workgroup size. If the workgroup
925
+ // size is different, rebuild the module with the new size.
926
+ for (size_t wgSize = 64 , maxSize = std::numeric_limits<size_t >::max ();;) {
927
+ dltiAttrs[sizeof (dltiAttrs) / sizeof (DataLayoutEntryInterface) - 1 ] =
928
+ DataLayoutEntryAttr::get (
929
+ ctx, builder.getStringAttr (" max_work_group_size" ),
930
+ builder.getI64IntegerAttr (static_cast <int64_t >(wgSize)));
931
+ TargetDeviceSpecInterface devSpec =
932
+ TargetDeviceSpecAttr::get (ctx, dltiAttrs);
933
+ auto sysSpec =
934
+ TargetSystemSpecAttr::get (ctx, ArrayRef (std::pair (devStr, devSpec)));
935
+ mod = mlirModule.clone ();
936
+ mod.getOperation ()->setAttr (" #dlti.sys_spec" , sysSpec);
937
+ PassManager pm{ctx};
938
+ pipeline (pm);
939
+ CHECK (!pm.run (mod).failed (), " GPU pipeline failed!" );
940
+ staticMain = createStaticMain (builder, mod, funcName, argTypes);
941
+ auto expectedEng = ExecutionEngine::create (mod, opts);
942
+ CHECKE (expectedEng, " Failed to create ExecutionEngine!" );
943
+ expectedEng->get ()->registerSymbols (OclRuntime::Exports::symbolMap);
944
+
945
+ // Find all kernels and query the workgroup size
946
+ size_t minSize = maxSize;
947
+ mod.walk <>([&](LLVM::LLVMFuncOp func) {
948
+ auto name = func.getName ();
949
+ if (!name.starts_with (" createGcGpuOclKernel_" )) {
950
+ return WalkResult::skip ();
951
+ }
952
+ auto fn = expectedEng.get ()->lookup (name);
953
+ if (!fn) {
954
+ gcLogE (" Function not found: " , name.data ());
955
+ return WalkResult::skip ();
956
+ }
957
+
958
+ Kernel *kernel =
959
+ reinterpret_cast <Kernel *(*)(OclContext *)>(fn.get ())(&oclCtx);
960
+
961
+ if (kernel->kernel == nullptr ) {
962
+ maxSize = wgSize / 2 ;
963
+ if (maxSize == 0 ) {
964
+ gcReportErr (" Failed to build the kernel." );
965
+ }
966
+ minSize = maxSize;
967
+ return WalkResult::interrupt ();
968
+ }
969
+
970
+ size_t s = 0 ;
971
+ auto err = clGetKernelWorkGroupInfo (kernel->kernel , ext.device ,
972
+ CL_KERNEL_WORK_GROUP_SIZE,
973
+ sizeof (size_t ), &s, nullptr );
974
+ if (err == CL_SUCCESS) {
975
+ minSize = std::min (minSize, s);
976
+ } else {
977
+ gcLogE (" Failed to get the kernel workgroup size: " , err);
978
+ }
979
+ return WalkResult::skip ();
980
+ });
981
+
982
+ if (minSize == wgSize || minSize == std::numeric_limits<size_t >::max ()) {
983
+ eng = std::move (*expectedEng);
984
+ break ;
985
+ }
986
+
987
+ destroyKernels (expectedEng.get ());
988
+ gcLogD (" Changing the workgroup size from " , wgSize, " to " , minSize);
989
+ wgSize = minSize;
990
+ }
991
+
992
+ if (printIr) {
993
+ mod.dump ();
994
+ }
874
995
875
996
OclModule::MainFunc main = {nullptr };
876
997
877
998
if (staticMain.empty ()) {
878
- auto expect = eng. get () ->lookupPacked (funcName);
999
+ auto expect = eng->lookupPacked (funcName);
879
1000
CHECKE (expect, " Packed function '" , funcName.begin (), " ' not found!" );
880
1001
main.wrappedMain = *expect;
881
1002
} else {
882
- auto expect = eng. get () ->lookup (staticMain);
1003
+ auto expect = eng->lookup (staticMain);
883
1004
CHECKE (expect, " Compiled function '" , staticMain.begin (), " ' not found!" );
884
1005
main.staticMain = reinterpret_cast <OclModule::StaticMainFunc>(*expect);
885
1006
}
@@ -889,8 +1010,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
889
1010
return it->second ;
890
1011
}
891
1012
std::shared_ptr<const OclModule> ptr (
892
- new OclModule (OclRuntime (ext), !staticMain.empty (), main, argTypes,
893
- std::move (eng.get ())));
1013
+ new OclModule (rt, !staticMain.empty (), main, argTypes, std::move (eng)));
894
1014
return cache.emplace (OclDevCtxPair (ext.device , ext.context ), ptr)
895
1015
.first ->second ;
896
1016
}
0 commit comments