@@ -3267,9 +3267,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3267
3267
uint32_t NumThreads, uint64_t NumBlocks,
3268
3268
KernelArgsTy &KernelArgs, void *Args,
3269
3269
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3270
- const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof (void *);
3270
+ const uint32_t LaunchParamsSize = KernelArgs.NumArgs * sizeof (void *);
3271
3271
3272
- if (ArgsSize < KernelArgsSize)
3272
+ if (ArgsSize != LaunchParamsSize &&
3273
+ ArgsSize != LaunchParamsSize + getImplicitArgsSize ())
3273
3274
return Plugin::error (" Mismatch of kernel arguments size" );
3274
3275
3275
3276
AMDGPUPluginTy &AMDGPUPlugin =
@@ -3292,20 +3293,21 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3292
3293
if (auto Err = GenericDevice.getDeviceStackSize (StackSize))
3293
3294
return Err;
3294
3295
3295
- // Initialize implicit arguments.
3296
- utils::AMDGPUImplicitArgsTy *ImplArgs =
3297
- reinterpret_cast <utils::AMDGPUImplicitArgsTy *>(
3298
- advanceVoidPtr (AllArgs, KernelArgsSize));
3296
+ utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr ;
3297
+ if (ArgsSize == LaunchParamsSize + getImplicitArgsSize ()) {
3298
+ // Initialize implicit arguments.
3299
+ ImplArgs = reinterpret_cast <utils::AMDGPUImplicitArgsTy *>(
3300
+ advanceVoidPtr (AllArgs, LaunchParamsSize));
3299
3301
3300
- // Initialize the implicit arguments to zero.
3301
- std::memset (ImplArgs, 0 , ImplicitArgsSize);
3302
+ // Initialize the implicit arguments to zero.
3303
+ std::memset (ImplArgs, 0 , getImplicitArgsSize ());
3304
+ }
3302
3305
3303
3306
// Copy the explicit arguments.
3304
3307
// TODO: We should expose the args memory manager alloc to the common part as
3305
3308
// alternative to copying them twice.
3306
- if (KernelArgs.NumArgs )
3307
- std::memcpy (AllArgs, *static_cast <void **>(Args),
3308
- sizeof (void *) * KernelArgs.NumArgs );
3309
+ if (LaunchParamsSize)
3310
+ std::memcpy (AllArgs, *static_cast <void **>(Args), LaunchParamsSize);
3309
3311
3310
3312
AMDGPUDeviceTy &AMDGPUDevice = static_cast <AMDGPUDeviceTy &>(GenericDevice);
3311
3313
@@ -3318,7 +3320,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3318
3320
Stream->setRPCServer (GenericDevice.getRPCServer ());
3319
3321
3320
3322
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
3321
- if (getImplicitArgsSize () == sizeof (utils::AMDGPUImplicitArgsTy)) {
3323
+ if (ImplArgs &&
3324
+ getImplicitArgsSize () == sizeof (utils::AMDGPUImplicitArgsTy)) {
3322
3325
ImplArgs->BlockCountX = NumBlocks;
3323
3326
ImplArgs->BlockCountY = 1 ;
3324
3327
ImplArgs->BlockCountZ = 1 ;
0 commit comments