@@ -420,7 +420,7 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
420
420
assert (PtrStorage && " Invalid pointer storage" );
421
421
422
422
*PtrStorage = MemoryManager->allocate (Size, nullptr );
423
- if (*PtrStorage == nullptr )
423
+ if (Size && *PtrStorage == nullptr )
424
424
return Plugin::error (ErrorCode::OUT_OF_RESOURCES,
425
425
" failure to allocate from AMDGPU memory manager" );
426
426
@@ -429,8 +429,6 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
429
429
430
430
/// Release an allocation to be reused.
431
431
Error deallocate (void *Ptr) {
432
- assert (Ptr && " Invalid pointer" );
433
-
434
432
if (MemoryManager->free (Ptr))
435
433
return Plugin::error (ErrorCode::UNKNOWN,
436
434
" failure to deallocate from AMDGPU memory manager" );
@@ -3365,7 +3363,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3365
3363
KernelLaunchParamsTy LaunchParams,
3366
3364
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3367
3365
if (ArgsSize != LaunchParams.Size &&
3368
- ArgsSize != LaunchParams.Size + getImplicitArgsSize ())
3366
+ ArgsSize > LaunchParams.Size + getImplicitArgsSize ())
3369
3367
return Plugin::error (ErrorCode::INVALID_ARGUMENT,
3370
3368
" mismatch of kernel arguments size" );
3371
3369
@@ -3401,23 +3399,39 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3401
3399
if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
3402
3400
return Err;
3403
3401
3404
- hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr ;
3405
- if (ArgsSize == LaunchParams.Size + getImplicitArgsSize ()) {
3406
- ImplArgs = reinterpret_cast <hsa_utils::AMDGPUImplicitArgsTy *>(
3407
- utils::advancePtr (AllArgs, LaunchParams.Size ));
3408
-
3409
- // Set the COV5+ implicit arguments to the appropriate values.
3410
- std::memset (ImplArgs, 0 , getImplicitArgsSize ());
3411
- ImplArgs->BlockCountX = NumBlocks[0 ];
3412
- ImplArgs->BlockCountY = NumBlocks[1 ];
3413
- ImplArgs->BlockCountZ = NumBlocks[2 ];
3414
- ImplArgs->GroupSizeX = NumThreads[0 ];
3415
- ImplArgs->GroupSizeY = NumThreads[1 ];
3416
- ImplArgs->GroupSizeZ = NumThreads[2 ];
3417
- ImplArgs->GridDims = NumBlocks[2 ] * NumThreads[2 ] > 1
3418
- ? 3
3419
- : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 );
3420
- ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem ;
3402
+ uint64_t ImplArgsOffset = utils::roundUp (
3403
+ LaunchParams.Size , alignof (hsa_utils::AMDGPUImplicitArgsTy));
3404
+ if (ArgsSize > ImplArgsOffset) {
3405
+ hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
3406
+ reinterpret_cast <hsa_utils::AMDGPUImplicitArgsTy *>(
3407
+ utils::advancePtr (AllArgs, ImplArgsOffset));
3408
+
3409
+ // Set the COV5+ implicit arguments to the appropriate values if present.
3410
+ uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
3411
+ std::memset (ImplArgs, 0 , ImplArgsSize);
3412
+
3413
+ using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
3414
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
3415
+ NumBlocks[0 ]);
3416
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
3417
+ NumBlocks[1 ]);
3418
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
3419
+ NumBlocks[2 ]);
3420
+
3421
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
3422
+ NumThreads[0 ]);
3423
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
3424
+ NumThreads[1 ]);
3425
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
3426
+ NumThreads[2 ]);
3427
+
3428
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
3429
+ NumBlocks[2 ] * NumThreads[2 ] > 1
3430
+ ? 3
3431
+ : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 ));
3432
+
3433
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
3434
+ KernelArgs.DynCGroupMem );
3421
3435
}
3422
3436
3423
3437
// Push the kernel launch into the stream.
0 commit comments