@@ -632,12 +632,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
632
632
return Err;
633
633
}
634
634
635
- // Get the currently set dynamic stack size from the device as 32bit value.
636
- assert (Device.getTargetStackSize () <=
637
- std::numeric_limits<uint32_t >::max () &&
638
- " AMDGPU Private Address Space may not exceed 32bit range" );
639
- TargetStackSize = static_cast <uint32_t >(Device.getTargetStackSize ());
640
-
641
635
// Make sure it is a kernel symbol.
642
636
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
643
637
return Plugin::error (" Symbol %s is not a kernel function" );
@@ -714,10 +708,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
714
708
uint32_t getPrivateSize () const { return PrivateSize; }
715
709
uint16_t getConstWGSize () const { return ConstWGSize; }
716
710
717
- // Get dynamic callstack information.
718
- bool hasDynamicCallstack () const { return DynCallstack; }
719
- uint32_t getTargetStackSize () const { return TargetStackSize; }
720
-
721
711
// / Get the HSA kernel object representing the kernel function.
722
712
uint64_t getKernelObject () const { return KernelObject; }
723
713
@@ -738,14 +728,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
738
728
uint32_t PrivateSize;
739
729
bool DynamicStack;
740
730
741
- // The kernel meta data if a dynamic callstack is being used.
742
- bool DynCallstack;
743
-
744
- // The dynamic / target callstack size (from environmental variable).
745
- // Note: While the EnVar is provided as uint64_t, the amdgpu private address
746
- // space uses 32 bit.
747
- uint32_t TargetStackSize;
748
-
749
731
// / The size of implicit kernel arguments.
750
732
uint32_t ImplicitArgsSize;
751
733
@@ -1161,7 +1143,7 @@ struct AMDGPUQueueTy {
1161
1143
// / signal and can define an optional input signal (nullptr if none).
1162
1144
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
1163
1145
uint32_t NumThreads, uint64_t NumBlocks,
1164
- uint32_t GroupSize, uint64_t StackSize,
1146
+ uint32_t GroupSize, uint32_t StackSize,
1165
1147
AMDGPUSignalTy *OutputSignal,
1166
1148
AMDGPUSignalTy *InputSignal) {
1167
1149
assert (OutputSignal && " Invalid kernel output signal" );
@@ -1200,7 +1182,8 @@ struct AMDGPUQueueTy {
1200
1182
Packet->grid_size_y = 1 ;
1201
1183
Packet->grid_size_z = 1 ;
1202
1184
Packet->private_segment_size =
1203
- Kernel.usesDynamicStack () ? StackSize : Kernel.getPrivateSize ();
1185
+ Kernel.usesDynamicStack () ? std::max (Kernel.getPrivateSize (), StackSize)
1186
+ : Kernel.getPrivateSize ();
1204
1187
Packet->group_segment_size = GroupSize;
1205
1188
Packet->kernel_object = Kernel.getKernelObject ();
1206
1189
Packet->kernarg_address = KernelArgs;
@@ -1735,7 +1718,7 @@ struct AMDGPUStreamTy {
1735
1718
// / the kernel args buffer to the specified memory manager.
1736
1719
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
1737
1720
uint32_t NumThreads, uint64_t NumBlocks,
1738
- uint32_t GroupSize, uint64_t StackSize,
1721
+ uint32_t GroupSize, uint32_t StackSize,
1739
1722
AMDGPUMemoryManagerTy &MemoryManager) {
1740
1723
if (Queue == nullptr )
1741
1724
return Plugin::error (" Target queue was nullptr" );
@@ -2566,6 +2549,24 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2566
2549
else
2567
2550
return Plugin::error (" Unexpected AMDGPU wavefront %d" , WavefrontSize);
2568
2551
2552
+ // To determine the correct scratch memory size per thread, we need to check
2553
+ // the device architecture generation. Hence, we slice the major GFX version
2554
+ // from the agent info (e.g. 'gfx90a' -> 9).
2555
+ StringRef Arch (ComputeUnitKind);
2556
+ unsigned GfxGen = 0u ;
2557
+ if (!llvm::to_integer (Arch.slice (sizeof (" gfx" ) - 1 , Arch.size () - 2 ),
2558
+ GfxGen))
2559
+ return Plugin::error (" Invalid GFX architecture string" );
2560
+
2561
+ // TODO: Will try to eliminate this calculation, since it is duplicated.
2562
+ // See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
2563
+ // But we need to divide by WavefrontSize.
2564
+ // For generations pre-gfx11: use 13-bit field in units of 256-dword,
2565
+ // otherwise: 15-bit field in units of 64-dword.
2566
+ MaxThreadScratchSize = (GfxGen < 11 )
2567
+ ? ((256 * 4 ) / WavefrontSize) * ((1 << 13 ) - 1 )
2568
+ : ((64 * 4 ) / WavefrontSize) * ((1 << 15 ) - 1 );
2569
+
2569
2570
// Get maximum number of workitems per workgroup.
2570
2571
uint16_t WorkgroupMaxDim[3 ];
2571
2572
if (auto Err =
@@ -3416,7 +3417,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3416
3417
return Plugin::success ();
3417
3418
}
3418
3419
Error setDeviceStackSize (uint64_t Value) override {
3419
- StackSize = Value;
3420
+ if (Value > MaxThreadScratchSize) {
3421
+ // Cap device scratch size.
3422
+ MESSAGE (" Scratch memory size will be set to %d. Reason: Requested size "
3423
+ " %ld would exceed available resources." ,
3424
+ MaxThreadScratchSize, Value);
3425
+ StackSize = MaxThreadScratchSize;
3426
+ } else {
3427
+ // Apply device scratch size, since it is within limits.
3428
+ StackSize = Value;
3429
+ }
3430
+
3420
3431
return Plugin::success ();
3421
3432
}
3422
3433
Error getDeviceHeapSize (uint64_t &Value) override {
@@ -3663,7 +3674,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3663
3674
3664
3675
// / The current size of the stack that will be used in cases where it could
3665
3676
// / not be statically determined.
3666
- uint64_t StackSize = 16 * 1024 /* 16 KB */ ;
3677
+ // / Default: 1024, in conformity to hipLimitStackSize.
3678
+ uint32_t StackSize = 1024 /* 1 KB */ ;
3679
+
3680
+ // The maximum scratch memory size per thread.
3681
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
3682
+ uint32_t MaxThreadScratchSize;
3667
3683
};
3668
3684
3669
3685
Error AMDGPUDeviceImageTy::loadExecutable (const AMDGPUDeviceTy &Device) {
@@ -4375,7 +4391,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
4375
4391
4376
4392
// Push the kernel launch into the stream.
4377
4393
return Stream->pushKernelLaunch (*this , AllArgs, NumThreads, NumBlocks,
4378
- GroupSize, StackSize, ArgsMemoryManager);
4394
+ GroupSize, static_cast <uint32_t >(StackSize),
4395
+ ArgsMemoryManager);
4379
4396
}
4380
4397
4381
4398
void AMDGPUKernelTy::printAMDOneLineKernelTrace (GenericDeviceTy &GenericDevice,
0 commit comments