@@ -666,7 +666,7 @@ struct AMDGPUQueueTy {
666
666
// / signal and can define an optional input signal (nullptr if none).
667
667
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
668
668
uint32_t NumThreads, uint64_t NumBlocks,
669
- uint32_t GroupSize, uint64_t StackSize,
669
+ uint32_t GroupSize, uint32_t StackSize,
670
670
AMDGPUSignalTy *OutputSignal,
671
671
AMDGPUSignalTy *InputSignal) {
672
672
assert (OutputSignal && " Invalid kernel output signal" );
@@ -705,7 +705,8 @@ struct AMDGPUQueueTy {
705
705
Packet->grid_size_y = 1 ;
706
706
Packet->grid_size_z = 1 ;
707
707
Packet->private_segment_size =
708
- Kernel.usesDynamicStack () ? StackSize : Kernel.getPrivateSize ();
708
+ Kernel.usesDynamicStack () ? std::max (Kernel.getPrivateSize (), StackSize)
709
+ : Kernel.getPrivateSize ();
709
710
Packet->group_segment_size = GroupSize;
710
711
Packet->kernel_object = Kernel.getKernelObject ();
711
712
Packet->kernarg_address = KernelArgs;
@@ -1174,7 +1175,7 @@ struct AMDGPUStreamTy {
1174
1175
// / the kernel args buffer to the specified memory manager.
1175
1176
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
1176
1177
uint32_t NumThreads, uint64_t NumBlocks,
1177
- uint32_t GroupSize, uint64_t StackSize,
1178
+ uint32_t GroupSize, uint32_t StackSize,
1178
1179
AMDGPUMemoryManagerTy &MemoryManager) {
1179
1180
if (Queue == nullptr )
1180
1181
return Plugin::error (" Target queue was nullptr" );
@@ -1872,6 +1873,25 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1872
1873
else
1873
1874
return Plugin::error (" Unexpected AMDGPU wavefront %d" , WavefrontSize);
1874
1875
1876
+ // To determine the correct scratch memory size per thread, we need to check
1877
+ // the device architecture generation. According to AOT_OFFLOADARCHS we may
1878
+ // assume that AMDGPU offload archs are prefixed with "gfx" and suffixed
1879
+ // with a two char arch specialization. In-between is the 1-2 char
1880
+ // generation number we want to extract.
1881
+ StringRef Arch (ComputeUnitKind);
1882
+ unsigned GfxGen = 0u ;
1883
+ if (!llvm::to_integer (Arch.slice (sizeof (" gfx" ) - 1 , Arch.size () - 2 ),
1884
+ GfxGen))
1885
+ return Plugin::error (" Invalid GFX architecture string" );
1886
+
1887
+ // See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
1888
+ // But we need to divide by WavefrontSize.
1889
+ // For generations pre-gfx11: use 13-bit field in units of 256-dword,
1890
+ // otherwise: 15-bit field in units of 64-dword.
1891
+ MaxThreadScratchSize = (GfxGen < 11 )
1892
+ ? ((256 * 4 ) / WavefrontSize) * ((1 << 13 ) - 1 )
1893
+ : ((64 * 4 ) / WavefrontSize) * ((1 << 15 ) - 1 );
1894
+
1875
1895
// Get maximum number of workitems per workgroup.
1876
1896
uint16_t WorkgroupMaxDim[3 ];
1877
1897
if (auto Err =
@@ -2623,7 +2643,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2623
2643
return Plugin::success ();
2624
2644
}
2625
2645
Error setDeviceStackSize (uint64_t Value) override {
2626
- StackSize = Value;
2646
+ if (Value > MaxThreadScratchSize) {
2647
+ // Cap device scratch size.
2648
+ MESSAGE (" Scratch memory size will be set to %d. Reason: Requested size "
2649
+ " %ld would exceed available resources." ,
2650
+ MaxThreadScratchSize, Value);
2651
+ StackSize = MaxThreadScratchSize;
2652
+ } else {
2653
+ // Apply device scratch size, since it is within limits.
2654
+ StackSize = Value;
2655
+ }
2656
+
2627
2657
return Plugin::success ();
2628
2658
}
2629
2659
Error getDeviceHeapSize (uint64_t &Value) override {
@@ -2782,7 +2812,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2782
2812
2783
2813
// / The current size of the stack that will be used in cases where it could
2784
2814
// / not be statically determined.
2785
- uint64_t StackSize = 16 * 1024 /* 16 KB */ ;
2815
+ // / Default: 1024, in conformity to hipLimitStackSize.
2816
+ uint64_t StackSize = 1024 /* 1 KiB */ ;
2817
+
2818
+ // The maximum scratch memory size per thread.
2819
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
2820
+ uint32_t MaxThreadScratchSize;
2786
2821
};
2787
2822
2788
2823
Error AMDGPUDeviceImageTy::loadExecutable (const AMDGPUDeviceTy &Device) {
@@ -3198,7 +3233,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3198
3233
3199
3234
// Push the kernel launch into the stream.
3200
3235
return Stream->pushKernelLaunch (*this , AllArgs, NumThreads, NumBlocks,
3201
- GroupSize, StackSize, ArgsMemoryManager);
3236
+ GroupSize, static_cast <uint32_t >(StackSize),
3237
+ ArgsMemoryManager);
3202
3238
}
3203
3239
3204
3240
Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
0 commit comments