@@ -475,6 +475,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
475
475
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
476
476
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
477
477
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
478
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
478
479
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
479
480
480
481
for (auto &Info : RequiredInfos) {
@@ -524,6 +525,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
524
525
/// @return the size of the implicit kernel arguments: 56 for cov4 and 256 for cov5
525
526
uint32_t getImplicitArgsSize () const { return ImplicitArgsSize; }
526
527
528
+ /// Indicates whether the kernel reports a dynamic call stack, i.e. whether or not we need to set up our own private segment size at launch.
529
+ bool usesDynamicStack () const { return DynamicStack; }
530
+
527
531
private:
528
532
/// The kernel object to execute.
529
533
uint64_t KernelObject;
@@ -532,6 +536,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
532
536
uint32_t ArgsSize;
533
537
uint32_t GroupSize;
534
538
uint32_t PrivateSize;
539
+ bool DynamicStack;
535
540
536
541
/// The size of implicit kernel arguments.
537
542
uint32_t ImplicitArgsSize;
@@ -660,7 +665,8 @@ struct AMDGPUQueueTy {
660
665
/// signal and can define an optional input signal (nullptr if none).
661
666
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
662
667
uint32_t NumThreads, uint64_t NumBlocks,
663
- uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
668
+ uint32_t GroupSize, uint64_t StackSize,
669
+ AMDGPUSignalTy *OutputSignal,
664
670
AMDGPUSignalTy *InputSignal) {
665
671
assert (OutputSignal && " Invalid kernel output signal" );
666
672
@@ -697,7 +703,8 @@ struct AMDGPUQueueTy {
697
703
Packet->grid_size_x = NumBlocks * NumThreads;
698
704
Packet->grid_size_y = 1 ;
699
705
Packet->grid_size_z = 1 ;
700
- Packet->private_segment_size = Kernel.getPrivateSize ();
706
+ Packet->private_segment_size =
707
+ Kernel.usesDynamicStack () ? StackSize : Kernel.getPrivateSize ();
701
708
Packet->group_segment_size = GroupSize;
702
709
Packet->kernel_object = Kernel.getKernelObject ();
703
710
Packet->kernarg_address = KernelArgs;
@@ -1166,7 +1173,7 @@ struct AMDGPUStreamTy {
1166
1173
/// the kernel args buffer to the specified memory manager.
1167
1174
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
1168
1175
uint32_t NumThreads, uint64_t NumBlocks,
1169
- uint32_t GroupSize,
1176
+ uint32_t GroupSize, uint64_t StackSize,
1170
1177
AMDGPUMemoryManagerTy &MemoryManager) {
1171
1178
if (Queue == nullptr )
1172
1179
return Plugin::error (" Target queue was nullptr" );
@@ -1189,7 +1196,8 @@ struct AMDGPUStreamTy {
1189
1196
1190
1197
// Push the kernel with the output signal and an input signal (optional)
1191
1198
return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1192
- GroupSize, OutputSignal, InputSignal);
1199
+ GroupSize, StackSize, OutputSignal,
1200
+ InputSignal);
1193
1201
}
1194
1202
1195
1203
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -2610,10 +2618,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2610
2618
2611
2619
/// Getters and setters for stack and heap sizes.
2612
2620
/// Report the device's current StackSize through Value, i.e. the stack size
/// used when a kernel's stack could not be statically determined. Always
/// returns Plugin::success().
Error getDeviceStackSize (uint64_t &Value) override {
2613
- Value = 0 ;
2621
+ Value = StackSize ;
2614
2622
return Plugin::success ();
2615
2623
}
2616
2624
/// Store Value as the device's StackSize. The value is stored as given, with
/// no validation, and the call always returns Plugin::success().
/// NOTE(review): StackSize is 64-bit and is later assigned to
/// Packet->private_segment_size for dynamic-stack kernels -- confirm that
/// field's width and whether Value should be range-checked here.
Error setDeviceStackSize (uint64_t Value) override {
2625
+ StackSize = Value;
2617
2626
return Plugin::success ();
2618
2627
}
2619
2628
Error getDeviceHeapSize (uint64_t &Value) override {
@@ -2769,6 +2778,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2769
2778
2770
2779
/// The current size of the global device memory pool (managed by us).
2771
2780
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */ ;
2781
+
2782
+ /// The current size of the stack that will be used in cases where it could
2783
+ /// not be statically determined.
2784
+ uint64_t StackSize = 16 * 1024 /* 16 KB */ ;
2772
2785
};
2773
2786
2774
2787
Error AMDGPUDeviceImageTy::loadExecutable (const AMDGPUDeviceTy &Device) {
@@ -3142,6 +3155,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3142
3155
GroupSize += MaxDynCGroupMem;
3143
3156
}
3144
3157
3158
+ uint64_t StackSize;
3159
+ if (auto Err = GenericDevice.getDeviceStackSize (StackSize))
3160
+ return Err;
3161
+
3145
3162
// Initialize implicit arguments.
3146
3163
utils::AMDGPUImplicitArgsTy *ImplArgs =
3147
3164
reinterpret_cast <utils::AMDGPUImplicitArgsTy *>(
@@ -3180,7 +3197,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3180
3197
3181
3198
// Push the kernel launch into the stream.
3182
3199
return Stream->pushKernelLaunch (*this , AllArgs, NumThreads, NumBlocks,
3183
- GroupSize, ArgsMemoryManager);
3200
+ GroupSize, StackSize, ArgsMemoryManager);
3184
3201
}
3185
3202
3186
3203
Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
0 commit comments