@@ -436,6 +436,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
436
436
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
437
437
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
438
438
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
439
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
439
440
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
440
441
441
442
for (auto &Info : RequiredInfos) {
@@ -485,6 +486,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
485
486
/// @return 56 for cov4 and 256 for cov5
486
487
uint32_t getImplicitArgsSize() const {
  // Size of the implicit kernel arguments recorded for this kernel.
  return ImplicitArgsSize;
}
487
488
489
+ /// Whether the kernel reports a dynamic call stack, i.e. the launch must supply its own private segment size.
490
+ bool usesDynamicStack () const { return DynamicStack; }
491
+
488
492
private:
489
493
/// The kernel object to execute.
490
494
uint64_t KernelObject;
@@ -493,6 +497,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
493
497
uint32_t ArgsSize;
494
498
uint32_t GroupSize;
495
499
uint32_t PrivateSize;
500
+ bool DynamicStack;
496
501
497
502
/// The size of implicit kernel arguments.
498
503
uint32_t ImplicitArgsSize;
@@ -621,7 +626,8 @@ struct AMDGPUQueueTy {
621
626
/// signal and can define an optional input signal (nullptr if none).
622
627
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
623
628
uint32_t NumThreads, uint64_t NumBlocks,
624
- uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
629
+ uint32_t GroupSize, uint64_t StackSize,
630
+ AMDGPUSignalTy *OutputSignal,
625
631
AMDGPUSignalTy *InputSignal) {
626
632
assert (OutputSignal && " Invalid kernel output signal" );
627
633
@@ -658,7 +664,8 @@ struct AMDGPUQueueTy {
658
664
Packet->grid_size_x = NumBlocks * NumThreads;
659
665
Packet->grid_size_y = 1 ;
660
666
Packet->grid_size_z = 1 ;
661
- Packet->private_segment_size = Kernel.getPrivateSize ();
667
+ Packet->private_segment_size =
668
+ Kernel.usesDynamicStack () ? StackSize : Kernel.getPrivateSize ();
662
669
Packet->group_segment_size = GroupSize;
663
670
Packet->kernel_object = Kernel.getKernelObject ();
664
671
Packet->kernarg_address = KernelArgs;
@@ -1124,7 +1131,7 @@ struct AMDGPUStreamTy {
1124
1131
/// the kernel args buffer to the specified memory manager.
1125
1132
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
1126
1133
uint32_t NumThreads, uint64_t NumBlocks,
1127
- uint32_t GroupSize,
1134
+ uint32_t GroupSize, uint64_t StackSize,
1128
1135
AMDGPUMemoryManagerTy &MemoryManager) {
1129
1136
if (Queue == nullptr )
1130
1137
return Plugin::error (" Target queue was nullptr" );
@@ -1147,7 +1154,8 @@ struct AMDGPUStreamTy {
1147
1154
1148
1155
// Push the kernel with the output signal and an input signal (optional)
1149
1156
return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1150
- GroupSize, OutputSignal, InputSignal);
1157
+ GroupSize, StackSize, OutputSignal,
1158
+ InputSignal);
1151
1159
}
1152
1160
1153
1161
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -2574,10 +2582,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2574
2582
2575
2583
/// Getters and setters for stack and heap sizes.
2576
2584
/// Write the device's current StackSize into Value and return success.
/// (The removed `-` line reported 0 unconditionally.)
Error getDeviceStackSize (uint64_t &Value) override {
2577
- Value = 0 ;
2585
+ Value = StackSize ;
2578
2586
return Plugin::success ();
2579
2587
}
2580
2588
/// Store Value as the device's StackSize and return success.
/// NOTE(review): Value is stored unchecked, and the launch path later assigns
/// it to Packet->private_segment_size -- confirm the field width and whether
/// an upper bound should be enforced here.
Error setDeviceStackSize (uint64_t Value) override {
2589
+ StackSize = Value;
2581
2590
return Plugin::success ();
2582
2591
}
2583
2592
Error getDeviceHeapSize (uint64_t &Value) override {
@@ -2728,6 +2737,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2728
2737
2729
2738
/// The current size of the global device memory pool (managed by us).
2730
2739
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */ ;
2740
+
2741
+ /// The current size of the stack that will be used in cases where it could
2742
+ /// not be statically determined.
2743
+ uint64_t StackSize = 16 * 1024 /* 16 KB */ ;
2731
2744
};
2732
2745
2733
2746
Error AMDGPUDeviceImageTy::loadExecutable (const AMDGPUDeviceTy &Device) {
@@ -3100,6 +3113,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3100
3113
GroupSize += MaxDynCGroupMem;
3101
3114
}
3102
3115
3116
+ uint64_t StackSize;
3117
+ if (auto Err = GenericDevice.getDeviceStackSize (StackSize))
3118
+ return Err;
3119
+
3103
3120
// Initialize implicit arguments.
3104
3121
utils::AMDGPUImplicitArgsTy *ImplArgs =
3105
3122
reinterpret_cast <utils::AMDGPUImplicitArgsTy *>(
@@ -3138,7 +3155,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3138
3155
3139
3156
// Push the kernel launch into the stream.
3140
3157
return Stream->pushKernelLaunch (*this , AllArgs, NumThreads, NumBlocks,
3141
- GroupSize, ArgsMemoryManager);
3158
+ GroupSize, StackSize, ArgsMemoryManager);
3142
3159
}
3143
3160
3144
3161
Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
0 commit comments