@@ -21,114 +21,70 @@
 
 static constexpr unsigned MinBytes = 8;
 
-template <unsigned BytesPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
+template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
 struct alignas(32) ThreadStackTy {
-  static constexpr unsigned MaxSize = NThreads * BytesPerThread;
+  static constexpr unsigned BytesPerThread = BPerThread;
   static constexpr unsigned NumThreads = NThreads;
   static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
-  static constexpr unsigned MaxSizePerWarp = MaxSize / NumWarps;
 
-  unsigned char Data[MaxSize];
-  char Sizes[MaxSize / MinBytes];
-  char SizeUsage[NumWarps];
-  char Usage[NumWarps];
+  unsigned char Data[NumThreads][BytesPerThread];
+  unsigned char Usage[NumThreads];
 };
 
 [[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
 #pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
 
-[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 2,
-                                              MAX_THREADS_PER_TEAM / 8>
+[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
+                                              MAX_THREADS_PER_TEAM / 4>
     WorkerSharedStack;
 #pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
 
-template <typename AllocTy>
-static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
-                                   unsigned WarpBytes) {
-  void *Ptr;
-  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
-  unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader =
-      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
-  if (IsWarpLeader)
-    Ptr = Alloc();
-  // Get address from the first active lane.
-  int *FP = (int *)&Ptr;
-  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], LeaderID);
-  if (sizeof(Ptr) == 8)
-    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], LeaderID);
-  return (void *)&((char *)(Ptr))[(GetLaneId() - LeaderID) * Bytes];
-}
-
 EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
-  Bytes = Bytes + (Bytes % MinBytes);
+  size_t AlignedBytes = Bytes + (Bytes % MinBytes);
   int TID = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(TID)) {
     // Main thread alone, use shared memory if space is available.
-    if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
-      void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
-      MainSharedStack.Usage[0] += Bytes;
-      MainSharedStack.Sizes[MainSharedStack.SizeUsage[0]++] = Bytes;
+    if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
+      void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
+      MainSharedStack.Usage[0] += AlignedBytes;
       return Ptr;
     }
-  } else {
-    int WID = GetWarpId();
-    unsigned WarpBytes = Bytes * WARPSIZE;
-    auto AllocSharedStack = [&]() {
-      unsigned WarpOffset = WID * WorkerSharedStack.MaxSizePerWarp;
-      void *Ptr =
-          &WorkerSharedStack.Data[WarpOffset + WorkerSharedStack.Usage[WID]];
-      WorkerSharedStack.Usage[WID] += WarpBytes;
-      WorkerSharedStack.Sizes[WorkerSharedStack.SizeUsage[WID]++] = WarpBytes;
+  } else if (TID < WorkerSharedStack.NumThreads) {
+    if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
+      void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
+      WorkerSharedStack.Usage[TID] += AlignedBytes;
       return Ptr;
-    };
-    if (TID < WorkerSharedStack.NumThreads &&
-        WorkerSharedStack.Usage[WID] + WarpBytes <=
-            WorkerSharedStack.MaxSizePerWarp)
-      return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
+    }
   }
   // Fallback to malloc
-  unsigned WarpBytes = Bytes * WARPSIZE;
-  auto AllocGlobal = [&] {
-    return SafeMalloc(WarpBytes, "AllocGlobalFallback");
-  };
-  return __kmpc_alloc_for_warp(AllocGlobal, Bytes, WarpBytes);
+  return SafeMalloc(Bytes, "AllocGlobalFallback");
 }
 
-EXTERN void __kmpc_free_shared(void *Ptr, size_t /* Bytes */) {
-  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
-  unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader =
-      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
-  __kmpc_syncwarp(CurActive);
-  if (IsWarpLeader) {
-    if (Ptr >= &MainSharedStack.Data[0] &&
-        Ptr < &MainSharedStack.Data[MainSharedStack.MaxSize]) {
-      unsigned Bytes = MainSharedStack.Sizes[--MainSharedStack.SizeUsage[0]];
-      MainSharedStack.Usage[0] -= Bytes;
+EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
+  size_t AlignedBytes = Bytes + (Bytes % MinBytes);
+  int TID = __kmpc_get_hardware_thread_id_in_block();
+  if (__kmpc_is_generic_main_thread(TID)) {
+    if (Ptr >= &MainSharedStack.Data[0][0] &&
+        Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
+      MainSharedStack.Usage[0] -= AlignedBytes;
       return;
     }
-    if (Ptr >= &WorkerSharedStack.Data[0] &&
-        Ptr < &WorkerSharedStack.Data[WorkerSharedStack.MaxSize]) {
-      int WID = GetWarpId();
-      unsigned Bytes =
-          WorkerSharedStack.Sizes[--WorkerSharedStack.SizeUsage[WID]];
-      WorkerSharedStack.Usage[WID] -= Bytes;
+  } else if (TID < WorkerSharedStack.NumThreads) {
+    if (Ptr >= &WorkerSharedStack.Data[0][0] &&
+        Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
+      int TID = __kmpc_get_hardware_thread_id_in_block();
+      WorkerSharedStack.Usage[TID] -= AlignedBytes;
       return;
     }
-    SafeFree(Ptr, "FreeGlobalFallback");
   }
+  SafeFree(Ptr, "FreeGlobalFallback");
 }
 
 EXTERN void __kmpc_data_sharing_init_stack() {
-  for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i) {
-    MainSharedStack.SizeUsage[i] = 0;
+  for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
     MainSharedStack.Usage[i] = 0;
-  }
-  for (unsigned i = 0; i < WorkerSharedStack.NumWarps; ++i) {
-    WorkerSharedStack.SizeUsage[i] = 0;
+  for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
    WorkerSharedStack.Usage[i] = 0;
-  }
 }
 
 /// Allocate storage in shared memory to communicate arguments from the main
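For readers skimming the change: the patch drops the warp-leader allocation path and gives each thread its own fixed `BytesPerThread` slice of the shared stack, so `__kmpc_alloc_shared` and `__kmpc_free_shared` reduce to bumping and un-bumping a per-thread counter. Below is a minimal host-side C++ sketch of that bump scheme, not part of the patch; the struct, function names, and constant values (PerThreadStack, allocShared, freeShared, NumThreads = 4, and so on) are illustrative stand-ins for the device runtime's ThreadStackTy, entry points, and configuration.

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    // Illustrative stand-ins for the device-side configuration.
    static constexpr unsigned MinBytes = 8;
    static constexpr unsigned NumThreads = 4;
    static constexpr unsigned BytesPerThread = MinBytes * 4;

    // Each thread owns one row of Data and one Usage counter, so allocation is
    // a bump of that thread's counter and deallocation moves it back.
    struct PerThreadStack {
      unsigned char Data[NumThreads][BytesPerThread];
      unsigned Usage[NumThreads];
    };

    static PerThreadStack Stack = {};

    // Same size adjustment as the patch: Bytes + (Bytes % MinBytes).
    static size_t alignBytes(size_t Bytes) { return Bytes + (Bytes % MinBytes); }

    static void *allocShared(unsigned TID, size_t Bytes) {
      size_t AlignedBytes = alignBytes(Bytes);
      if (TID < NumThreads && Stack.Usage[TID] + AlignedBytes <= BytesPerThread) {
        void *Ptr = &Stack.Data[TID][Stack.Usage[TID]];
        Stack.Usage[TID] += AlignedBytes; // bump
        return Ptr;
      }
      return nullptr; // the real runtime falls back to SafeMalloc here
    }

    static void freeShared(unsigned TID, size_t Bytes) {
      // Only the counter moves back; frees must mirror allocations in LIFO order.
      Stack.Usage[TID] -= alignBytes(Bytes);
    }

    int main() {
      void *A = allocShared(/*TID=*/1, 12); // 12 is adjusted to 16 bytes
      void *B = allocShared(/*TID=*/1, 8);  // 8 stays 8 bytes
      assert(A && B && static_cast<unsigned char *>(B) ==
                           static_cast<unsigned char *>(A) + 16);
      freeShared(1, 8);  // release B
      freeShared(1, 12); // release A
      assert(Stack.Usage[1] == 0);
      std::printf("per-thread bump allocation ok\n");
      return 0;
    }

As in the patched runtime, a free only moves the counter back by the adjusted size, so frees must mirror allocations in reverse order, and a request that does not fit the per-thread slice falls through to the malloc path.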