Skip to content

Commit 2ab99aa

Browse files
authored
Partially reverts the update to the guessLocalWorkSize function. (#10055)
* Reverts the update to the calculation of threads per block for the 0th dimension when global ranges of prime size are involved, because that update could cause out-of-range accesses.
1 parent d812d1e commit 2ab99aa

File tree

1 file changed

+1
-25
lines changed
  • sycl/plugins/unified_runtime/ur/adapters/cuda

1 file changed

+1
-25
lines changed

sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -143,21 +143,6 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
143143
GlobalSizeNormalized[i] = GlobalWorkSize[i];
144144
}
145145

146-
static auto IsPrime = [](size_t Number) -> bool {
147-
auto LastNumToCheck = ceil(sqrt(Number));
148-
if (Number < 2)
149-
return false;
150-
if (Number == 2)
151-
return true;
152-
if (Number % 2 == 0)
153-
return false;
154-
for (int i = 3; i <= LastNumToCheck; i += 2) {
155-
if (Number % i == 0)
156-
return false;
157-
}
158-
return true;
159-
};
160-
161146
cuDeviceGetAttribute(&MaxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
162147
Device->get());
163148
cuDeviceGetAttribute(&MaxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
@@ -177,15 +162,6 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
177162
std::min(MaxThreadsPerBlock[0],
178163
std::min(GlobalSizeNormalized[0], size_t(MaxBlockDim[0])));
179164

180-
// When GlobalSizeNormalized[0] is prime threadPerBlock[0] will later
181-
// computed as 1, which is not efficient configuration. In such case we use
182-
// GlobalSizeNormalized[0] + 1 to compute threadPerBlock[0].
183-
int Adjusted0DimGlobalWorkSize =
184-
(IsPrime(GlobalSizeNormalized[0]) &&
185-
(ThreadsPerBlock[0] != GlobalSizeNormalized[0]))
186-
? GlobalSizeNormalized[0] + 1
187-
: GlobalSizeNormalized[0];
188-
189165
static auto IsPowerOf2 = [](size_t Value) -> bool {
190166
return Value && !(Value & (Value - 1));
191167
};
@@ -194,7 +170,7 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
194170
// work group size to produce uniform work groups.
195171
// Additionally, for best compute utilisation, the local size has
196172
// to be a power of two.
197-
while (0u != (Adjusted0DimGlobalWorkSize % ThreadsPerBlock[0]) ||
173+
while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
198174
!IsPowerOf2(ThreadsPerBlock[0])) {
199175
--ThreadsPerBlock[0];
200176
}

0 commit comments

Comments
 (0)