@@ -143,21 +143,6 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
143
143
GlobalSizeNormalized[i] = GlobalWorkSize[i];
144
144
}
145
145
146
- static auto IsPrime = [](size_t Number) -> bool {
147
- auto LastNumToCheck = ceil (sqrt (Number));
148
- if (Number < 2 )
149
- return false ;
150
- if (Number == 2 )
151
- return true ;
152
- if (Number % 2 == 0 )
153
- return false ;
154
- for (int i = 3 ; i <= LastNumToCheck; i += 2 ) {
155
- if (Number % i == 0 )
156
- return false ;
157
- }
158
- return true ;
159
- };
160
-
161
146
cuDeviceGetAttribute (&MaxBlockDim[1 ], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
162
147
Device->get ());
163
148
cuDeviceGetAttribute (&MaxBlockDim[2 ], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
@@ -177,15 +162,6 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
177
162
std::min (MaxThreadsPerBlock[0 ],
178
163
std::min (GlobalSizeNormalized[0 ], size_t (MaxBlockDim[0 ])));
179
164
180
- // When GlobalSizeNormalized[0] is prime, ThreadsPerBlock[0] will later be
181
- // computed as 1, which is not an efficient configuration. In such a case we use
182
- // GlobalSizeNormalized[0] + 1 to compute ThreadsPerBlock[0].
183
- int Adjusted0DimGlobalWorkSize =
184
- (IsPrime (GlobalSizeNormalized[0 ]) &&
185
- (ThreadsPerBlock[0 ] != GlobalSizeNormalized[0 ]))
186
- ? GlobalSizeNormalized[0 ] + 1
187
- : GlobalSizeNormalized[0 ];
188
-
189
165
static auto IsPowerOf2 = [](size_t Value) -> bool {
190
166
return Value && !(Value & (Value - 1 ));
191
167
};
@@ -194,7 +170,7 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
194
170
// work group size to produce uniform work groups.
195
171
// Additionally, for best compute utilisation, the local size has
196
172
// to be a power of two.
197
- while (0u != (Adjusted0DimGlobalWorkSize % ThreadsPerBlock[0 ]) ||
173
+ while (0u != (GlobalSizeNormalized[ 0 ] % ThreadsPerBlock[0 ]) ||
198
174
!IsPowerOf2 (ThreadsPerBlock[0 ])) {
199
175
--ThreadsPerBlock[0 ];
200
176
}
0 commit comments