@@ -256,9 +256,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
256
256
return UR_RESULT_SUCCESS;
257
257
}
258
258
259
- // OpenCL only supports pattern sizes as large as the largest CL type
260
- // (double16/long16 - 128 bytes), anything larger we need to do on the host
261
- // side and copy it into the target allocation.
259
+ // OpenCL only supports pattern sizes that are powers of 2 and no larger
260
+ // than the largest CL type (double16/long16 - 128 bytes). For anything
261
+ // larger, or not a power of 2, we need to fill on the host side and copy
262
+ // the result into the target allocation.
262
263
clHostMemAllocINTEL_fn HostMemAlloc = nullptr ;
263
264
UR_RETURN_ON_FAILURE (cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
264
265
CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache ,
@@ -275,14 +276,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
275
276
cl_ext::MemBlockingFreeName, &USMFree));
276
277
277
278
cl_int ClErr = CL_SUCCESS;
278
- auto HostBuffer = static_cast <uint64_t *>(
279
+ auto HostBuffer = static_cast <unsigned char *>(
279
280
HostMemAlloc (CLContext, nullptr , size, 0 , &ClErr));
280
281
CL_RETURN_ON_FAILURE (ClErr);
281
282
282
- auto NumValues = size / sizeof ( uint64_t ) ;
283
- auto NumChunks = patternSize / sizeof ( uint64_t );
284
- for ( size_t i = 0 ; i < NumValues; i++) {
285
- HostBuffer[i] = static_cast < const uint64_t *>( pPattern)[i % NumChunks] ;
283
+ auto NumChunks = size / patternSize ;
284
+ for ( size_t i = 0 ; i < NumChunks; i++) {
285
+ auto Dest = HostBuffer + i * patternSize;
286
+ memcpy (Dest, pPattern, patternSize) ;
286
287
}
287
288
288
289
cl_event CopyEvent = nullptr ;
0 commit comments