@@ -239,6 +239,36 @@ void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext,
239
239
return NewMem;
240
240
}
241
241
242
+ struct TermPositions {
243
+ int XTerm;
244
+ int YTerm;
245
+ int ZTerm;
246
+ };
247
+ void prepTermPositions (TermPositions &pos, int Dimensions,
248
+ detail::SYCLMemObjI::MemObjType Type) {
249
+ // For buffers, the offsets/ranges coming from accessor are always
250
+ // id<3>/range<3> But their organization varies by dimension:
251
+ // 1 ==> {width, 1, 1}
252
+ // 2 ==> {height, width, 1}
253
+ // 3 ==> {depth, height, width}
254
+ // Some callers schedule 0 as DimDst/DimSrc.
255
+
256
+ if (Type == detail::SYCLMemObjI::MemObjType::BUFFER) {
257
+ if (Dimensions == 3 ) {
258
+ pos.XTerm = 2 , pos.YTerm = 1 , pos.ZTerm = 0 ;
259
+ } else if (Dimensions == 2 ) {
260
+ pos.XTerm = 1 , pos.YTerm = 0 , pos.ZTerm = 2 ;
261
+ } else { // Dimension is 1 or 0
262
+ pos.XTerm = 0 , pos.YTerm = 1 , pos.ZTerm = 2 ;
263
+ }
264
+ } else { // While range<>/id<> use by images is different than buffers, it's
265
+ // consistent with their accessors.
266
+ pos.XTerm = 0 ;
267
+ pos.YTerm = 1 ;
268
+ pos.ZTerm = 2 ;
269
+ }
270
+ }
271
+
242
272
void copyH2D (SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr,
243
273
unsigned int DimSrc, sycl::range<3 > SrcSize,
244
274
sycl::range<3 > SrcAccessRange, sycl::id<3 > SrcOffset,
@@ -250,34 +280,40 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr,
250
280
assert (SYCLMemObj && " The SYCLMemObj is nullptr" );
251
281
252
282
const RT::PiQueue Queue = TgtQueue->getHandleRef ();
253
- // Adjust first dimension of copy range and offset as OpenCL expects size in
254
- // bytes.
255
- DstSize[0 ] *= DstElemSize;
256
283
const detail::plugin &Plugin = TgtQueue->getPlugin ();
257
- if (SYCLMemObj->getType () == detail::SYCLMemObjI::MemObjType::BUFFER) {
258
- DstOffset[0 ] *= DstElemSize;
259
- SrcOffset[0 ] *= SrcElemSize;
260
- SrcAccessRange[0 ] *= SrcElemSize;
261
- DstAccessRange[0 ] *= DstElemSize;
262
- SrcSize[0 ] *= SrcElemSize;
263
284
285
+ detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType ();
286
+ TermPositions SrcPos, DstPos;
287
+ prepTermPositions (SrcPos, DimSrc, MemType);
288
+ prepTermPositions (DstPos, DimDst, MemType);
289
+
290
+ size_t DstXOffBytes = DstOffset[DstPos.XTerm ] * DstElemSize;
291
+ size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm ] * SrcElemSize;
292
+ size_t DstAccessRangeWidthBytes = DstAccessRange[DstPos.XTerm ] * DstElemSize;
293
+ size_t DstSzWidthBytes = DstSize[DstPos.XTerm ] * DstElemSize;
294
+ size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm ] * SrcElemSize;
295
+
296
+ if (MemType == detail::SYCLMemObjI::MemObjType::BUFFER) {
264
297
if (1 == DimDst && 1 == DimSrc) {
265
298
Plugin.call <PiApiKind::piEnqueueMemBufferWrite>(
266
299
Queue, DstMem,
267
- /* blocking_write=*/ CL_FALSE, DstOffset[ 0 ], DstAccessRange[ 0 ] ,
268
- SrcMem + SrcOffset[ 0 ] , DepEvents.size (), DepEvents.data (), &OutEvent);
300
+ /* blocking_write=*/ CL_FALSE, DstXOffBytes, DstAccessRangeWidthBytes ,
301
+ SrcMem + SrcXOffBytes , DepEvents.size (), DepEvents.data (), &OutEvent);
269
302
} else {
270
- size_t BufferRowPitch = (1 == DimDst) ? 0 : DstSize[0 ];
271
- size_t BufferSlicePitch = (3 == DimDst) ? DstSize[0 ] * DstSize[1 ] : 0 ;
272
- size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSize[0 ];
273
- size_t HostSlicePitch = (3 == DimSrc) ? SrcSize[0 ] * SrcSize[1 ] : 0 ;
274
-
275
- pi_buff_rect_offset_struct BufferOffset{DstOffset[0 ], DstOffset[1 ],
276
- DstOffset[2 ]};
277
- pi_buff_rect_offset_struct HostOffset{SrcOffset[0 ], SrcOffset[1 ],
278
- SrcOffset[2 ]};
279
- pi_buff_rect_region_struct RectRegion{
280
- DstAccessRange[0 ], DstAccessRange[1 ], DstAccessRange[2 ]};
303
+ size_t BufferRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
304
+ size_t BufferSlicePitch =
305
+ (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm ] : 0 ;
306
+ size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
307
+ size_t HostSlicePitch =
308
+ (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm ] : 0 ;
309
+
310
+ pi_buff_rect_offset_struct BufferOffset{
311
+ DstXOffBytes, DstOffset[DstPos.YTerm ], DstOffset[DstPos.ZTerm ]};
312
+ pi_buff_rect_offset_struct HostOffset{
313
+ SrcXOffBytes, SrcOffset[SrcPos.YTerm ], SrcOffset[SrcPos.ZTerm ]};
314
+ pi_buff_rect_region_struct RectRegion{DstAccessRangeWidthBytes,
315
+ DstAccessRange[DstPos.YTerm ],
316
+ DstAccessRange[DstPos.ZTerm ]};
281
317
282
318
Plugin.call <PiApiKind::piEnqueueMemBufferWriteRect>(
283
319
Queue, DstMem,
@@ -286,12 +322,16 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr,
286
322
SrcMem, DepEvents.size (), DepEvents.data (), &OutEvent);
287
323
}
288
324
} else {
289
- size_t InputRowPitch = (1 == DimDst) ? 0 : DstSize[0 ];
290
- size_t InputSlicePitch = (3 == DimDst) ? DstSize[0 ] * DstSize[1 ] : 0 ;
325
+ size_t InputRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
326
+ size_t InputSlicePitch =
327
+ (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm ] : 0 ;
291
328
292
- pi_image_offset_struct Origin{DstOffset[0 ], DstOffset[1 ], DstOffset[2 ]};
293
- pi_image_region_struct Region{DstAccessRange[0 ], DstAccessRange[1 ],
294
- DstAccessRange[2 ]};
329
+ pi_image_offset_struct Origin{DstOffset[DstPos.XTerm ],
330
+ DstOffset[DstPos.YTerm ],
331
+ DstOffset[DstPos.ZTerm ]};
332
+ pi_image_region_struct Region{DstAccessRange[DstPos.XTerm ],
333
+ DstAccessRange[DstPos.YTerm ],
334
+ DstAccessRange[DstPos.ZTerm ]};
295
335
296
336
Plugin.call <PiApiKind::piEnqueueMemImageWrite>(
297
337
Queue, DstMem,
@@ -311,34 +351,46 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue,
311
351
assert (SYCLMemObj && " The SYCLMemObj is nullptr" );
312
352
313
353
const RT::PiQueue Queue = SrcQueue->getHandleRef ();
314
- // Adjust sizes of 1 dimensions as OpenCL expects size in bytes.
315
- SrcSize[0 ] *= SrcElemSize;
316
354
const detail::plugin &Plugin = SrcQueue->getPlugin ();
317
- if (SYCLMemObj->getType () == detail::SYCLMemObjI::MemObjType::BUFFER) {
318
- DstOffset[0 ] *= DstElemSize;
319
- SrcOffset[0 ] *= SrcElemSize;
320
- SrcAccessRange[0 ] *= SrcElemSize;
321
- DstAccessRange[0 ] *= DstElemSize;
322
- DstSize[0 ] *= DstElemSize;
323
355
356
+ detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType ();
357
+ TermPositions SrcPos, DstPos;
358
+ prepTermPositions (SrcPos, DimSrc, MemType);
359
+ prepTermPositions (DstPos, DimDst, MemType);
360
+
361
+ // For a given buffer, the various mem copy routines (copyD2H, copyH2D,
362
+ // copyD2D) will usually have the same values for AccessRange, Size,
363
+ // Dimension, Offset, etc. EXCEPT when the dtor for ~SYCLMemObjT is called.
364
+ // Essentially, it schedules a copyBack of chars thus in copyD2H the
365
+ // Dimension will then be 1 and DstAccessRange[0] and DstSize[0] will be
366
+ // sized to bytes with a DstElemSize of 1.
367
+ size_t DstXOffBytes = DstOffset[DstPos.XTerm ] * DstElemSize;
368
+ size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm ] * SrcElemSize;
369
+ size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm ] * SrcElemSize;
370
+ size_t DstSzWidthBytes = DstSize[DstPos.XTerm ] * DstElemSize;
371
+ size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm ] * SrcElemSize;
372
+
373
+ if (MemType == detail::SYCLMemObjI::MemObjType::BUFFER) {
324
374
if (1 == DimDst && 1 == DimSrc) {
325
375
Plugin.call <PiApiKind::piEnqueueMemBufferRead>(
326
376
Queue, SrcMem,
327
- /* blocking_read=*/ CL_FALSE, SrcOffset[ 0 ], SrcAccessRange[ 0 ] ,
328
- DstMem + DstOffset[ 0 ] , DepEvents.size (), DepEvents.data (), &OutEvent);
377
+ /* blocking_read=*/ CL_FALSE, SrcXOffBytes, SrcAccessRangeWidthBytes ,
378
+ DstMem + DstXOffBytes , DepEvents.size (), DepEvents.data (), &OutEvent);
329
379
} else {
330
- size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSize[0 ];
331
- size_t BufferSlicePitch = (3 == DimSrc) ? SrcSize[0 ] * SrcSize[1 ] : 0 ;
332
-
333
- size_t HostRowPitch = (1 == DimDst) ? 0 : DstSize[0 ];
334
- size_t HostSlicePitch = (3 == DimDst) ? DstSize[0 ] * DstSize[1 ] : 0 ;
335
-
336
- pi_buff_rect_offset_struct BufferOffset{SrcOffset[0 ], SrcOffset[1 ],
337
- SrcOffset[2 ]};
338
- pi_buff_rect_offset_struct HostOffset{DstOffset[0 ], DstOffset[1 ],
339
- DstOffset[2 ]};
340
- pi_buff_rect_region_struct RectRegion{
341
- SrcAccessRange[0 ], SrcAccessRange[1 ], SrcAccessRange[2 ]};
380
+ size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
381
+ size_t BufferSlicePitch =
382
+ (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm ] : 0 ;
383
+ size_t HostRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
384
+ size_t HostSlicePitch =
385
+ (3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm ] : 0 ;
386
+
387
+ pi_buff_rect_offset_struct BufferOffset{
388
+ SrcXOffBytes, SrcOffset[SrcPos.YTerm ], SrcOffset[SrcPos.ZTerm ]};
389
+ pi_buff_rect_offset_struct HostOffset{
390
+ DstXOffBytes, DstOffset[DstPos.YTerm ], DstOffset[DstPos.ZTerm ]};
391
+ pi_buff_rect_region_struct RectRegion{SrcAccessRangeWidthBytes,
392
+ SrcAccessRange[SrcPos.YTerm ],
393
+ SrcAccessRange[SrcPos.ZTerm ]};
342
394
343
395
Plugin.call <PiApiKind::piEnqueueMemBufferReadRect>(
344
396
Queue, SrcMem,
@@ -347,12 +399,16 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue,
347
399
DstMem, DepEvents.size (), DepEvents.data (), &OutEvent);
348
400
}
349
401
} else {
350
- size_t RowPitch = (1 == DimSrc) ? 0 : SrcSize[0 ];
351
- size_t SlicePitch = (3 == DimSrc) ? SrcSize[0 ] * SrcSize[1 ] : 0 ;
402
+ size_t RowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
403
+ size_t SlicePitch =
404
+ (3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm ] : 0 ;
352
405
353
- pi_image_offset_struct Offset{SrcOffset[0 ], SrcOffset[1 ], SrcOffset[2 ]};
354
- pi_image_region_struct Region{SrcAccessRange[0 ], SrcAccessRange[1 ],
355
- SrcAccessRange[2 ]};
406
+ pi_image_offset_struct Offset{SrcOffset[SrcPos.XTerm ],
407
+ SrcOffset[SrcPos.YTerm ],
408
+ SrcOffset[SrcPos.ZTerm ]};
409
+ pi_image_region_struct Region{SrcAccessRange[SrcPos.XTerm ],
410
+ SrcAccessRange[SrcPos.YTerm ],
411
+ SrcAccessRange[SrcPos.ZTerm ]};
356
412
357
413
Plugin.call <PiApiKind::piEnqueueMemImageRead>(
358
414
Queue, SrcMem, CL_FALSE, &Offset, &Region, RowPitch, SlicePitch, DstMem,
@@ -371,43 +427,60 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue,
371
427
372
428
const RT::PiQueue Queue = SrcQueue->getHandleRef ();
373
429
const detail::plugin &Plugin = SrcQueue->getPlugin ();
374
- if (SYCLMemObj->getType () == detail::SYCLMemObjI::MemObjType::BUFFER) {
375
- // Adjust sizes of 1 dimensions as OpenCL expects size in bytes.
376
- DstOffset[0 ] *= DstElemSize;
377
- SrcOffset[0 ] *= SrcElemSize;
378
- SrcAccessRange[0 ] *= SrcElemSize;
379
- SrcSize[0 ] *= SrcElemSize;
380
- DstSize[0 ] *= DstElemSize;
430
+
431
+ detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType ();
432
+ TermPositions SrcPos, DstPos;
433
+ prepTermPositions (SrcPos, DimSrc, MemType);
434
+ prepTermPositions (DstPos, DimDst, MemType);
435
+
436
+ size_t DstXOffBytes = DstOffset[DstPos.XTerm ] * DstElemSize;
437
+ size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm ] * SrcElemSize;
438
+ size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm ] * SrcElemSize;
439
+ size_t DstSzWidthBytes = DstSize[DstPos.XTerm ] * DstElemSize;
440
+ size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm ] * SrcElemSize;
441
+
442
+ if (MemType == detail::SYCLMemObjI::MemObjType::BUFFER) {
381
443
if (1 == DimDst && 1 == DimSrc) {
382
444
Plugin.call <PiApiKind::piEnqueueMemBufferCopy>(
383
- Queue, SrcMem, DstMem, SrcOffset[0 ], DstOffset[0 ], SrcAccessRange[0 ],
384
- DepEvents.size (), DepEvents.data (), &OutEvent);
445
+ Queue, SrcMem, DstMem, SrcXOffBytes, DstXOffBytes,
446
+ SrcAccessRangeWidthBytes, DepEvents.size (), DepEvents.data (),
447
+ &OutEvent);
385
448
} else {
386
- size_t SrcRowPitch = (1 == DimSrc) ? 0 : SrcSize[0 ];
387
- size_t SrcSlicePitch =
388
- (DimSrc > 1 ) ? SrcSize[0 ] * SrcSize[1 ] : SrcSize[0 ];
389
-
390
- size_t DstRowPitch = (1 == DimDst) ? 0 : DstSize[0 ];
391
- size_t DstSlicePitch =
392
- (DimDst > 1 ) ? DstSize[0 ] * DstSize[1 ] : DstSize[0 ];
393
-
394
- pi_buff_rect_offset_struct SrcOrigin{SrcOffset[0 ], SrcOffset[1 ],
395
- SrcOffset[2 ]};
396
- pi_buff_rect_offset_struct DstOrigin{DstOffset[0 ], DstOffset[1 ],
397
- DstOffset[2 ]};
398
- pi_buff_rect_region_struct Region{SrcAccessRange[0 ], SrcAccessRange[1 ],
399
- SrcAccessRange[2 ]};
449
+ // passing 0 for pitches not allowed. Because clEnqueueCopyBufferRect will
450
+ // calculate both src and dest pitch using region[0], which is not correct
451
+ // if src and dest are not the same size.
452
+ size_t SrcRowPitch = SrcSzWidthBytes;
453
+ size_t SrcSlicePitch = (DimSrc <= 1 )
454
+ ? SrcSzWidthBytes
455
+ : SrcSzWidthBytes * SrcSize[SrcPos.YTerm ];
456
+ size_t DstRowPitch = DstSzWidthBytes;
457
+ size_t DstSlicePitch = (DimDst <= 1 )
458
+ ? DstSzWidthBytes
459
+ : DstSzWidthBytes * DstSize[DstPos.YTerm ];
460
+
461
+ pi_buff_rect_offset_struct SrcOrigin{
462
+ SrcXOffBytes, SrcOffset[SrcPos.YTerm ], SrcOffset[SrcPos.ZTerm ]};
463
+ pi_buff_rect_offset_struct DstOrigin{
464
+ DstXOffBytes, DstOffset[DstPos.YTerm ], DstOffset[DstPos.ZTerm ]};
465
+ pi_buff_rect_region_struct Region{SrcAccessRangeWidthBytes,
466
+ SrcAccessRange[SrcPos.YTerm ],
467
+ SrcAccessRange[SrcPos.ZTerm ]};
400
468
401
469
Plugin.call <PiApiKind::piEnqueueMemBufferCopyRect>(
402
470
Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch,
403
471
SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size (),
404
472
DepEvents.data (), &OutEvent);
405
473
}
406
474
} else {
407
- pi_image_offset_struct SrcOrigin{SrcOffset[0 ], SrcOffset[1 ], SrcOffset[2 ]};
408
- pi_image_offset_struct DstOrigin{DstOffset[0 ], DstOffset[1 ], DstOffset[2 ]};
409
- pi_image_region_struct Region{SrcAccessRange[0 ], SrcAccessRange[1 ],
410
- SrcAccessRange[2 ]};
475
+ pi_image_offset_struct SrcOrigin{SrcOffset[SrcPos.XTerm ],
476
+ SrcOffset[SrcPos.YTerm ],
477
+ SrcOffset[SrcPos.ZTerm ]};
478
+ pi_image_offset_struct DstOrigin{DstOffset[DstPos.XTerm ],
479
+ DstOffset[DstPos.YTerm ],
480
+ DstOffset[DstPos.ZTerm ]};
481
+ pi_image_region_struct Region{SrcAccessRange[SrcPos.XTerm ],
482
+ SrcAccessRange[SrcPos.YTerm ],
483
+ SrcAccessRange[SrcPos.ZTerm ]};
411
484
412
485
Plugin.call <PiApiKind::piEnqueueMemImageCopy>(
413
486
Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region,
0 commit comments