@@ -31,7 +31,8 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
31
31
if (Event->getStream () == Stream) {
32
32
return UR_RESULT_SUCCESS;
33
33
} else {
34
- return UR_CHECK_ERROR (cuStreamWaitEvent (Stream, Event->get (), 0 ));
34
+ UR_CHECK_ERROR (cuStreamWaitEvent (Stream, Event->get (), 0 ));
35
+ return UR_RESULT_SUCCESS;
35
36
}
36
37
});
37
38
return Result;
@@ -193,8 +194,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
193
194
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
194
195
// This function makes one stream work on the previous work (or work
195
196
// represented by input events) and then all future work waits on that stream.
196
- ur_result_t Result;
197
-
198
197
try {
199
198
ScopedContext Active (hQueue->getContext ());
200
199
uint32_t StreamToken;
@@ -228,23 +227,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
228
227
Event->getComputeStreamToken ())) {
229
228
return UR_RESULT_SUCCESS;
230
229
} else {
231
- return UR_CHECK_ERROR (
230
+ UR_CHECK_ERROR (
232
231
cuStreamWaitEvent (CuStream, Event->get (), 0 ));
232
+ return UR_RESULT_SUCCESS;
233
233
}
234
234
});
235
235
}
236
236
237
- Result = UR_CHECK_ERROR (cuEventRecord (hQueue->BarrierEvent , CuStream));
237
+ UR_CHECK_ERROR (cuEventRecord (hQueue->BarrierEvent , CuStream));
238
238
for (unsigned int i = 0 ; i < hQueue->ComputeAppliedBarrier .size (); i++) {
239
239
hQueue->ComputeAppliedBarrier [i] = false ;
240
240
}
241
241
for (unsigned int i = 0 ; i < hQueue->TransferAppliedBarrier .size (); i++) {
242
242
hQueue->TransferAppliedBarrier [i] = false ;
243
243
}
244
244
}
245
- if (Result != UR_RESULT_SUCCESS) {
246
- return Result;
247
- }
248
245
249
246
if (phEvent) {
250
247
*phEvent = ur_event_handle_t_::makeNative (
@@ -430,7 +427,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
430
427
Device->getMaxChosenLocalMem ()));
431
428
}
432
429
433
- Result = UR_CHECK_ERROR (cuLaunchKernel (
430
+ UR_CHECK_ERROR (cuLaunchKernel (
434
431
CuFunc, BlocksPerGrid[0 ], BlocksPerGrid[1 ], BlocksPerGrid[2 ],
435
432
ThreadsPerBlock[0 ], ThreadsPerBlock[1 ], ThreadsPerBlock[2 ], LocalSize,
436
433
CuStream, const_cast <void **>(ArgIndices.data ()), nullptr ));
@@ -502,7 +499,9 @@ static ur_result_t commonEnqueueMemBufferCopyRect(
502
499
params.dstPitch = dst_row_pitch;
503
500
params.dstHeight = dst_slice_pitch / dst_row_pitch;
504
501
505
- return UR_CHECK_ERROR (cuMemcpy3DAsync (&params, cu_stream));
502
+ UR_CHECK_ERROR (cuMemcpy3DAsync (&params, cu_stream));
503
+
504
+ return UR_RESULT_SUCCESS;
506
505
}
507
506
508
507
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect (
@@ -540,7 +539,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
540
539
}
541
540
542
541
if (blockingRead) {
543
- Result = UR_CHECK_ERROR (cuStreamSynchronize (CuStream));
542
+ UR_CHECK_ERROR (cuStreamSynchronize (CuStream));
544
543
}
545
544
546
545
if (phEvent) {
@@ -587,7 +586,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
587
586
}
588
587
589
588
if (blockingWrite) {
590
- Result = UR_CHECK_ERROR (cuStreamSynchronize (cuStream));
589
+ UR_CHECK_ERROR (cuStreamSynchronize (cuStream));
591
590
}
592
591
593
592
if (phEvent) {
@@ -614,7 +613,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
614
613
615
614
try {
616
615
ScopedContext Active (hQueue->getContext ());
617
- ur_result_t Result;
616
+ ur_result_t Result = UR_RESULT_SUCCESS ;
618
617
619
618
auto Stream = hQueue->getNextTransferStream ();
620
619
Result =
@@ -630,7 +629,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
630
629
auto Src = hBufferSrc->Mem .BufferMem .get () + srcOffset;
631
630
auto Dst = hBufferDst->Mem .BufferMem .get () + dstOffset;
632
631
633
- Result = UR_CHECK_ERROR (cuMemcpyDtoDAsync (Dst, Src, size, Stream));
632
+ UR_CHECK_ERROR (cuMemcpyDtoDAsync (Dst, Src, size, Stream));
634
633
635
634
if (phEvent) {
636
635
UR_CHECK_ERROR (RetImplEvent->record ());
@@ -705,10 +704,7 @@ ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize,
705
704
706
705
// Get 4-byte chunk of the pattern and call cuMemsetD32Async
707
706
auto Value = *(static_cast <const uint32_t *>(pPattern));
708
- auto Result = UR_CHECK_ERROR (cuMemsetD32Async (Ptr, Value, Count32, Stream));
709
- if (Result != UR_RESULT_SUCCESS) {
710
- return Result;
711
- }
707
+ UR_CHECK_ERROR (cuMemsetD32Async (Ptr, Value, Count32, Stream));
712
708
for (auto step = 4u ; step < NumberOfSteps; ++step) {
713
709
// take 1 byte of the pattern
714
710
Value = *(static_cast <const uint8_t *>(pPattern) + step);
@@ -737,8 +733,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
737
733
ScopedContext Active (hQueue->getContext ());
738
734
739
735
auto Stream = hQueue->getNextTransferStream ();
740
- ur_result_t Result;
741
- Result =
736
+ ur_result_t Result =
742
737
enqueueEventsWait (hQueue, Stream, numEventsInWaitList, phEventWaitList);
743
738
744
739
if (phEvent) {
@@ -755,17 +750,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
755
750
switch (patternSize) {
756
751
case 1 : {
757
752
auto Value = *static_cast <const uint8_t *>(pPattern);
758
- Result = UR_CHECK_ERROR (cuMemsetD8Async (DstDevice, Value, N, Stream));
753
+ UR_CHECK_ERROR (cuMemsetD8Async (DstDevice, Value, N, Stream));
759
754
break ;
760
755
}
761
756
case 2 : {
762
757
auto Value = *static_cast <const uint16_t *>(pPattern);
763
- Result = UR_CHECK_ERROR (cuMemsetD16Async (DstDevice, Value, N, Stream));
758
+ UR_CHECK_ERROR (cuMemsetD16Async (DstDevice, Value, N, Stream));
764
759
break ;
765
760
}
766
761
case 4 : {
767
762
auto Value = *static_cast <const uint32_t *>(pPattern);
768
- Result = UR_CHECK_ERROR (cuMemsetD32Async (DstDevice, Value, N, Stream));
763
+ UR_CHECK_ERROR (cuMemsetD32Async (DstDevice, Value, N, Stream));
769
764
break ;
770
765
}
771
766
default : {
@@ -843,7 +838,8 @@ static ur_result_t commonEnqueueMemImageNDCopy(
843
838
}
844
839
CpyDesc.WidthInBytes = Region.width ;
845
840
CpyDesc.Height = Region.height ;
846
- return UR_CHECK_ERROR (cuMemcpy2DAsync (&CpyDesc, CuStream));
841
+ UR_CHECK_ERROR (cuMemcpy2DAsync (&CpyDesc, CuStream));
842
+ return UR_RESULT_SUCCESS;
847
843
}
848
844
if (ImgType == UR_MEM_TYPE_IMAGE3D) {
849
845
CUDA_MEMCPY3D CpyDesc;
@@ -869,7 +865,8 @@ static ur_result_t commonEnqueueMemImageNDCopy(
869
865
CpyDesc.WidthInBytes = Region.width ;
870
866
CpyDesc.Height = Region.height ;
871
867
CpyDesc.Depth = Region.depth ;
872
- return UR_CHECK_ERROR (cuMemcpy3DAsync (&CpyDesc, CuStream));
868
+ UR_CHECK_ERROR (cuMemcpy3DAsync (&CpyDesc, CuStream));
869
+ return UR_RESULT_SUCCESS;
873
870
}
874
871
return UR_RESULT_ERROR_INVALID_VALUE;
875
872
}
@@ -896,7 +893,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
896
893
CUarray Array = hImage->Mem .SurfaceMem .getArray ();
897
894
898
895
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
899
- Result = UR_CHECK_ERROR (cuArrayGetDescriptor (&ArrayDesc, Array));
896
+ UR_CHECK_ERROR (cuArrayGetDescriptor (&ArrayDesc, Array));
900
897
901
898
int ElementByteSize = imageElementByteSize (ArrayDesc);
902
899
@@ -913,7 +910,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
913
910
UR_CHECK_ERROR (RetImplEvent->start ());
914
911
}
915
912
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
916
- Result = UR_CHECK_ERROR (
913
+ UR_CHECK_ERROR (
917
914
cuMemcpyAtoHAsync (pDst, Array, ByteOffsetX, BytesToCopy, CuStream));
918
915
} else {
919
916
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height ,
@@ -923,7 +920,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
923
920
Result = commonEnqueueMemImageNDCopy (
924
921
CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY,
925
922
SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t {});
926
-
927
923
if (Result != UR_RESULT_SUCCESS) {
928
924
return Result;
929
925
}
@@ -935,7 +931,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
935
931
}
936
932
937
933
if (blockingRead) {
938
- Result = UR_CHECK_ERROR (cuStreamSynchronize (CuStream));
934
+ UR_CHECK_ERROR (cuStreamSynchronize (CuStream));
939
935
}
940
936
} catch (ur_result_t Err) {
941
937
return Err;
@@ -969,7 +965,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
969
965
CUarray Array = hImage->Mem .SurfaceMem .getArray ();
970
966
971
967
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
972
- Result = UR_CHECK_ERROR (cuArrayGetDescriptor (&ArrayDesc, Array));
968
+ UR_CHECK_ERROR (cuArrayGetDescriptor (&ArrayDesc, Array));
973
969
974
970
int ElementByteSize = imageElementByteSize (ArrayDesc);
975
971
@@ -986,7 +982,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
986
982
987
983
ur_mem_type_t ImgType = hImage->Mem .SurfaceMem .getImageType ();
988
984
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
989
- Result = UR_CHECK_ERROR (
985
+ UR_CHECK_ERROR (
990
986
cuMemcpyHtoAAsync (Array, ByteOffsetX, pSrc, BytesToCopy, CuStream));
991
987
} else {
992
988
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height ,
@@ -1041,9 +1037,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
1041
1037
CUarray DstArray = hImageDst->Mem .SurfaceMem .getArray ();
1042
1038
1043
1039
CUDA_ARRAY_DESCRIPTOR SrcArrayDesc;
1044
- Result = UR_CHECK_ERROR (cuArrayGetDescriptor (&SrcArrayDesc, SrcArray));
1040
+ UR_CHECK_ERROR (cuArrayGetDescriptor (&SrcArrayDesc, SrcArray));
1045
1041
CUDA_ARRAY_DESCRIPTOR DstArrayDesc;
1046
- Result = UR_CHECK_ERROR (cuArrayGetDescriptor (&DstArrayDesc, DstArray));
1042
+ UR_CHECK_ERROR (cuArrayGetDescriptor (&DstArrayDesc, DstArray));
1047
1043
1048
1044
UR_ASSERT (SrcArrayDesc.Format == DstArrayDesc.Format ,
1049
1045
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
@@ -1069,8 +1065,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
1069
1065
1070
1066
ur_mem_type_t ImgType = hImageSrc->Mem .SurfaceMem .getImageType ();
1071
1067
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
1072
- Result = UR_CHECK_ERROR (cuMemcpyAtoA (DstArray, DstByteOffsetX, SrcArray,
1073
- SrcByteOffsetX, BytesToCopy));
1068
+ UR_CHECK_ERROR (cuMemcpyAtoA (DstArray, DstByteOffsetX, SrcArray,
1069
+ SrcByteOffsetX, BytesToCopy));
1074
1070
} else {
1075
1071
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height ,
1076
1072
region.depth };
@@ -1080,7 +1076,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
1080
1076
Result = commonEnqueueMemImageNDCopy (
1081
1077
CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY,
1082
1078
SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset);
1083
-
1084
1079
if (Result != UR_RESULT_SUCCESS) {
1085
1080
return Result;
1086
1081
}
@@ -1282,13 +1277,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
1282
1277
UR_COMMAND_USM_MEMCPY, hQueue, CuStream));
1283
1278
UR_CHECK_ERROR (EventPtr->start ());
1284
1279
}
1285
- Result = UR_CHECK_ERROR (
1280
+ UR_CHECK_ERROR (
1286
1281
cuMemcpyAsync ((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));
1287
1282
if (phEvent) {
1288
1283
UR_CHECK_ERROR (EventPtr->record ());
1289
1284
}
1290
1285
if (blocking) {
1291
- Result = UR_CHECK_ERROR (cuStreamSynchronize (CuStream));
1286
+ UR_CHECK_ERROR (cuStreamSynchronize (CuStream));
1292
1287
}
1293
1288
if (phEvent) {
1294
1289
*phEvent = EventPtr.release ();
@@ -1347,7 +1342,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
1347
1342
UR_COMMAND_MEM_BUFFER_COPY, hQueue, CuStream));
1348
1343
UR_CHECK_ERROR (EventPtr->start ());
1349
1344
}
1350
- Result = UR_CHECK_ERROR (
1345
+ UR_CHECK_ERROR (
1351
1346
cuMemPrefetchAsync ((CUdeviceptr)pMem, size, Device->get (), CuStream));
1352
1347
if (phEvent) {
1353
1348
UR_CHECK_ERROR (EventPtr->record ());
@@ -1485,14 +1480,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
1485
1480
CpyDesc.WidthInBytes = width;
1486
1481
CpyDesc.Height = height;
1487
1482
1488
- result = UR_CHECK_ERROR (cuMemcpy2DAsync (&CpyDesc, cuStream));
1483
+ UR_CHECK_ERROR (cuMemcpy2DAsync (&CpyDesc, cuStream));
1489
1484
1490
1485
if (phEvent) {
1491
1486
UR_CHECK_ERROR (RetImplEvent->record ());
1492
1487
*phEvent = RetImplEvent.release ();
1493
1488
}
1494
1489
if (blocking) {
1495
- result = UR_CHECK_ERROR (cuStreamSynchronize (cuStream));
1490
+ UR_CHECK_ERROR (cuStreamSynchronize (cuStream));
1496
1491
}
1497
1492
} catch (ur_result_t err) {
1498
1493
result = err;
@@ -1608,9 +1603,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
1608
1603
try {
1609
1604
CUdeviceptr DeviceGlobal = 0 ;
1610
1605
size_t DeviceGlobalSize = 0 ;
1611
- Result = UR_CHECK_ERROR (cuModuleGetGlobal (&DeviceGlobal, &DeviceGlobalSize,
1612
- hProgram->get (),
1613
- DeviceGlobalName.c_str ()));
1606
+ UR_CHECK_ERROR (cuModuleGetGlobal (&DeviceGlobal, &DeviceGlobalSize,
1607
+ hProgram->get (),
1608
+ DeviceGlobalName.c_str ()));
1614
1609
1615
1610
if (offset + count > DeviceGlobalSize)
1616
1611
return UR_RESULT_ERROR_INVALID_VALUE;
@@ -1640,9 +1635,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
1640
1635
try {
1641
1636
CUdeviceptr DeviceGlobal = 0 ;
1642
1637
size_t DeviceGlobalSize = 0 ;
1643
- Result = UR_CHECK_ERROR (cuModuleGetGlobal (&DeviceGlobal, &DeviceGlobalSize,
1644
- hProgram->get (),
1645
- DeviceGlobalName.c_str ()));
1638
+ UR_CHECK_ERROR (cuModuleGetGlobal (&DeviceGlobal, &DeviceGlobalSize,
1639
+ hProgram->get (),
1640
+ DeviceGlobalName.c_str ()));
1646
1641
1647
1642
if (offset + count > DeviceGlobalSize)
1648
1643
return UR_RESULT_ERROR_INVALID_VALUE;
0 commit comments