@@ -769,29 +769,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
769
769
return Result;
770
770
}
771
771
772
- // HIP has no memset functions that allow setting values more than 4 bytes. UR
773
- // API lets you pass an arbitrary "pattern" to the buffer fill, which can be
774
- // more than 4 bytes. We must break up the pattern into 1 byte values, and set
775
- // the buffer using multiple strided calls. The first 4 patterns are set using
776
- // hipMemsetD32Async then all subsequent 1 byte patterns are set using
777
- // hipMemset2DAsync which is called for each pattern.
778
- ur_result_t commonMemSetLargePattern (hipStream_t Stream, uint32_t PatternSize,
779
- size_t Size, const void *pPattern,
780
- hipDeviceptr_t Ptr) {
781
- // Calculate the number of patterns, stride, number of times the pattern
782
- // needs to be applied, and the number of times the first 32 bit pattern
783
- // needs to be applied.
772
+ static inline void memsetRemainPattern (hipStream_t Stream, uint32_t PatternSize,
773
+ size_t Size, const void *pPattern,
774
+ hipDeviceptr_t Ptr) {
775
+
776
+ // Calculate the number of patterns, stride and the number of times the
777
+ // pattern needs to be applied.
784
778
auto NumberOfSteps = PatternSize / sizeof (uint8_t );
785
779
auto Pitch = NumberOfSteps * sizeof (uint8_t );
786
780
auto Height = Size / NumberOfSteps;
787
- auto Count32 = Size / sizeof (uint32_t );
788
781
789
- // Get 4-byte chunk of the pattern and call hipMemsetD32Async
790
- auto Value = *(static_cast <const uint32_t *>(pPattern));
791
- UR_CHECK_ERROR (hipMemsetD32Async (Ptr, Value, Count32, Stream));
792
782
for (auto step = 4u ; step < NumberOfSteps; ++step) {
793
783
// take 1 byte of the pattern
794
- Value = *(static_cast <const uint8_t *>(pPattern) + step);
784
+ auto Value = *(static_cast <const uint8_t *>(pPattern) + step);
795
785
796
786
// offset the pointer to the part of the buffer we want to write to
797
787
auto OffsetPtr = reinterpret_cast <void *>(reinterpret_cast <uint8_t *>(Ptr) +
@@ -801,6 +791,55 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
801
791
UR_CHECK_ERROR (hipMemset2DAsync (OffsetPtr, Pitch, Value, sizeof (uint8_t ),
802
792
Height, Stream));
803
793
}
794
+ }
795
+
796
+ // HIP has no memset functions that allow setting values more than 4 bytes. UR
797
+ // API lets you pass an arbitrary "pattern" to the buffer fill, which can be
798
+ // more than 4 bytes. We must break up the pattern into 1 byte values, and set
799
+ // the buffer using multiple strided calls. The first 4 patterns are set using
800
+ // hipMemsetD32Async then all subsequent 1 byte patterns are set using
801
+ // hipMemset2DAsync which is called for each pattern.
802
+ ur_result_t commonMemSetLargePattern (hipStream_t Stream, uint32_t PatternSize,
803
+ size_t Size, const void *pPattern,
804
+ hipDeviceptr_t Ptr) {
805
+
806
+ // Get 4-byte chunk of the pattern and call hipMemsetD32Async
807
+ auto Count32 = Size / sizeof (uint32_t );
808
+ auto Value = *(static_cast <const uint32_t *>(pPattern));
809
+ UR_CHECK_ERROR (hipMemsetD32Async (Ptr, Value, Count32, Stream));
810
+
811
+ // There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D
812
+ // to behave incorrectly when acting on host pinned memory.
813
+ // In such a case, the memset operation is partially emulated with memcpy.
814
+ #if HIP_VERSION_MAJOR < 6
815
+ hipPointerAttribute_t ptrAttribs{};
816
+ UR_CHECK_ERROR (hipPointerGetAttributes (&ptrAttribs, (const void *)Ptr));
817
+
818
+ // The hostPointer attribute is non-null also for shared memory allocations.
819
+ // To make sure that this workaround only executes for host pinned memory, we
820
+ // need to check that isManaged attribute is false.
821
+ if (ptrAttribs.hostPointer && !ptrAttribs.isManaged ) {
822
+ const auto NumOfCopySteps = Size / PatternSize;
823
+ const auto Offset = sizeof (uint32_t );
824
+ const auto LeftPatternSize = PatternSize - Offset;
825
+ const auto OffsetPatternPtr = reinterpret_cast <const void *>(
826
+ reinterpret_cast <const uint8_t *>(pPattern) + Offset);
827
+
828
+ // Loop through the memory area to memset, advancing each time by the
829
+ // PatternSize and memcpy the left over pattern bits.
830
+ for (uint32_t i = 0 ; i < NumOfCopySteps; ++i) {
831
+ auto OffsetDstPtr = reinterpret_cast <void *>(
832
+ reinterpret_cast <uint8_t *>(Ptr) + Offset + i * PatternSize);
833
+ UR_CHECK_ERROR (hipMemcpyAsync (OffsetDstPtr, OffsetPatternPtr,
834
+ LeftPatternSize, hipMemcpyHostToHost,
835
+ Stream));
836
+ }
837
+ } else {
838
+ memsetRemainPattern (Stream, PatternSize, Size, pPattern, Ptr);
839
+ }
840
+ #else
841
+ memsetRemainPattern (Stream, PatternSize, Size, pPattern, Ptr);
842
+ #endif
804
843
return UR_RESULT_SUCCESS;
805
844
}
806
845
0 commit comments