Skip to content

Commit 1473ed8

Browse files
authored
Merge pull request #1395 from konradkusiak97/improvedQueueFill
[HIP] Implement workaround for hipMemset2D
2 parents 6ccaf38 + f277422 commit 1473ed8

File tree

1 file changed

+56
-17
lines changed

1 file changed

+56
-17
lines changed

source/adapters/hip/enqueue.cpp

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -769,29 +769,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
769769
return Result;
770770
}
771771

772-
// HIP has no memset functions that allow setting values more than 4 bytes. UR
773-
// API lets you pass an arbitrary "pattern" to the buffer fill, which can be
774-
// more than 4 bytes. We must break up the pattern into 1 byte values, and set
775-
// the buffer using multiple strided calls. The first 4 patterns are set using
776-
// hipMemsetD32Async then all subsequent 1 byte patterns are set using
777-
// hipMemset2DAsync which is called for each pattern.
778-
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
779-
size_t Size, const void *pPattern,
780-
hipDeviceptr_t Ptr) {
781-
// Calculate the number of patterns, stride, number of times the pattern
782-
// needs to be applied, and the number of times the first 32 bit pattern
783-
// needs to be applied.
772+
static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
773+
size_t Size, const void *pPattern,
774+
hipDeviceptr_t Ptr) {
775+
776+
// Calculate the number of patterns, stride and the number of times the
777+
// pattern needs to be applied.
784778
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
785779
auto Pitch = NumberOfSteps * sizeof(uint8_t);
786780
auto Height = Size / NumberOfSteps;
787-
auto Count32 = Size / sizeof(uint32_t);
788781

789-
// Get 4-byte chunk of the pattern and call hipMemsetD32Async
790-
auto Value = *(static_cast<const uint32_t *>(pPattern));
791-
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
792782
for (auto step = 4u; step < NumberOfSteps; ++step) {
793783
// take 1 byte of the pattern
794-
Value = *(static_cast<const uint8_t *>(pPattern) + step);
784+
auto Value = *(static_cast<const uint8_t *>(pPattern) + step);
795785

796786
// offset the pointer to the part of the buffer we want to write to
797787
auto OffsetPtr = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) +
@@ -801,6 +791,55 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
801791
UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
802792
Height, Stream));
803793
}
794+
}
795+
796+
// HIP has no memset functions that allow setting values more than 4 bytes. UR
797+
// API lets you pass an arbitrary "pattern" to the buffer fill, which can be
798+
// more than 4 bytes. We must break up the pattern into 1 byte values, and set
799+
// the buffer using multiple strided calls. The first 4 patterns are set using
800+
// hipMemsetD32Async then all subsequent 1 byte patterns are set using
801+
// hipMemset2DAsync which is called for each pattern.
802+
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
803+
size_t Size, const void *pPattern,
804+
hipDeviceptr_t Ptr) {
805+
806+
// Get 4-byte chunk of the pattern and call hipMemsetD32Async
807+
auto Count32 = Size / sizeof(uint32_t);
808+
auto Value = *(static_cast<const uint32_t *>(pPattern));
809+
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
810+
811+
// There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D
812+
// to behave incorrectly when acting on host pinned memory.
813+
// In such a case, the memset operation is partially emulated with memcpy.
814+
#if HIP_VERSION_MAJOR < 6
815+
hipPointerAttribute_t ptrAttribs{};
816+
UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr));
817+
818+
// The hostPointer attribute is non-null also for shared memory allocations.
819+
// To make sure that this workaround only executes for host pinned memory, we
820+
// need to check that isManaged attribute is false.
821+
if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) {
822+
const auto NumOfCopySteps = Size / PatternSize;
823+
const auto Offset = sizeof(uint32_t);
824+
const auto LeftPatternSize = PatternSize - Offset;
825+
const auto OffsetPatternPtr = reinterpret_cast<const void *>(
826+
reinterpret_cast<const uint8_t *>(pPattern) + Offset);
827+
828+
// Loop through the memory area to memset, advancing each time by the
829+
// PatternSize and memcpy the left over pattern bits.
830+
for (uint32_t i = 0; i < NumOfCopySteps; ++i) {
831+
auto OffsetDstPtr = reinterpret_cast<void *>(
832+
reinterpret_cast<uint8_t *>(Ptr) + Offset + i * PatternSize);
833+
UR_CHECK_ERROR(hipMemcpyAsync(OffsetDstPtr, OffsetPatternPtr,
834+
LeftPatternSize, hipMemcpyHostToHost,
835+
Stream));
836+
}
837+
} else {
838+
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
839+
}
840+
#else
841+
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
842+
#endif
804843
return UR_RESULT_SUCCESS;
805844
}
806845

0 commit comments

Comments
 (0)