
Commit 534071e

Merge pull request #1077 from fabiomestre/fabio/combines_fixes_cuda_hip
[CUDA][HIP] Combined CTS Fixes
2 parents 9fc8230 + d164792 commit 534071e

26 files changed, +528 −132 lines

.github/workflows/cmake.yml

Lines changed: 3 additions & 1 deletion
@@ -164,7 +164,7 @@ jobs:
       matrix:
         adapter: [
           {name: CUDA, triplet: nvptx64-nvidia-cuda},
-          {name: HIP, triplet: spir64}, # should be amdgcn-amdhsa, but build scripts for device binaries are currently broken for this target.
+          {name: HIP, triplet: amdgcn-amd-amdhsa},
           {name: L0, triplet: spir64}
         ]
         build_type: [Debug, Release]
@@ -198,6 +198,8 @@ jobs:
           -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
           -DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib
           -DUR_CONFORMANCE_TARGET_TRIPLES=${{matrix.adapter.triplet}}
+          ${{ matrix.adapter.name == 'HIP' && '-DAMD_ARCH=gfx1030' || '' }}
+          ${{ matrix.adapter.name == 'HIP' && '-DUR_HIP_PLATFORM=AMD' || '' }}

     - name: Build
       # This is so that device binaries can find the sycl runtime library

source/adapters/cuda/enqueue.cpp

Lines changed: 9 additions & 8 deletions
@@ -121,7 +121,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,

   for (auto &UnmappedFlag : UnmappedMemAdviceFlags) {
     if (URAdviceFlags & UnmappedFlag) {
-      throw UR_RESULT_ERROR_INVALID_ENUMERATION;
+      setErrorMessage("Memory advice ignored because the CUDA backend does not "
+                      "support some of the specified flags",
+                      UR_RESULT_SUCCESS);
+      return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
     }
   }

@@ -1355,15 +1358,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
     ur_queue_handle_t hQueue, const void *pMem, size_t size,
     ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
-  unsigned int PointerRangeSize = 0;
+  std::ignore = flags;
+
+  size_t PointerRangeSize = 0;
   UR_CHECK_ERROR(cuPointerGetAttribute(
       &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
   UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
   ur_device_handle_t Device = hQueue->getContext()->getDevice();

   // Certain cuda devices and Windows do not have support for some Unified
   // Memory features. cuMemPrefetchAsync requires concurrent memory access
-  // for managed memory. Therfore, ignore prefetch hint if concurrent managed
+  // for managed memory. Therefore, ignore prefetch hint if concurrent managed
   // memory access is not available.
   if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
     setErrorMessage("Prefetch hint ignored as device does not support "
@@ -1381,10 +1386,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
     return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
   }

-  // flags is currently unused so fail if set
-  if (flags != 0)
-    return UR_RESULT_ERROR_INVALID_VALUE;
-
   ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};

@@ -1415,7 +1416,7 @@ UR_APIEXPORT ur_result_t UR_APICALL
 UR_APIEXPORT ur_result_t UR_APICALL
 urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
                    ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) {
-  unsigned int PointerRangeSize = 0;
+  size_t PointerRangeSize = 0;
   UR_CHECK_ERROR(cuPointerGetAttribute(
       &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
   UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
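With this change, an unsupported memory-advice flag no longer fails the call outright: the adapter records a message and returns UR_RESULT_ERROR_ADAPTER_SPECIFIC, matching how the ignored prefetch hint is already reported. A minimal caller-side sketch of treating that code as a non-fatal hint (not part of the commit; the helper name, queue, and allocation are illustrative and assumed to be set up elsewhere):

#include <ur_api.h>

// Sketch only: hQueue and Ptr are assumed to be a valid queue and USM
// allocation created elsewhere; Size is the size of that allocation.
ur_result_t adviseReadMostly(ur_queue_handle_t hQueue, const void *Ptr,
                             size_t Size) {
  ur_result_t Res = urEnqueueUSMAdvise(hQueue, Ptr, Size,
                                       UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
                                       nullptr);
  if (Res == UR_RESULT_ERROR_ADAPTER_SPECIFIC) {
    // The backend ignored the advice (for example, an unsupported flag) and
    // recorded a message via setErrorMessage; treat it as a no-op hint.
    return UR_RESULT_SUCCESS;
  }
  return Res;
}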

source/adapters/cuda/program.cpp

Lines changed: 50 additions & 35 deletions
@@ -165,6 +165,42 @@ ur_result_t getKernelNames(ur_program_handle_t) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }

+/// Loads images from a list of PTX or CUBIN binaries.
+/// Note: No calls to CUDA driver API in this function, only store binaries
+/// for later.
+///
+/// Note: Only supports one device
+///
+ur_result_t createProgram(ur_context_handle_t hContext,
+                          ur_device_handle_t hDevice, size_t size,
+                          const uint8_t *pBinary,
+                          const ur_program_properties_t *pProperties,
+                          ur_program_handle_t *phProgram) {
+  UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
+            UR_RESULT_ERROR_INVALID_CONTEXT);
+  UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);
+
+  std::unique_ptr<ur_program_handle_t_> RetProgram{
+      new ur_program_handle_t_{hContext}};
+
+  if (pProperties) {
+    if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
+      return UR_RESULT_ERROR_INVALID_SIZE;
+    }
+    UR_CHECK_ERROR(
+        RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count));
+  }
+
+  auto pBinary_string = reinterpret_cast<const char *>(pBinary);
+
+  UR_CHECK_ERROR(RetProgram->setBinary(pBinary_string, size));
+  *phProgram = RetProgram.release();
+
+  return UR_RESULT_SUCCESS;
+}
+
 /// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
 /// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
 /// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
@@ -175,8 +211,8 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
   ur_device_handle_t hDevice = hContext->getDevice();
   auto pBinary = reinterpret_cast<const uint8_t *>(pIL);

-  return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
-                                   pProperties, phProgram);
+  return createProgram(hContext, hDevice, length, pBinary, pProperties,
+                       phProgram);
 }

 /// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -185,7 +221,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
 UR_APIEXPORT ur_result_t UR_APICALL
 urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
                  const char *pOptions) {
-  return urProgramBuild(hContext, hProgram, pOptions);
+  UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
+  hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+  return UR_RESULT_SUCCESS;
 }

 /// Loads the images from a UR program into a CUmodule that can be
@@ -202,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
     ScopedContext Active(hProgram->getContext());

     hProgram->buildProgram(pOptions);
+    hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;

   } catch (ur_result_t Err) {
     Result = Err;
@@ -241,6 +280,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
     RetProgram->setBinary(static_cast<const char *>(CuBin), CuBinSize);

     Result = RetProgram->buildProgram(pOptions);
+    RetProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
   } catch (...) {
     // Upon error attempt cleanup
     UR_CHECK_ERROR(cuLinkDestroy(State));
@@ -287,6 +327,9 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice,
     return ReturnValue(hProgram->BuildOptions.c_str());
   case UR_PROGRAM_BUILD_INFO_LOG:
     return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize);
+  case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: {
+    return ReturnValue(hProgram->BinaryType);
+  }
   default:
     break;
   }
@@ -384,44 +427,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
   return UR_RESULT_SUCCESS;
 }

-/// Loads images from a list of PTX or CUBIN binaries.
-/// Note: No calls to CUDA driver API in this function, only store binaries
-/// for later.
-///
-/// Note: Only supports one device
-///
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
     const uint8_t *pBinary, const ur_program_properties_t *pProperties,
     ur_program_handle_t *phProgram) {
-  UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
-            UR_RESULT_ERROR_INVALID_CONTEXT);
-  UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

-  ur_result_t Result = UR_RESULT_SUCCESS;
+  UR_CHECK_ERROR(
+      createProgram(hContext, hDevice, size, pBinary, pProperties, phProgram));
+  (*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;

-  std::unique_ptr<ur_program_handle_t_> RetProgram{
-      new ur_program_handle_t_{hContext}};
-
-  if (pProperties) {
-    if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
-      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
-    } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
-      return UR_RESULT_ERROR_INVALID_SIZE;
-    }
-    Result =
-        RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
-  }
-  UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
-
-  auto pBinary_string = reinterpret_cast<const char *>(pBinary);
-
-  Result = RetProgram->setBinary(pBinary_string, size);
-  UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
-
-  *phProgram = RetProgram.release();
-
-  return Result;
+  return UR_RESULT_SUCCESS;
 }

 // This entry point is only used for native specialization constants (SPIR-V),
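Because the program now records its binary type, urProgramGetBuildInfo can report UR_PROGRAM_BUILD_INFO_BINARY_TYPE. A minimal sketch of querying it (not part of the commit; the helper name is illustrative and the handles are assumed to come from the usual program setup):

#include <ur_api.h>

// Sketch only: hProgram and hDevice are assumed to be valid handles.
ur_program_binary_type_t queryBinaryType(ur_program_handle_t hProgram,
                                         ur_device_handle_t hDevice) {
  ur_program_binary_type_t Type = UR_PROGRAM_BINARY_TYPE_NONE;
  urProgramGetBuildInfo(hProgram, hDevice, UR_PROGRAM_BUILD_INFO_BINARY_TYPE,
                        sizeof(Type), &Type, nullptr);
  // Expected: COMPILED_OBJECT after urProgramCreateWithBinary or
  // urProgramCompile, EXECUTABLE after urProgramBuild or urProgramLink.
  return Type;
}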

source/adapters/cuda/program.hpp

Lines changed: 6 additions & 0 deletions
@@ -25,6 +25,12 @@ struct ur_program_handle_t_ {
   std::atomic_uint32_t RefCount;
   ur_context_handle_t Context;

+  /* The ur_program_binary_type_t property is defined individually for every
+   * device in a program. However, since the CUDA adapter only has 1 device per
+   * context / program, there is no need to keep track of its value for each
+   * device. */
+  ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;
+
   // Metadata
   std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
       KernelReqdWorkGroupSizeMD;

source/adapters/hip/device.cpp

Lines changed: 32 additions & 24 deletions
@@ -210,14 +210,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(uint64_t{MaxAlloc});
   }
   case UR_DEVICE_INFO_IMAGE_SUPPORTED: {
-    return ReturnValue(uint32_t{true});
+    return ReturnValue(true);
   }
   case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: {
     // This call doesn't match to HIP as it doesn't have images, but instead
     // surfaces and textures. No clear call in the HIP API to determine this,
     // but some searching found as of SM 2.x 128 are supported.
     return ReturnValue(128u);
   }
+  case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: {
+    // This call doesn't match to HIP as it doesn't have images, but instead
+    // surfaces and textures. No clear call in the HIP API to determine this,
+    // but some searching found as of SM 2.x 128 are supported.
+    return ReturnValue(128u);
+  }
   case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: {
     // This call doesn't match to HIP as it doesn't have images, but instead
     // surfaces and textures. No clear call in the HIP API to determine this,
@@ -339,7 +345,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(0u);
   }
   case UR_DEVICE_INFO_SINGLE_FP_CONFIG: {
-    uint64_t Config =
+    ur_device_fp_capability_flags_t Config =
         UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
         UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
         UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
@@ -350,12 +356,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(Config);
   }
   case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: {
-    uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
-                      UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
-                      UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
-                      UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
-                      UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
-                      UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
+    ur_device_fp_capability_flags_t Config =
+        UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
+        UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
+        UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
+        UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
+        UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
+        UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
     return ReturnValue(Config);
   }
   case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: {
@@ -459,14 +466,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   }
   case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
     // The mandated minimum capability:
-    uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
-                          UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+    ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
+                                  UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
     return ReturnValue(Capability);
   }
   case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
   case UR_DEVICE_INFO_QUEUE_PROPERTIES: {
     // The mandated minimum capability:
-    uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
+    ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
     return ReturnValue(Capability);
   }
   case UR_DEVICE_INFO_BUILT_IN_KERNELS: {
@@ -730,9 +737,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   }

   case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
-    uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
-                            UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
-                            UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
+    ur_memory_order_capability_flags_t Capabilities =
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
     return ReturnValue(Capabilities);
   }
   case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
@@ -821,7 +829,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
   case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH:
   case UR_DEVICE_INFO_BFLOAT16:
-    return UR_RESULT_ERROR_INVALID_ENUMERATION;
+  case UR_DEVICE_INFO_IL_VERSION:
+  case UR_DEVICE_INFO_ASYNC_BARRIER:
+  case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;

   default:
     break;
@@ -939,21 +950,18 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
   if (pDeviceTimestamp) {
     UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault));
     UR_CHECK_ERROR(hipEventRecord(Event));
-  }
-  if (pHostTimestamp) {
-    using namespace std::chrono;
-    *pHostTimestamp =
-        duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
-            .count();
-  }
-
-  if (pDeviceTimestamp) {
     UR_CHECK_ERROR(hipEventSynchronize(Event));
     float ElapsedTime = 0.0f;
     UR_CHECK_ERROR(hipEventElapsedTime(&ElapsedTime,
                                        ur_platform_handle_t_::EvBase, Event));
     *pDeviceTimestamp = (uint64_t)(ElapsedTime * (double)1e6);
   }

+  if (pHostTimestamp) {
+    using namespace std::chrono;
+    *pHostTimestamp =
+        duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
+            .count();
+  }
   return UR_RESULT_SUCCESS;
 }
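The switch from uint64_t to the dedicated flag types matters because the info query copies back a value whose size must match the queried property; presumably the CTS queries these properties with the flag types from the spec, so returning a 64-bit value can trip the size check in the return helper. A minimal sketch of querying one of these properties (not part of the commit; the helper name is illustrative):

#include <ur_api.h>

// Sketch only: hDevice is assumed to be a valid device handle.
ur_device_fp_capability_flags_t querySingleFpConfig(ur_device_handle_t hDevice) {
  ur_device_fp_capability_flags_t Config = 0;
  urDeviceGetInfo(hDevice, UR_DEVICE_INFO_SINGLE_FP_CONFIG, sizeof(Config),
                  &Config, nullptr);
  return Config;
}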

source/adapters/hip/kernel.cpp

Lines changed: 13 additions & 2 deletions
@@ -22,8 +22,12 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
     ScopedContext Active(hProgram->getContext()->getDevice());

     hipFunction_t HIPFunc;
-    UR_CHECK_ERROR(
-        hipModuleGetFunction(&HIPFunc, hProgram->get(), pKernelName));
+    hipError_t KernelError =
+        hipModuleGetFunction(&HIPFunc, hProgram->get(), pKernelName);
+    if (KernelError == hipErrorNotFound) {
+      return UR_RESULT_ERROR_INVALID_KERNEL_NAME;
+    }
+    UR_CHECK_ERROR(KernelError);

     std::string KernelNameWoffset = std::string(pKernelName) + "_with_offset";
     hipFunction_t HIPFuncWithOffsetParam;
@@ -321,3 +325,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     const ur_kernel_native_properties_t *, ur_kernel_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
+    [[maybe_unused]] ur_kernel_handle_t hKernel,
+    [[maybe_unused]] uint32_t count,
+    [[maybe_unused]] const ur_specialization_constant_info_t *pSpecConstants) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
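Mapping hipErrorNotFound to UR_RESULT_ERROR_INVALID_KERNEL_NAME lets callers tell a missing or misspelled kernel name apart from other failures. A minimal caller-side sketch (not part of the commit; the helper name is illustrative):

#include <ur_api.h>

// Sketch only: hProgram is assumed to be a program already built for HIP.
ur_result_t createKernelChecked(ur_program_handle_t hProgram, const char *Name,
                                ur_kernel_handle_t *phKernel) {
  ur_result_t Res = urKernelCreate(hProgram, Name, phKernel);
  if (Res == UR_RESULT_ERROR_INVALID_KERNEL_NAME) {
    // The module loaded but contains no kernel with this name
    // (hipModuleGetFunction returned hipErrorNotFound).
  }
  return Res;
}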
