Skip to content

Commit 2532282

Browse files
authored
[UR][Offload] Add initial membuffer implementation (#18849)
Add a basic implementation of buffers to the Offload adapter. The design is loosely based on the CUDA and HIP adapters. For now, contexts only support a single device, so the implementation is relatively simple. When multi-device contexts are supported, we will need to handle migration of data between devices (or otherwise change this implementation).
1 parent bb57e4a commit 2532282

File tree

9 files changed

+304
-7
lines changed

9 files changed

+304
-7
lines changed

unified-runtime/source/adapters/offload/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME}
3737
${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp
3838
${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
3939
${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
40+
${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp
4041
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
4142
${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
4243
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp

unified-runtime/source/adapters/offload/context.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
#pragma once
1212

13+
#include "adapter.hpp"
1314
#include "common.hpp"
15+
#include "device.hpp"
1416
#include <OffloadAPI.h>
1517
#include <unordered_map>
1618
#include <ur_api.h>

unified-runtime/source/adapters/offload/enqueue.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
#include <assert.h>
1313
#include <ur_api.h>
1414

15+
#include "context.hpp"
1516
#include "event.hpp"
1617
#include "kernel.hpp"
18+
#include "memory.hpp"
1719
#include "queue.hpp"
1820
#include "ur2offload.hpp"
1921

@@ -88,3 +90,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
8890
size_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
8991
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
9092
}
93+
94+
/// Enqueue a copy of `size` bytes, starting `offset` bytes into the buffer's
/// device allocation, to the host memory at `pDst`.
///
/// @param hQueue        Queue whose OffloadQueue/OffloadDevice perform the copy.
/// @param hBuffer       Buffer to read from; must hold a BufferMem.
/// @param blockingRead  When true, wait on the queue before returning.
/// @param offset        Byte offset into the buffer's device allocation.
/// @param size          Number of bytes to copy.
/// @param pDst          Host destination pointer.
/// @param numEventsInWaitList, phEventWaitList  Currently ignored.
/// @param phEvent       Optional; receives a new event wrapping the copy event.
/// @return UR_RESULT_SUCCESS, or the translated Offload error if the copy or
///         the blocking wait fails.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead,
    size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {

  // Ignore wait list for now
  (void)numEventsInWaitList;
  (void)phEventWaitList;

  ol_event_handle_t EventOut = nullptr;

  // Byte-wise pointer: arithmetic on `void *` is a compiler extension, not
  // standard C++.
  char *DevPtr = static_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

  // Propagate failures instead of silently reporting success, matching
  // urEnqueueMemBufferWrite.
  auto CopyRes =
      olMemcpy(hQueue->OffloadQueue, pDst, Adapter.HostDevice, DevPtr + offset,
               hQueue->OffloadDevice, size, phEvent ? &EventOut : nullptr);
  if (CopyRes) {
    return offloadResultToUR(CopyRes);
  }

  if (blockingRead) {
    auto WaitRes = olWaitQueue(hQueue->OffloadQueue);
    if (WaitRes) {
      return offloadResultToUR(WaitRes);
    }
  }

  if (phEvent) {
    auto *Event = new ur_event_handle_t_();
    Event->OffloadEvent = EventOut;
    *phEvent = Event;
  }

  return UR_RESULT_SUCCESS;
}
123+
124+
/// Enqueue a copy of `size` bytes from the host memory at `pSrc` into the
/// buffer's device allocation, starting `offset` bytes in.
///
/// @param hQueue         Queue whose OffloadQueue/OffloadDevice perform the copy.
/// @param hBuffer        Buffer to write to; must hold a BufferMem.
/// @param blockingWrite  When true, wait on the queue before returning.
/// @param offset         Byte offset into the buffer's device allocation.
/// @param size           Number of bytes to copy.
/// @param pSrc           Host source pointer.
/// @param numEventsInWaitList, phEventWaitList  Currently ignored.
/// @param phEvent        Optional; receives a new event wrapping the copy event.
/// @return UR_RESULT_SUCCESS, or the translated Offload error if the copy or
///         the blocking wait fails.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite,
    size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {

  // Ignore wait list for now
  (void)numEventsInWaitList;
  (void)phEventWaitList;

  ol_event_handle_t EventOut = nullptr;

  // Byte-wise pointer: arithmetic on `void *` is a compiler extension, not
  // standard C++.
  char *DevPtr = static_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

  auto CopyRes =
      olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice,
               pSrc, Adapter.HostDevice, size, phEvent ? &EventOut : nullptr);
  if (CopyRes) {
    return offloadResultToUR(CopyRes);
  }

  if (blockingWrite) {
    // Distinct name: the original shadowed the outer result variable here.
    auto WaitRes = olWaitQueue(hQueue->OffloadQueue);
    if (WaitRes) {
      return offloadResultToUR(WaitRes);
    }
  }

  if (phEvent) {
    auto *Event = new ur_event_handle_t_();
    Event->OffloadEvent = EventOut;
    *phEvent = Event;
  }

  return UR_RESULT_SUCCESS;
}

unified-runtime/source/adapters/offload/kernel.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "kernel.hpp"
12+
#include "memory.hpp"
1213
#include "program.hpp"
1314
#include "ur2offload.hpp"
1415
#include <OffloadAPI.h>
@@ -83,6 +84,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
8384
return UR_RESULT_SUCCESS;
8485
}
8586

87+
/// Bind a memory object to kernel argument `argIndex`.
///
/// A null `hArgValue` is recorded as an empty (zero-size, null) argument.
/// Otherwise the mem object is registered with the requested access flags
/// (UR_MEM_FLAG_READ_WRITE when `Properties` is null) and the buffer's device
/// pointer is added as a pointer-sized argument value.
///
/// @return Always UR_RESULT_SUCCESS.
UR_APIEXPORT ur_result_t UR_APICALL
urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
                     const ur_kernel_arg_mem_obj_properties_t *Properties,
                     ur_mem_handle_t hArgValue) {
  auto &KernelArgs = hKernel->Args;

  // Handle zero-sized buffers
  if (!hArgValue) {
    KernelArgs.addArg(argIndex, 0, nullptr);
    return UR_RESULT_SUCCESS;
  }

  // Default to read-write access when the caller supplies no properties.
  ur_mem_flags_t AccessFlags =
      static_cast<ur_mem_flags_t>(UR_MEM_FLAG_READ_WRITE);
  if (Properties) {
    AccessFlags = Properties->memoryAccess;
  }
  KernelArgs.addMemObjArg(argIndex, hArgValue, AccessFlags);

  // The argument value is the device pointer itself, so pass its address.
  auto DevicePtr = std::get<BufferMem>(hArgValue->Mem).Ptr;
  KernelArgs.addArg(argIndex, sizeof(void *), &DevicePtr);
  return UR_RESULT_SUCCESS;
}
106+
86107
UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(
87108
ur_kernel_handle_t, ur_device_handle_t, ur_kernel_group_info_t propName,
88109
size_t propSize, void *pPropValue, size_t *pPropSizeRet) {

unified-runtime/source/adapters/offload/kernel.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,13 @@ struct ur_kernel_handle_t_ : RefCounted {
3232
args_size_t ParamSizes;
3333
args_ptr_t Pointers;
3434

35+
struct MemObjArg {
36+
ur_mem_handle_t_ *Mem;
37+
int Index;
38+
ur_mem_flags_t AccessFlags;
39+
};
40+
std::vector<MemObjArg> MemObjArgs;
41+
3542
// Add an argument. If it already exists, it is replaced. Gaps are filled
3643
// with empty arguments.
3744
void addArg(size_t Index, size_t Size, const void *Arg) {
@@ -48,6 +55,19 @@ struct ur_kernel_handle_t_ : RefCounted {
4855
Pointers[Index] = &Storage[InsertPos];
4956
}
5057

58+
void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
59+
assert(hMem && "Invalid mem handle");
60+
// If a memobj is already set at this index, update the entry rather
61+
// than adding a duplicate one
62+
for (auto &Arg : MemObjArgs) {
63+
if (Arg.Index == Index) {
64+
Arg = MemObjArg{hMem, Index, Flags};
65+
return;
66+
}
67+
}
68+
MemObjArgs.push_back(MemObjArg{hMem, Index, Flags});
69+
}
70+
5171
const args_ptr_t &getPointers() const noexcept { return Pointers; }
5272

5373
const char *getStorage() const noexcept { return Storage.data(); }
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
//===----------- memory.cpp - LLVM Offload Adapter -----------------------===//
2+
//
3+
// Copyright (C) 2025 Intel Corporation
4+
//
5+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
// Exceptions. See LICENSE.TXT
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <OffloadAPI.h>
12+
#include <unordered_set>
13+
#include <ur/ur.hpp>
14+
#include <ur_api.h>
15+
16+
#include "adapter.hpp"
17+
#include "context.hpp"
18+
#include "device.hpp"
19+
#include "memory.hpp"
20+
#include "ur2offload.hpp"
21+
22+
/// Create a buffer of `size` bytes on the context's device.
///
/// With UR_MEM_FLAG_ALLOC_HOST_POINTER the storage is an Offload host
/// allocation that is also used as the device pointer; otherwise it is a
/// device allocation. With UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER or
/// UR_MEM_FLAG_USE_HOST_POINTER the buffer is initialised by copying `size`
/// bytes from `pProperties->pHost`.
///
/// @param hContext     Context that owns the buffer (single device for now).
/// @param flags        UR memory flags selecting allocation/initialisation.
/// @param size         Size of the buffer in bytes.
/// @param pProperties  Optional properties; `pHost` supplies initial data.
/// @param phBuffer     Receives the new buffer handle on success.
/// @return UR_RESULT_SUCCESS on success; UR_RESULT_ERROR_INVALID_HOST_PTR when
///         an initial copy is requested without a host pointer; otherwise the
///         translated Offload error from allocation or the initial copy.
UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
    ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size,
    const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) {

  // TODO: We can avoid the initial copy with USE_HOST_POINTER by implementing
  // something like olMemRegister
  const bool PerformInitialCopy =
      (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) ||
      (flags & UR_MEM_FLAG_USE_HOST_POINTER);

  // Keep the caller's pointer separately: HostPtr below may be replaced by a
  // runtime allocation, and the initial copy must still read the user's data.
  void *const UserHostPtr = pProperties ? pProperties->pHost : nullptr;
  if (PerformInitialCopy && !UserHostPtr) {
    // Reject before allocating anything so there is nothing to clean up.
    return UR_RESULT_ERROR_INVALID_HOST_PTR;
  }

  void *Ptr = nullptr;
  void *HostPtr = UserHostPtr;
  auto OffloadDevice = hContext->Device->OffloadDevice;
  auto AllocMode = BufferMem::AllocMode::Default;

  if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) {
    auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_HOST, size, &HostPtr);
    if (Res) {
      return offloadResultToUR(Res);
    }
    // TODO: We (probably) need something like cuMemHostGetDevicePointer
    // for this to work everywhere. For now assume the managed host pointer is
    // device-accessible.
    Ptr = HostPtr;
    AllocMode = BufferMem::AllocMode::AllocHostPtr;
  } else {
    auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_DEVICE, size, &Ptr);
    if (Res) {
      return offloadResultToUR(Res);
    }
    if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) {
      AllocMode = BufferMem::AllocMode::CopyIn;
    }
  }

  ur_mem_handle_t ParentBuffer = nullptr;
  auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
      hContext, ParentBuffer, flags, AllocMode, Ptr, HostPtr, size});

  if (PerformInitialCopy) {
    auto Res = olMemcpy(nullptr, Ptr, OffloadDevice, UserHostPtr,
                        Adapter.HostDevice, size, nullptr);
    if (Res) {
      // Destroying the handle only releases the context, so the allocation
      // must be freed here or it leaks. Best-effort: the copy error is the
      // one reported to the caller.
      (void)olMemFree(Ptr);
      return offloadResultToUR(Res);
    }
  }

  *phBuffer = URMemObj.release();

  return UR_RESULT_SUCCESS;
}
73+
74+
/// Take an additional reference on `hMem` by incrementing its reference
/// count.
///
/// @return Always UR_RESULT_SUCCESS.
UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
  ++hMem->RefCount;
  return UR_RESULT_SUCCESS;
}
78+
79+
/// Drop one reference on `hMem`. When the count reaches zero the handle is
/// destroyed and, for buffers, the underlying allocation is freed with
/// olMemFree.
///
/// @return UR_RESULT_SUCCESS, or the translated Offload error if freeing the
///         allocation fails (the handle is destroyed in either case).
UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
  const bool StillReferenced = --hMem->RefCount > 0;
  if (StillReferenced) {
    return UR_RESULT_SUCCESS;
  }

  // Last reference: take ownership so the handle is deleted on every path
  // out of this function.
  std::unique_ptr<ur_mem_handle_t_> Owner(hMem);

  if (Owner->MemType != ur_mem_handle_t_::Type::Buffer) {
    return UR_RESULT_SUCCESS;
  }

  // TODO: Handle registered host memory
  auto &Buffer = std::get<BufferMem>(Owner->Mem);
  if (auto FreeRes = olMemFree(Buffer.Ptr)) {
    return offloadResultToUR(FreeRes);
  }

  return UR_RESULT_SUCCESS;
}
96+
97+
/// Query a property of a memory object.
///
/// Supported queries: UR_MEM_INFO_SIZE (buffer size), UR_MEM_INFO_CONTEXT
/// (owning context) and UR_MEM_INFO_REFERENCE_COUNT (current reference count).
///
/// @param hMemory       Memory object to query.
/// @param MemInfoType   Property being requested.
/// @param propSize      Size of the `pMemInfo` storage.
/// @param pMemInfo      Output storage for the property value.
/// @param pPropSizeRet  Output for the property size.
/// @return The result of the return helper for supported queries, or
///         UR_RESULT_ERROR_INVALID_ENUMERATION for any other query.
UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
                                                 ur_mem_info_t MemInfoType,
                                                 size_t propSize,
                                                 void *pMemInfo,
                                                 size_t *pPropSizeRet) {
  UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet);

  if (MemInfoType == UR_MEM_INFO_SIZE) {
    auto &Buffer = std::get<BufferMem>(hMemory->Mem);
    return ReturnValue(Buffer.Size);
  }

  if (MemInfoType == UR_MEM_INFO_CONTEXT) {
    return ReturnValue(hMemory->getContext());
  }

  if (MemInfoType == UR_MEM_INFO_REFERENCE_COUNT) {
    return ReturnValue(hMemory->RefCount.load());
  }

  return UR_RESULT_ERROR_INVALID_ENUMERATION;
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
//===----------- memory.hpp - LLVM Offload Adapter -----------------------===//
2+
//
3+
// Copyright (C) 2025 Intel Corporation
4+
//
5+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
// Exceptions. See LICENSE.TXT
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#pragma once
12+
13+
#include "ur_api.h"
14+
15+
#include "common.hpp"
16+
17+
struct BufferMem {
18+
enum class AllocMode {
19+
Default,
20+
UseHostPtr,
21+
CopyIn,
22+
AllocHostPtr,
23+
};
24+
25+
ur_mem_handle_t Parent;
26+
// Underlying device pointer
27+
void *Ptr;
28+
// Pointer associated with this device on the host
29+
void *HostPtr;
30+
size_t Size;
31+
32+
AllocMode MemAllocMode;
33+
34+
BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, void *Ptr,
35+
void *HostPtr, size_t Size)
36+
: Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size},
37+
MemAllocMode{Mode} {};
38+
39+
void *get() const noexcept { return Ptr; }
40+
size_t getSize() const noexcept { return Size; }
41+
};
42+
43+
struct ur_mem_handle_t_ : RefCounted {
44+
ur_context_handle_t Context;
45+
46+
enum class Type { Buffer } MemType;
47+
ur_mem_flags_t MemFlags;
48+
49+
// For now we only support BufferMem. Eventually we'll support images, so use
50+
// a variant to store the underlying object.
51+
std::variant<BufferMem> Mem;
52+
53+
ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent,
54+
ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode,
55+
void *Ptr, void *HostPtr, size_t Size)
56+
: Context{Context}, MemType{Type::Buffer}, MemFlags{MemFlags},
57+
Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} {
58+
urContextRetain(Context);
59+
};
60+
61+
~ur_mem_handle_t_() { urContextRelease(Context); }
62+
63+
ur_context_handle_t getContext() const noexcept { return Context; }
64+
};

unified-runtime/source/adapters/offload/ur_interface_loader.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,16 +149,16 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) {
149149
if (UR_RESULT_SUCCESS != result) {
150150
return result;
151151
}
152-
pDdiTable->pfnBufferCreate = nullptr;
152+
pDdiTable->pfnBufferCreate = urMemBufferCreate;
153153
pDdiTable->pfnBufferPartition = nullptr;
154154
pDdiTable->pfnBufferCreateWithNativeHandle = nullptr;
155155
pDdiTable->pfnImageCreateWithNativeHandle = nullptr;
156-
pDdiTable->pfnGetInfo = nullptr;
156+
pDdiTable->pfnGetInfo = urMemGetInfo;
157157
pDdiTable->pfnGetNativeHandle = nullptr;
158158
pDdiTable->pfnImageCreate = nullptr;
159159
pDdiTable->pfnImageGetInfo = nullptr;
160-
pDdiTable->pfnRelease = nullptr;
161-
pDdiTable->pfnRetain = nullptr;
160+
pDdiTable->pfnRelease = urMemRelease;
161+
pDdiTable->pfnRetain = urMemRetain;
162162
return UR_RESULT_SUCCESS;
163163
}
164164

@@ -177,9 +177,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
177177
pDdiTable->pfnMemBufferCopyRect = nullptr;
178178
pDdiTable->pfnMemBufferFill = nullptr;
179179
pDdiTable->pfnMemBufferMap = nullptr;
180-
pDdiTable->pfnMemBufferRead = nullptr;
180+
pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead;
181181
pDdiTable->pfnMemBufferReadRect = nullptr;
182-
pDdiTable->pfnMemBufferWrite = nullptr;
182+
pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite;
183183
pDdiTable->pfnMemBufferWriteRect = nullptr;
184184
pDdiTable->pfnMemImageCopy = nullptr;
185185
pDdiTable->pfnMemImageRead = nullptr;

unified-runtime/test/conformance/memory/urMemBufferCreate.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,10 @@ TEST_P(urMemBufferCreateTest, CopyHostPointer) {
121121

122122
TEST_P(urMemBufferCreateTest, UseHostPointer) {
123123
// These all copy memory instead of mapping it
124+
// https://github.com/intel/llvm/issues/18836
124125
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::HIP{},
125-
uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"});
126+
uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"},
127+
uur::Offload{});
126128

127129
std::vector<unsigned char> dataWrite{};
128130
dataWrite.resize(4096);

0 commit comments

Comments
 (0)