Skip to content

Commit 4fb2814

Browse files
committed
Device global implementation
--------------------------------- Currently the device global pointer is mocked by passing an actual device allocation into the function. This workaround should be removed once the pointer is available through the autodiscovery string.
1 parent 67f7f88 commit 4fb2814

File tree

10 files changed

+377
-35
lines changed

10 files changed

+377
-35
lines changed

include/CL/cl_ext.h

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2384,6 +2384,68 @@ clCreateBufferWithPropertiesINTEL_fn)(
23842384
void * host_ptr,
23852385
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
23862386

2387+
/**********************************
2388+
* cl_intel_global_variable_access *
2389+
***********************************/
2390+
2391+
#define CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL 0x418E
2392+
#define CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL 0x418F
2393+
2394+
// TODO; remove dev_global_ptr from parameter
2395+
extern CL_API_ENTRY cl_int CL_API_CALL
2396+
clEnqueueReadGlobalVariableINTEL(
2397+
cl_command_queue command_queue,
2398+
cl_program program,
2399+
const char* name,
2400+
cl_bool blocking_read,
2401+
size_t size,
2402+
size_t offset,
2403+
void* ptr,
2404+
cl_uint num_events_in_wait_list,
2405+
const cl_event* event_wait_list,
2406+
cl_event* event,
2407+
const void* dev_global_ptr) CL_API_SUFFIX__VERSION_1_0;
2408+
2409+
2410+
typedef CL_API_ENTRY cl_int (CL_API_CALL *
2411+
clEnqueueReadGlobalVariableINTEL_fn)(
2412+
cl_command_queue command_queue,
2413+
cl_program program,
2414+
const char* name,
2415+
cl_bool blocking_read,
2416+
size_t size,
2417+
size_t offset,
2418+
const void* ptr,
2419+
cl_uint num_events_in_wait_list,
2420+
const cl_event* event_wait_list,
2421+
cl_event* event, const void* dev_global_ptr) CL_API_SUFFIX__VERSION_1_0;
2422+
2423+
extern CL_API_ENTRY cl_int CL_API_CALL
2424+
clEnqueueWriteGlobalVariableINTEL(
2425+
cl_command_queue command_queue,
2426+
cl_program program,
2427+
const char* name,
2428+
cl_bool blocking_write,
2429+
size_t size,
2430+
size_t offset,
2431+
const void* ptr,
2432+
cl_uint num_events_in_wait_list,
2433+
const cl_event* event_wait_list,
2434+
cl_event* event, const void* dev_global_ptr) CL_API_SUFFIX__VERSION_1_0;
2435+
2436+
typedef CL_API_ENTRY cl_int (CL_API_CALL *
2437+
clEnqueueWriteGlobalVariableINTEL_fn)(
2438+
cl_command_queue command_queue,
2439+
cl_program program,
2440+
const char* name,
2441+
cl_bool blocking_read,
2442+
size_t size,
2443+
size_t offset,
2444+
void* ptr,
2445+
cl_uint num_events_in_wait_list,
2446+
const cl_event* event_wait_list,
2447+
cl_event* event, const void* dev_global_ptr) CL_API_SUFFIX__VERSION_1_0;
2448+
23872449
/******************************************
23882450
* cl_intel_mem_channel_property extension *
23892451
*******************************************/

include/acl_kernel.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ void acl_receive_kernel_update(int activation_id, cl_int status);
5454
// safe to submit a kernel with subbuffers to the device_op_queue
5555
int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);
5656

57+
// Store the raw pointer value `arg_value` as argument `arg_index` of `kernel`.
// Returns a cl_int status code.
// NOTE(review): judging by the name, this skips the pointer validation that
// clSetKernelArgMemPointerINTEL performs -- confirm against the definition.
cl_int set_kernel_arg_mem_pointer_without_checks(
58+
cl_kernel kernel, cl_uint arg_index, void *arg_value);
59+
5760
#if defined(__cplusplus)
5861
} /* extern "C" */
5962
#endif

src/acl_event.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -812,7 +812,6 @@ cl_int acl_create_event(cl_command_queue command_queue, cl_uint num_events,
812812
#endif
813813

814814
context = command_queue->context;
815-
816815
event = l_get_unused_event(context);
817816
if (event == 0) {
818817
acl_context_callback(context, "Could not allocate an event or command");
@@ -861,7 +860,6 @@ cl_int acl_create_event(cl_command_queue command_queue, cl_uint num_events,
861860
events[i]->depend_on_me.pop_back();
862861
}
863862
}
864-
865863
// Return event back to free pool
866864
event->depend_on.clear();
867865
l_return_event_to_free_pool(event);
@@ -871,7 +869,6 @@ cl_int acl_create_event(cl_command_queue command_queue, cl_uint num_events,
871869

872870
if (event) {
873871
// Success!
874-
875872
// Normally, the command info union usage is determined only
876873
// by the command type. But in the case of CL_COMMAND_MAP_BUFFER
877874
// and CL_COMMAND_UNMAP_MEM_OBJECT, we need this extra bit to know
@@ -919,7 +916,6 @@ cl_int acl_create_event(cl_command_queue command_queue, cl_uint num_events,
919916
printf(" Notify: Event [%d] has been created:\n", event->id);
920917
acl_dump_event(event);
921918
}
922-
923919
acl_track_object(ACL_OBJ_EVENT, event);
924920
} else {
925921
acl_context_callback(context, "Could not allocate an event");

src/acl_icd_dispatch.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ clGetExtensionFunctionAddressIntelFPGA(const char *func_name) {
5050
ADDFUNCTIONLOOKUP(clResetKernelsIntelFPGA);
5151
ADDFUNCTIONLOOKUP(clSetBoardLibraryIntelFPGA);
5252
ADDFUNCTIONLOOKUP(clCreateBufferWithPropertiesINTEL);
53+
ADDFUNCTIONLOOKUP(clEnqueueReadGlobalVariableINTEL);
54+
ADDFUNCTIONLOOKUP(clEnqueueWriteGlobalVariableINTEL);
5355

5456
// USM APIs are not currently supported on 32bit devices
5557
#ifndef __arm__

src/acl_kernel.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,40 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer(
837837
return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value);
838838
}
839839

840+
/**
 * Record a raw pointer as a kernel argument without checking that the
 * pointer refers to an allocation known to the kernel's context.
 *
 * The kernel handle and the argument index are still validated.
 *
 * @param kernel     Kernel whose argument is being set.
 * @param arg_index  Zero-based index of the argument to set.
 * @param arg_value  Pointer value to store; it is copied by value and is
 *                   never dereferenced here.
 * @return CL_SUCCESS on success, CL_INVALID_KERNEL if `kernel` is not valid,
 *         CL_INVALID_ARG_INDEX if `arg_index` is out of range for the
 *         kernel's interface.
 */
cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
                                                 cl_uint arg_index,
                                                 void *arg_value) {
  acl_lock();
  if (!acl_kernel_is_valid(kernel)) {
    UNLOCK_RETURN(CL_INVALID_KERNEL);
  }

  cl_context context = kernel->program->context;

  if (arg_index >= kernel->accel_def->iface.args.size()) {
    UNLOCK_ERR_RET(CL_INVALID_ARG_INDEX, context,
                   "Argument index is too large");
  }

  // Determine where to write the value.
  size_t start_idx = 0;
  size_t iface_arg_size = 0;
  l_get_arg_offset_and_size(kernel, arg_index, &start_idx, &iface_arg_size);

  // The bytes of the pointer itself (not the pointee) go into the argument
  // block.
  // NOTE(review): the copy length is iface_arg_size while the source object
  // is a single void*; confirm iface_arg_size never exceeds sizeof(void *).
  safe_memcpy(&(kernel->arg_value[start_idx]), &arg_value, iface_arg_size,
              kernel->arg_value_size - start_idx, iface_arg_size);
  kernel->arg_is_svm[arg_index] = CL_FALSE;
  kernel->arg_is_ptr[arg_index] = CL_TRUE;

  kernel->arg_defined[arg_index] = 1;

  // Grow ptr_arg_vector by doubling until arg_index is a valid position.
  // Doubling has to start from at least one element: doubling an empty
  // vector keeps it empty, which would make the growth loop spin forever.
  size_t required_size = kernel->ptr_arg_vector.size();
  if (required_size <= arg_index) {
    if (required_size == 0) {
      required_size = 1;
    }
    while (required_size <= arg_index) {
      required_size *= 2;
    }
    kernel->ptr_arg_vector.resize(required_size);
  }
  kernel->ptr_arg_vector[arg_index] = arg_value;

  UNLOCK_RETURN(CL_SUCCESS);
}
873+
840874
ACL_EXPORT
841875
CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL(
842876
cl_kernel kernel, cl_uint arg_index, const void *arg_value) {

src/acl_mem.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <acl_support.h>
2525
#include <acl_svm.h>
2626
#include <acl_util.h>
27+
#include <acl_kernel.h>
2728
#include <check_copy_overlap.h>
2829

2930
#ifdef __GNUC__
@@ -405,6 +406,122 @@ int acl_bind_buffer_to_device(cl_device_id device, cl_mem mem) {
405406
return 1;
406407
}
407408

409+
/**
 * Entry point for reading a program-scope (device global) variable.
 *
 * All arguments are forwarded unchanged to
 * clEnqueueWriteGlobalVariableINTEL and its status is returned.
 *
 * NOTE(review): forwarding a read to the write path means `ptr` is used as
 * the source of the transfer, not the destination. This looks like a
 * placeholder until a dedicated read path exists -- confirm before relying
 * on this function to return device data.
 *
 * The fourth parameter plays the role of `blocking_read` in the public
 * header declaration.
 *
 * TODO: get dev_global_ptr from autodiscovery instead of taking it as a
 * parameter.
 */
ACL_EXPORT
CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadGlobalVariableINTEL(
    cl_command_queue command_queue, cl_program program, const char *name,
    cl_bool blocking_write, size_t size, size_t offset, void *ptr,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *event, const void *dev_global_ptr) {
  return clEnqueueWriteGlobalVariableINTEL(
      command_queue, program, name, blocking_write, size, offset, ptr,
      num_events_in_wait_list, event_wait_list, event, dev_global_ptr);
}
431+
432+
/**
 * Write `size` bytes from `ptr` into a program-scope (device global)
 * variable by running the kernel called `name` from `program`.
 *
 * Steps:
 *   1. Create the kernel `name`.
 *   2. Allocate a temporary device USM buffer and copy `ptr` into it with a
 *      blocking memcpy, because the kernel argument is set as a device USM
 *      pointer.
 *   3. Set kernel arg 0 = staging buffer, arg 1 = device global address,
 *      arg 2 = size, then enqueue the kernel as a task.
 *   4. Optionally wait for completion, then release the staging buffer and
 *      the kernel.
 *
 * Every exit after kernel creation goes through the same cleanup, so the
 * kernel and the staging buffer are released on failure as well. The first
 * error encountered is the one returned.
 *
 * @param command_queue  Queue used for the staging copy and the kernel.
 * @param program        Program containing the kernel `name`.
 * @param name           Name passed to clCreateKernelIntelFPGA.
 * @param blocking_write When true, wait for the kernel before returning.
 * @param size           Number of bytes to transfer.
 * @param offset         Offset into the device global (see note on scaling).
 * @param ptr            Source data; per the open questions in the original
 *                       notes this is expected to be a USM host pointer.
 * @param num_events_in_wait_list, event_wait_list  Dependencies for the
 *                       kernel launch.
 * @param event          Optional; receives the kernel's event. May be NULL
 *                       even when `blocking_write` is true.
 * @param dev_global_ptr Currently unused.
 * @return CL_SUCCESS, CL_INVALID_COMMAND_QUEUE for a NULL queue, or the
 *         first failing status from the underlying calls.
 */
ACL_EXPORT
CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteGlobalVariableINTEL(
    cl_command_queue command_queue, cl_program program, const char *name,
    cl_bool blocking_write, size_t size, size_t offset, const void *ptr,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *event, const void *dev_global_ptr) {
  // NOTE(review): the commit description says the device global pointer is
  // mocked through this parameter, but the address below is hard-coded and
  // the parameter is never read -- confirm which one is intended.
  (void)dev_global_ptr;

  // The queue's context and device are dereferenced below.
  if (command_queue == NULL) {
    return CL_INVALID_COMMAND_QUEUE;
  }

  cl_int status = CL_SUCCESS;
  cl_kernel kernel = clCreateKernelIntelFPGA(program, name, &status);
  if (status != CL_SUCCESS) {
    return status;
  }

  void *src_dev_ptr = NULL;

  // A blocking call needs an event to wait on even when the caller did not
  // ask for one; use a private event in that case and release it afterwards.
  cl_event private_event = NULL;
  cl_event *completion_event = event;
  if (blocking_write && completion_event == NULL) {
    completion_event = &private_event;
  }

  do {
    // Stage the source data in device memory.
    src_dev_ptr = clDeviceMemAllocINTEL(command_queue->context,
                                        command_queue->device, NULL, size, 1,
                                        &status);
    if (status != CL_SUCCESS) {
      break;
    }

    // This copy has to be blocking: the kernel reads the staging buffer.
    status = clEnqueueMemcpyINTEL(command_queue, CL_TRUE, src_dev_ptr, ptr,
                                  size, 0, NULL, NULL);
    if (status != CL_SUCCESS) {
      break;
    }

    status = clSetKernelArgMemPointerINTEL(kernel, 0, src_dev_ptr);
    if (status != CL_SUCCESS) {
      break;
    }

    // The device global address is not a USM allocation in this context, so
    // clSetKernelArgMemPointerINTEL would reject it unless
    // REMOVE_VALID_CHECKS is defined; use the unchecked setter instead.
    // TODO: get dev_global_address from autodiscovery instead later.
    // The kernel header does not carry the offset; it is folded into the
    // address here.
    // NOTE(review): the original notes call `offset` a byte offset, yet it
    // is scaled by 8 ("1 unit of offset is 8 bits") -- confirm the units.
    uintptr_t dev_global_address = 0x4000000;
    void *dev_global_ptr2 = (void *)(dev_global_address + offset * 8);
    status =
        set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr2);
    if (status != CL_SUCCESS) {
      break;
    }

    status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size));
    if (status != CL_SUCCESS) {
      break;
    }

    status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list,
                           event_wait_list, completion_event);
    if (status != CL_SUCCESS) {
      break;
    }

    acl_lock();
    // If nothing's blocking, then complete right away
    acl_idle_update(command_queue->context);
    acl_unlock();

    if (blocking_write) {
      status = clWaitForEvents(1, completion_event);
    }
  } while (0);

  // Cleanup runs on success and on failure; keep the first error seen.
  if (private_event != NULL) {
    cl_int release_event_status = clReleaseEvent(private_event);
    if (status == CL_SUCCESS) {
      status = release_event_status;
    }
  }

  if (src_dev_ptr != NULL) {
    // NOTE(review): for a non-blocking call the kernel may still be pending
    // when the staging buffer is freed -- confirm that clMemFreeINTEL is
    // safe to call while an enqueued kernel still references the buffer.
    cl_int free_status = clMemFreeINTEL(command_queue->context, src_dev_ptr);
    if (status == CL_SUCCESS) {
      status = free_status;
    }
  }

  cl_int release_kernel_status = clReleaseKernel(kernel);
  if (status == CL_SUCCESS) {
    status = release_kernel_status;
  }

  return status;
}
523+
524+
408525
ACL_EXPORT
409526
CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL(
410527
cl_context context, const cl_mem_properties_intel *properties,

src/acl_usm.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -863,7 +863,6 @@ CL_API_ENTRY cl_int CL_API_CALL clEnqueueMemcpyINTEL(
863863
UNLOCK_ERR_RET(CL_MEM_COPY_OVERLAP, command_queue->context,
864864
"Source and destination memory overlaps");
865865
}
866-
867866
acl_usm_allocation_t *dst_usm_alloc =
868867
acl_get_usm_alloc_from_ptr(command_queue->context, dst_ptr);
869868

test/acl_globals_test.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,16 @@ static acl_kernel_interface_t acltest_kernels[] = {
118118
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 8},
119119
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 16},
120120
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1024},
121-
}}};
121+
}},
122+
{// interface
123+
"kernel15_dev_global",
124+
{
125+
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1},
126+
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1},
127+
{ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0},
128+
// {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0},
129+
}}
130+
};
122131

123132
template <typename T, std::size_t N>
124133
static inline constexpr acl_addr_range_t ACL_RANGE_FROM_ARRAY(T (&a)[N]) {
@@ -191,6 +200,20 @@ static std::vector<acl_accel_def_t> acltest_complex_system_device0_accel = {
191200
{},
192201
{32768, 0, 0},
193202
1},
203+
{14,
204+
ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]),
205+
acltest_kernels[14],
206+
acltest_laspace_info,
207+
{0, 0, 0},
208+
0,
209+
0,
210+
1,
211+
0,
212+
32768,
213+
3,
214+
{},
215+
{32768, 0, 0},
216+
1},
194217
{1,
195218
ACL_RANGE_FROM_ARRAY(acltest_devicelocal[1]),
196219
acltest_kernels[1],

0 commit comments

Comments
 (0)