Skip to content

Commit e3ca4d1

Browse files
committed
Device global implementation
--------------------------------- Currently the device global pointer is mocked by passing an actual device allocation into the function. This mock should be removed once the pointer is available through the autodiscovery string.
1 parent 67f7f88 commit e3ca4d1

File tree

8 files changed

+357
-14
lines changed

8 files changed

+357
-14
lines changed

include/CL/cl_ext.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2384,6 +2384,66 @@ clCreateBufferWithPropertiesINTEL_fn)(
23842384
void * host_ptr,
23852385
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
23862386

2387+
/**********************************
2388+
* cl_intel_global_variable_access *
2389+
***********************************/
2390+
2391+
#define CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL 0x418E
2392+
#define CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL 0x418F
2393+
2394+
/*
 * Entry point of the cl_intel_global_variable_access extension that enqueues
 * a read (command type CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL).
 *
 * Parameters, as named in this declaration:
 *   command_queue            queue the command is enqueued on
 *   program                  program associated with `name`
 *   name                     NUL-terminated identifier to access
 *   blocking_read            cl_bool selecting a blocking call
 *   size, offset             extent of the access
 *   ptr                      writable host-side destination buffer
 *   num_events_in_wait_list,
 *   event_wait_list, event   usual OpenCL enqueue event arguments
 *
 * NOTE(review): units of `size`/`offset` and the full error contract are
 * defined by the extension specification, not by this header -- confirm
 * there.
 */
extern CL_API_ENTRY cl_int CL_API_CALL
2395+
clEnqueueReadGlobalVariableINTEL(
2396+
cl_command_queue command_queue,
2397+
cl_program program,
2398+
const char* name,
2399+
cl_bool blocking_read,
2400+
size_t size,
2401+
size_t offset,
2402+
void* ptr,
2403+
cl_uint num_events_in_wait_list,
2404+
const cl_event* event_wait_list,
2405+
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
2406+
2407+
2408+
/*
 * Function-pointer type matching clEnqueueReadGlobalVariableINTEL, for use
 * with extension function lookup.
 *
 * `ptr` is a non-const `void*`: a read stores data INTO the caller's buffer,
 * and the type must match the function's own prototype exactly so that the
 * function's address is assignable to this type without a cast.
 */
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueReadGlobalVariableINTEL_fn)(
    cl_command_queue command_queue,
    cl_program program,
    const char* name,
    cl_bool blocking_read,
    size_t size,
    size_t offset,
    void* ptr,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;
2420+
2421+
/*
 * Entry point of the cl_intel_global_variable_access extension that enqueues
 * a write (command type CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL).
 *
 * Parameters, as named in this declaration:
 *   command_queue            queue the command is enqueued on
 *   program                  program associated with `name`
 *   name                     NUL-terminated identifier to access
 *   blocking_write           cl_bool selecting a blocking call
 *   size, offset             extent of the access
 *   ptr                      read-only source buffer
 *   num_events_in_wait_list,
 *   event_wait_list, event   usual OpenCL enqueue event arguments
 *
 * NOTE(review): units of `size`/`offset` and the full error contract are
 * defined by the extension specification, not by this header -- confirm
 * there.
 */
extern CL_API_ENTRY cl_int CL_API_CALL
2422+
clEnqueueWriteGlobalVariableINTEL(
2423+
cl_command_queue command_queue,
2424+
cl_program program,
2425+
const char* name,
2426+
cl_bool blocking_write,
2427+
size_t size,
2428+
size_t offset,
2429+
const void* ptr,
2430+
cl_uint num_events_in_wait_list,
2431+
const cl_event* event_wait_list,
2432+
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
2433+
2434+
/*
 * Function-pointer type matching clEnqueueWriteGlobalVariableINTEL, for use
 * with extension function lookup.
 *
 * `ptr` is `const void*` and the flag is `blocking_write`: a write only reads
 * from the caller's buffer, and the type must match the function's own
 * prototype exactly so that the function's address is assignable to this
 * type without a cast.
 */
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueWriteGlobalVariableINTEL_fn)(
    cl_command_queue command_queue,
    cl_program program,
    const char* name,
    cl_bool blocking_write,
    size_t size,
    size_t offset,
    const void* ptr,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;
2446+
23872447
/******************************************
23882448
* cl_intel_mem_channel_property extension *
23892449
*******************************************/

include/acl_kernel.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status);
5454
// safe to submit a kernel with subbuffers to the device_op_queue
5555
int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);
5656

57+
// Record the raw pointer `arg_value` as pointer argument `arg_index` of
// `kernel`. The kernel handle and the argument index are still validated
// (CL_INVALID_KERNEL / CL_INVALID_ARG_INDEX); returns CL_SUCCESS otherwise.
// NOTE(review): judging by the name, the pointer itself is not validated
// against the context's allocations -- callers must pass a usable device
// address.
cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
58+
cl_uint arg_index,
59+
void *arg_value);
60+
5761
#if defined(__cplusplus)
5862
} /* extern "C" */
5963
#endif

src/acl_icd_dispatch.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ clGetExtensionFunctionAddressIntelFPGA(const char *func_name) {
5050
ADDFUNCTIONLOOKUP(clResetKernelsIntelFPGA);
5151
ADDFUNCTIONLOOKUP(clSetBoardLibraryIntelFPGA);
5252
ADDFUNCTIONLOOKUP(clCreateBufferWithPropertiesINTEL);
53+
ADDFUNCTIONLOOKUP(clEnqueueReadGlobalVariableINTEL);
54+
ADDFUNCTIONLOOKUP(clEnqueueWriteGlobalVariableINTEL);
5355

5456
// USM APIs are not currently supported on 32bit devices
5557
#ifndef __arm__

src/acl_kernel.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,41 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer(
837837
return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value);
838838
}
839839

840+
// Record the raw pointer `arg_value` as pointer argument `arg_index` of
// `kernel`, without checking the pointer itself.
//
// Only the kernel handle and the argument index are validated; the pointer
// value is stored as given. This is what allows a bare device address (for
// example a device-global address) to be passed as a kernel argument.
//
// Returns:
//   CL_INVALID_KERNEL     if `kernel` is not a valid kernel
//   CL_INVALID_ARG_INDEX  if `arg_index` is outside the kernel interface
//   CL_SUCCESS            otherwise
cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
                                                 cl_uint arg_index,
                                                 void *arg_value) {
  acl_lock();
  if (!acl_kernel_is_valid(kernel)) {
    UNLOCK_RETURN(CL_INVALID_KERNEL);
  }

  cl_context context = kernel->program->context;

  if (arg_index >= kernel->accel_def->iface.args.size()) {
    UNLOCK_ERR_RET(CL_INVALID_ARG_INDEX, context,
                   "Argument index is too large");
  }

  // Determine where to write the value.
  size_t start_idx = 0;
  size_t iface_arg_size = 0;
  l_get_arg_offset_and_size(kernel, arg_index, &start_idx, &iface_arg_size);

  // Copy the pointer value itself (not the pointee) into the argument block.
  // NOTE(review): the source-size bound passed here is iface_arg_size, so the
  // copy is only safe while the interface argument size equals
  // sizeof(void *) -- confirm for all supported targets.
  safe_memcpy(&(kernel->arg_value[start_idx]), &arg_value, iface_arg_size,
              kernel->arg_value_size - start_idx, iface_arg_size);
  kernel->arg_is_svm[arg_index] = CL_FALSE;
  kernel->arg_is_ptr[arg_index] = CL_TRUE;

  kernel->arg_defined[arg_index] = 1;

  // Grow the pointer table by doubling until `arg_index` is a valid slot.
  // The target size is computed first and seeded with 1, because doubling an
  // empty vector in place (0 * 2 == 0) would never terminate.
  if (kernel->ptr_arg_vector.size() <= arg_index) {
    size_t new_size = kernel->ptr_arg_vector.size();
    if (new_size == 0) {
      new_size = 1;
    }
    while (new_size <= arg_index) {
      new_size *= 2;
    }
    kernel->ptr_arg_vector.resize(new_size);
  }
  kernel->ptr_arg_vector[arg_index] = arg_value;

  UNLOCK_RETURN(CL_SUCCESS);
}
874+
840875
ACL_EXPORT
841876
CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL(
842877
cl_kernel kernel, cl_uint arg_index, const void *arg_value) {

src/acl_mem.cpp

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <acl_globals.h>
2020
#include <acl_hal.h>
2121
#include <acl_icd_dispatch.h>
22+
#include <acl_kernel.h>
2223
#include <acl_mem.h>
2324
#include <acl_platform.h>
2425
#include <acl_support.h>
@@ -405,6 +406,119 @@ int acl_bind_buffer_to_device(cl_device_id device, cl_mem mem) {
405406
return 1;
406407
}
407408

409+
// Mock implementation of the cl_intel_global_variable_access read entry
// point.
//
// NOTE: this does NOT read the device global yet. Every argument is forwarded
// unchanged to clEnqueueWriteGlobalVariableINTEL, so the call currently
// behaves exactly like a write with `ptr` as the source buffer and returns
// whatever that function returns.
//
// TODO: get dev_global_ptr from autodiscovery instead later, and implement
// the device-to-host direction.
ACL_EXPORT
CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadGlobalVariableINTEL(
    cl_command_queue command_queue, cl_program program, const char *name,
    cl_bool blocking_read, size_t size, size_t offset, void *ptr,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *event) {
  return clEnqueueWriteGlobalVariableINTEL(
      command_queue, program, name, blocking_read, size, offset, ptr,
      num_events_in_wait_list, event_wait_list, event);
}
423+
424+
// Mock implementation of the cl_intel_global_variable_access write entry
// point.
//
// How the mock works:
//   1. `name` is used to create a kernel from `program`. That kernel is given
//      three arguments: (0) a source device pointer, (1) the destination
//      device-global address, (2) the byte count as a size_t.
//   2. `size` bytes from `ptr` are staged into a temporary device USM
//      allocation with a blocking copy.
//   3. The kernel is enqueued as a task on `command_queue`, honouring the
//      caller's wait list and returning its event through `event`.
//   4. The staging allocation and the kernel object are released before
//      returning, on success and on every error path.
//
// `offset` is a byte offset from the start of the device global.
//
// Returns CL_SUCCESS, CL_INVALID_COMMAND_QUEUE for a NULL queue, or the first
// error reported by any of the OpenCL calls made along the way.
//
// TODO: get dev_global_address from autodiscovery instead of the hard-coded
// placeholder below, e.g. from
// kernel->dev_bin->get_devdef().autodiscovery_def.
ACL_EXPORT
CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteGlobalVariableINTEL(
    cl_command_queue command_queue, cl_program program, const char *name,
    cl_bool blocking_write, size_t size, size_t offset, const void *ptr,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *event) {
  // All locals are declared up front so `goto cleanup` never jumps over an
  // initialization.
  cl_int status = CL_SUCCESS;
  cl_int cleanup_status = CL_SUCCESS;
  cl_kernel kernel = NULL;
  void *src_dev_ptr = NULL;
  void *dev_global_ptr = NULL;
  cl_event local_event = NULL;
  cl_event *completion_event = event;
  // Placeholder device address of the global variable (see TODO above).
  const uintptr_t dev_global_address = 0x4000000;

  // The queue is dereferenced below, so reject NULL before touching it.
  if (command_queue == NULL) {
    return CL_INVALID_COMMAND_QUEUE;
  }

  // A blocking call needs an event to wait on even when the caller did not
  // ask for one; use a private event in that case.
  if (blocking_write && event == NULL) {
    completion_event = &local_event;
  }

  kernel = clCreateKernelIntelFPGA(program, name, &status);
  if (status != CL_SUCCESS) {
    goto cleanup;
  }

  // The kernel argument must be a device USM pointer, so the caller's data
  // is first staged into a temporary device allocation.
  src_dev_ptr = clDeviceMemAllocINTEL(command_queue->context,
                                      command_queue->device, NULL, size, 1,
                                      &status);
  if (status != CL_SUCCESS) {
    goto cleanup;
  }

  // This copy has to be blocking: the staging buffer must hold the data
  // before the kernel that consumes it is enqueued.
  status = clEnqueueMemcpyINTEL(command_queue, CL_TRUE, src_dev_ptr, ptr, size,
                                0, NULL, NULL);
  if (status != CL_SUCCESS) {
    goto cleanup;
  }

  status = clSetKernelArgMemPointerINTEL(kernel, 0, src_dev_ptr);
  if (status != CL_SUCCESS) {
    goto cleanup;
  }

  // The device-global address is not a USM allocation known to the context,
  // so the regular clSetKernelArgMemPointerINTEL validity checks would reject
  // it; use the unchecked setter instead.
  // `offset` is in bytes and the address is a byte address, so it is added
  // directly without any scaling.
  dev_global_ptr = (void *)(dev_global_address + offset);
  status = set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr);
  if (status != CL_SUCCESS) {
    goto cleanup;
  }

  status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size));
  if (status != CL_SUCCESS) {
    goto cleanup;
  }

  status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list,
                         event_wait_list, completion_event);
  if (status != CL_SUCCESS) {
    goto cleanup;
  }

  acl_lock();
  // If nothing's blocking, then complete right away
  acl_idle_update(command_queue->context);
  acl_unlock();

  if (blocking_write) {
    // Any failure of the wait is reported to the caller; cleanup still runs.
    status = clWaitForEvents(1, completion_event);
  }

cleanup:
  // Keep the first error: a cleanup failure is only reported when everything
  // before it succeeded.
  if (local_event != NULL) {
    clReleaseEvent(local_event);
  }
  if (src_dev_ptr != NULL) {
    // NOTE(review): for a non-blocking call the staging buffer is freed right
    // after the kernel is enqueued (same as before this cleanup refactor).
    // Confirm that clMemFreeINTEL in this runtime defers or blocks until
    // commands using the pointer have finished.
    cleanup_status = clMemFreeINTEL(command_queue->context, src_dev_ptr);
    if (status == CL_SUCCESS) {
      status = cleanup_status;
    }
  }
  if (kernel != NULL) {
    cleanup_status = clReleaseKernel(kernel);
    if (status == CL_SUCCESS) {
      status = cleanup_status;
    }
  }

  return status;
}
521+
408522
ACL_EXPORT
409523
CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL(
410524
cl_context context, const cl_mem_properties_intel *properties,

test/acl_globals_test.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,14 @@ static acl_kernel_interface_t acltest_kernels[] = {
118118
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 8},
119119
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 16},
120120
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1024},
121+
}},
122+
{// interface
123+
"kernel15_dev_global",
124+
{
125+
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1},
126+
{ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1},
127+
{ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0},
128+
// {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0},
121129
}}};
122130

123131
template <typename T, std::size_t N>
@@ -191,6 +199,20 @@ static std::vector<acl_accel_def_t> acltest_complex_system_device0_accel = {
191199
{},
192200
{32768, 0, 0},
193201
1},
202+
{14,
203+
ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]),
204+
acltest_kernels[14],
205+
acltest_laspace_info,
206+
{0, 0, 0},
207+
0,
208+
0,
209+
1,
210+
0,
211+
32768,
212+
3,
213+
{},
214+
{32768, 0, 0},
215+
1},
194216
{1,
195217
ACL_RANGE_FROM_ARRAY(acltest_devicelocal[1]),
196218
acltest_kernels[1],

test/acl_program_test.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -606,25 +606,25 @@ MT_TEST(acl_program, program_info) {
606606
// built stat. to success even before calling clbuildprogram
607607
CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS,
608608
sizeof(size_t), &num_kernels, 0));
609-
CHECK_EQUAL(14, num_kernels);
609+
CHECK_EQUAL(15, num_kernels);
610610

611611
// This won't happen if program is built with binary since we set the program
612612
// built stat. to success even before calling clbuildprogram
613613
CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0,
614614
NULL, &size_ret));
615-
CHECK_EQUAL(321, size_ret);
615+
CHECK_EQUAL(341, size_ret);
616616

617617
CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0));
618618

619619
// after building the program
620620
CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS,
621621
sizeof(size_t), &num_kernels, 0));
622-
CHECK_EQUAL(14, num_kernels);
622+
CHECK_EQUAL(15, num_kernels);
623623

624624
CHECK_EQUAL(CL_SUCCESS,
625625
clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t),
626626
&num_kernels, &size_ret));
627-
CHECK_EQUAL(14, num_kernels);
627+
CHECK_EQUAL(15, num_kernels);
628628
CHECK_EQUAL(sizeof(size_t), size_ret);
629629

630630
CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, 0,
@@ -633,19 +633,21 @@ MT_TEST(acl_program, program_info) {
633633

634634
CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0,
635635
NULL, &size_ret));
636-
CHECK_EQUAL(321, size_ret);
636+
CHECK_EQUAL(341, size_ret);
637+
// CHECK_EQUAL(321, size_ret);
637638

638639
names[size_ret] = 100; // making sure extra bytes of memory are not affected.
639640
CHECK_EQUAL(CL_SUCCESS,
640641
clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES,
641642
2000 * sizeof(char), names, &size_ret));
642-
CHECK_EQUAL(321, size_ret); // only one kernel named: "foo"
643+
CHECK_EQUAL(341, size_ret); // only one kernel named: "foo"
643644
CHECK_EQUAL(100, names[size_ret]);
644645
CHECK_EQUAL(0, strcmp("kernel0_copy_vecin_vecout;"
645646
"kernel11_task_double;"
646647
"kernel12_task_double;"
647648
"kernel13_multi_vec_lane;"
648649
"kernel14_svm_arg_alignment;"
650+
"kernel15_dev_global;"
649651
"kernel1_vecadd_vecin_vecin_vecout;"
650652
"kernel2_vecscale_vecin_scalar_vecout;"
651653
"kernel3_locals;"

0 commit comments

Comments
 (0)