Skip to content

Device global copy kernel implementation #269

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 19 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions include/CL/cl_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -2384,6 +2384,66 @@ clCreateBufferWithPropertiesINTEL_fn)(
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;

/**********************************
* cl_intel_global_variable_access *
***********************************/

/* Command identifiers introduced by cl_intel_global_variable_access for the
 * read and write global-variable commands.
 * NOTE(review): presumably these are the command-type values reported for
 * events produced by the two enqueue functions declared below -- confirm
 * against the extension specification. */
#define CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL 0x418E
#define CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL 0x418F

/* Entry point of cl_intel_global_variable_access for reading a global
 * variable, identified by `name`, that belongs to `program`.
 * `ptr` is non-const here, in contrast to the write variant.
 * NOTE(review): presumably `size`/`offset` select the byte range of the
 * variable, `ptr` is the host destination, and the blocking flag and event
 * arguments follow the usual clEnqueue* conventions -- confirm against the
 * extension specification. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadGlobalVariableINTEL(
cl_command_queue command_queue,
cl_program program,
const char* name,
cl_bool blocking_read,
size_t size,
size_t offset,
void* ptr,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;


/* Function-pointer type for clEnqueueReadGlobalVariableINTEL, for use when
 * the entry point is obtained through an extension-function lookup.
 * The parameter list must stay identical to the function declaration; in
 * particular `ptr` is a non-const `void*`, matching that declaration, so the
 * real function is assignable to this type without a cast. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueReadGlobalVariableINTEL_fn)(
    cl_command_queue command_queue,
    cl_program program,
    const char* name,
    cl_bool blocking_read,
    size_t size,
    size_t offset,
    void* ptr,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;

/* Entry point of cl_intel_global_variable_access for writing a global
 * variable, identified by `name`, that belongs to `program`.
 * `ptr` is `const void*` here, in contrast to the read variant.
 * NOTE(review): presumably `size`/`offset` select the byte range of the
 * variable, `ptr` is the host source, and the blocking flag and event
 * arguments follow the usual clEnqueue* conventions -- confirm against the
 * extension specification. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteGlobalVariableINTEL(
cl_command_queue command_queue,
cl_program program,
const char* name,
cl_bool blocking_write,
size_t size,
size_t offset,
const void* ptr,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;

/* Function-pointer type for clEnqueueWriteGlobalVariableINTEL, for use when
 * the entry point is obtained through an extension-function lookup.
 * The parameter list must stay identical to the function declaration: the
 * blocking flag is `blocking_write` and `ptr` is `const void*`, matching that
 * declaration, so the real function is assignable to this type without a
 * cast. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueWriteGlobalVariableINTEL_fn)(
    cl_command_queue command_queue,
    cl_program program,
    const char* name,
    cl_bool blocking_write,
    size_t size,
    size_t offset,
    const void* ptr,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;

/******************************************
* cl_intel_mem_channel_property extension *
*******************************************/
Expand Down
1 change: 1 addition & 0 deletions include/acl.h
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ typedef struct acl_device_def_autodiscovery_t {
std::vector<acl_hostpipe_info_t> acl_hostpipe_info;

// Device global definition.
unsigned int num_device_global;
std::unordered_map<std::string, acl_device_global_mem_def_t>
device_global_mem_defs;
bool cra_ring_root_exist =
Expand Down
4 changes: 4 additions & 0 deletions include/acl_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);
// currently loaded program.
bool acl_device_has_reprogram_device_globals(cl_device_id device);

// Store `arg_value` as the pointer argument at `arg_index` of `kernel`
// without checking that the pointer belongs to the kernel's context.
// Returns CL_INVALID_KERNEL for an invalid kernel, CL_INVALID_ARG_INDEX when
// `arg_index` is out of range, and CL_SUCCESS otherwise.
cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
cl_uint arg_index,
void *arg_value);

#if defined(__cplusplus)
} /* extern "C" */
#endif
Expand Down
8 changes: 8 additions & 0 deletions include/acl_mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,14 @@ cl_bool acl_is_sub_or_parent_buffer(cl_mem mem);
void CL_CALLBACK acl_free_allocation_after_event_completion(
cl_event event, cl_int event_command_exec_status, void *callback_data);

// Callback taking an event, that event's command execution status and an
// opaque user pointer.
// NOTE(review): the name suggests it releases resources used by a device
// global read/write once the associated event completes; what
// `callback_data` points to is not visible here -- confirm against the
// definition.
void CL_CALLBACK acl_dev_global_cleanup(cl_event event,
cl_int event_command_exec_status,
void *callback_data);

// NOTE(review): presumably looks up the device global named
// `dev_global_name` for the program/device associated with `kernel` and
// writes its address to `*ret_addr`, returning an OpenCL status code --
// confirm against the definition. Note that the address is returned as an
// `unsigned int`.
cl_int acl_extract_device_global_address(cl_kernel kernel,
const char *dev_global_name,
unsigned int *ret_addr);

#ifdef __GNUC__
#pragma GCC visibility pop
#endif
Expand Down
2 changes: 2 additions & 0 deletions src/acl_icd_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ clGetExtensionFunctionAddressIntelFPGA(const char *func_name) {
ADDFUNCTIONLOOKUP(clResetKernelsIntelFPGA);
ADDFUNCTIONLOOKUP(clSetBoardLibraryIntelFPGA);
ADDFUNCTIONLOOKUP(clCreateBufferWithPropertiesINTEL);
ADDFUNCTIONLOOKUP(clEnqueueReadGlobalVariableINTEL);
ADDFUNCTIONLOOKUP(clEnqueueWriteGlobalVariableINTEL);

// USM APIs are not currently supported on 32bit devices
#ifndef __arm__
Expand Down
58 changes: 54 additions & 4 deletions src/acl_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,56 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer(
return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value);
}

/**
 * Store an arbitrary pointer value as a kernel argument.
 *
 * The caller is trusted to pass a valid device address (or device global
 * address) that the kernel can use to reach the right address space.
 *
 * This mirrors `clSetKernelArgMemPointerINTEL` with the pointer validity
 * checks removed: the provided pointer is not necessarily a USM pointer, so
 * it may not belong to the context (which is what
 * `clSetKernelArgMemPointerINTEL` verifies).
 *
 * @param kernel the kernel that accepts the pointer argument
 * @param arg_index index of the kernel argument that receives the value
 * @param arg_value the pointer into the desired address space
 * @return CL_INVALID_KERNEL if `kernel` is not valid, CL_INVALID_ARG_INDEX if
 *         `arg_index` is out of range, CL_SUCCESS otherwise.
 */
cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
                                                 cl_uint arg_index,
                                                 void *arg_value) {
  std::scoped_lock lock{acl_mutex_wrapper};
  if (!acl_kernel_is_valid(kernel)) {
    return (CL_INVALID_KERNEL);
  }

  cl_context context = kernel->program->context;

  if (arg_index >= kernel->accel_def->iface.args.size()) {
    ERR_RET(CL_INVALID_ARG_INDEX, context, "Argument index is too large");
  }

  // Determine where to write the value.
  size_t start_idx = 0;
  size_t iface_arg_size = 0;
  l_get_arg_offset_and_size(kernel, arg_index, &start_idx, &iface_arg_size);

  // Copy the pointer value itself (not the pointee) into the argument buffer.
  // NOTE(review): the source object is only sizeof(void *) bytes, yet
  // iface_arg_size is used as the copy length -- confirm the interface
  // argument size can never exceed the host pointer size.
  safe_memcpy(&(kernel->arg_value[start_idx]), &arg_value, iface_arg_size,
              kernel->arg_value_size - start_idx, iface_arg_size);
  kernel->arg_is_svm[arg_index] = CL_FALSE;
  kernel->arg_is_ptr[arg_index] = CL_TRUE;

  kernel->arg_defined[arg_index] = 1;

  // Grow ptr_arg_vector by repeated doubling until arg_index is a valid slot.
  // An empty vector is treated as size 1 before doubling; doubling zero would
  // otherwise never make progress and the loop would spin forever.
  if (kernel->ptr_arg_vector.size() <= arg_index) {
    auto grown_size = kernel->ptr_arg_vector.size();
    if (grown_size == 0) {
      grown_size = 1;
    }
    while (grown_size <= arg_index) {
      grown_size *= 2;
    }
    kernel->ptr_arg_vector.resize(grown_size);
  }
  kernel->ptr_arg_vector[arg_index] = arg_value;

  return (CL_SUCCESS);
}

ACL_EXPORT
CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL(
cl_kernel kernel, cl_uint arg_index, const void *arg_value) {
Expand Down Expand Up @@ -3193,10 +3243,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status) {
acl_device_op_queue_t *doq = &(acl_platform.device_op_queue);

// This function can potentially be called by a HAL that does not use the
// ACL global lock, so we need to use acl_lock() instead of
// acl_assert_locked(). However, the MMD HAL calls this function from a unix
// signal handler, which can't lock mutexes, so we don't lock in that case.
// All functions called from this one therefore have to use
// ACL global lock, so we need to use std::scoped_lock lock{acl_mutex_wrapper}
// instead of acl_assert_locked(). However, the MMD HAL calls this function
// from a unix signal handler, which can't lock mutexes, so we don't lock in
// that case. All functions called from this one therefore have to use
// acl_assert_locked_or_sig() instead of just acl_assert_locked().
std::unique_lock lock{acl_mutex_wrapper, std::defer_lock};
if (!acl_is_inside_sig()) {
Expand Down
Loading