Skip to content

Device global copy kernel implementation #65

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 12 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions include/CL/cl_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -2384,6 +2384,66 @@ clCreateBufferWithPropertiesINTEL_fn)(
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;

/**********************************
* cl_intel_global_variable_access *
***********************************/

/* Command identifiers introduced by the cl_intel_global_variable_access
 * extension, one for the read enqueue call and one for the write enqueue call.
 * NOTE(review): presumably these are the values reported for
 * CL_EVENT_COMMAND_TYPE on events returned by those calls -- confirm against
 * the extension specification. */
#define CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL 0x418E
#define CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL 0x418F

/* Read entry point of the cl_intel_global_variable_access extension.
 *
 * NOTE(review): this header only declares the function; the parameter
 * semantics below are inferred from the names and must be confirmed against
 * the extension specification.
 *   command_queue            queue the read command is enqueued on
 *   program                  program that owns the global variable
 *   name                     name identifying the global variable
 *   blocking_read            presumably CL_TRUE to wait for completion
 *   size, offset             presumably byte count and byte offset within
 *                            the variable
 *   ptr                      host destination buffer (writable, hence
 *                            non-const)
 *   num_events_in_wait_list,
 *   event_wait_list, event   usual OpenCL event dependency/result arguments
 */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadGlobalVariableINTEL(
cl_command_queue command_queue,
cl_program program,
const char* name,
cl_bool blocking_read,
size_t size,
size_t offset,
void* ptr,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;


/* Function-pointer type for clEnqueueReadGlobalVariableINTEL, intended for use
 * with pointers obtained through the extension function lookup.
 *
 * The parameter list must mirror the function declaration exactly; in
 * particular `ptr` is a writable destination buffer (`void*`, not
 * `const void*`), otherwise the function cannot be assigned to this type
 * without a cast. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueReadGlobalVariableINTEL_fn)(
    cl_command_queue command_queue,
    cl_program program,
    const char* name,
    cl_bool blocking_read,
    size_t size,
    size_t offset,
    void* ptr,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;

/* Write entry point of the cl_intel_global_variable_access extension.
 *
 * NOTE(review): this header only declares the function; the parameter
 * semantics below are inferred from the names and must be confirmed against
 * the extension specification.
 *   command_queue            queue the write command is enqueued on
 *   program                  program that owns the global variable
 *   name                     name identifying the global variable
 *   blocking_write           presumably CL_TRUE to wait for completion
 *   size, offset             presumably byte count and byte offset within
 *                            the variable
 *   ptr                      host source buffer (read-only, hence const)
 *   num_events_in_wait_list,
 *   event_wait_list, event   usual OpenCL event dependency/result arguments
 */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteGlobalVariableINTEL(
cl_command_queue command_queue,
cl_program program,
const char* name,
cl_bool blocking_write,
size_t size,
size_t offset,
const void* ptr,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;

/* Function-pointer type for clEnqueueWriteGlobalVariableINTEL, intended for
 * use with pointers obtained through the extension function lookup.
 *
 * The parameter list must mirror the function declaration exactly: the
 * blocking flag is `blocking_write` and `ptr` is a read-only source buffer
 * (`const void*`), otherwise the function cannot be assigned to this type
 * without a cast. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueWriteGlobalVariableINTEL_fn)(
    cl_command_queue command_queue,
    cl_program program,
    const char* name,
    cl_bool blocking_write,
    size_t size,
    size_t offset,
    const void* ptr,
    cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list,
    cl_event* event) CL_API_SUFFIX__VERSION_1_0;

/******************************************
* cl_intel_mem_channel_property extension *
*******************************************/
Expand Down
13 changes: 13 additions & 0 deletions include/acl.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <array>
#include <assert.h>
#include <string>
#include <unordered_map>
#include <vector>

#include <CL/cl_ext.h>
Expand Down Expand Up @@ -478,6 +479,12 @@ typedef class acl_device_program_info_t *acl_device_program_info;
*/
#define ACL_MEM_CAPABILITY_P2P (1 << 3)

// Description of a single device global variable.
// Instances are built by acl_load_device_def_from_str from the name, address
// and size fields read out of the autodiscovery string.
typedef struct acl_device_global_mem_def_t {
// Name of the device global as it appears in the autodiscovery string.
std::string name;
// Address field of the device global.
// NOTE(review): presumably a device-side byte address -- confirm.
unsigned int address;
// Size field of the device global.
// NOTE(review): presumably in bytes -- confirm.
unsigned int size;
} acl_device_global_mem_def_t;

// Part of acl_device_def_t where members are populated from the information
// in the autodiscovery string. This will get updated every time the device
// is programmed with a new device binary as the new binary would contain a
Expand All @@ -496,6 +503,12 @@ typedef struct acl_device_def_autodiscovery_t {
std::array<acl_system_global_mem_def_t, ACL_MAX_GLOBAL_MEM> global_mem_defs;

std::vector<acl_hostpipe_info_t> acl_hostpipe_info;

// Device global definitions.
// Number of device globals announced by the autodiscovery string.
unsigned int num_device_global;
// Device global definitions keyed by device global name. A map is used
// instead of a vector so that a definition can be looked up by name.
std::unordered_map<std::string, acl_device_global_mem_def_t>
device_global_mem_defs;
} acl_device_def_autodiscovery_t;

typedef struct acl_device_def_t {
Expand Down
4 changes: 4 additions & 0 deletions include/acl_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status);
// safe to submit a kernel with subbuffers to the device_op_queue
int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);

// Stores the raw pointer `arg_value` as argument `arg_index` of `kernel`
// without verifying that the pointer belongs to the kernel's context.
// Returns CL_INVALID_KERNEL for an invalid kernel, CL_INVALID_ARG_INDEX when
// `arg_index` is out of range, and CL_SUCCESS otherwise.
cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
cl_uint arg_index,
void *arg_value);

#if defined(__cplusplus)
} /* extern "C" */
#endif
Expand Down
8 changes: 8 additions & 0 deletions include/acl_mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,14 @@ cl_bool acl_is_sub_or_parent_buffer(cl_mem mem);
void CL_CALLBACK acl_free_allocation_after_event_completion(
cl_event event, cl_int event_command_exec_status, void *callback_data);

// Event callback related to device global handling; its signature matches the
// other event-completion callback declared in this header.
// NOTE(review): the definition is not visible here -- presumably it releases
// the resources referenced by `callback_data` once `event` reaches the
// reported execution status; confirm against the implementation.
void CL_CALLBACK acl_dev_global_cleanup(cl_event event,
cl_int event_command_exec_status,
void *callback_data);

// Looks up the address of the device global called `dev_global_name` for the
// given kernel and reports it through `ret_addr`.
// NOTE(review): the definition is not visible here -- presumably returns
// CL_SUCCESS on success and an OpenCL error code when the name is unknown;
// confirm against the implementation.
cl_int acl_extract_device_global_address(cl_kernel kernel,
const char *dev_global_name,
unsigned int *ret_addr);

#ifdef __GNUC__
#pragma GCC visibility pop
#endif
Expand Down
56 changes: 56 additions & 0 deletions src/acl_auto_configure.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ static bool read_uint_counters(const std::string &str,
UNREFERENCED_PARAMETER(e);
return false;
}

return true;
}

Expand Down Expand Up @@ -493,6 +494,61 @@ bool acl_load_device_def_from_str(const std::string &config_str,
counters);
}

// Read device global information.
// Fields consumed, in order: the number of device globals, the number of
// fields per device global, and then for every device global its name,
// address and size. Any further per-entry fields are skipped so that newer
// autodiscovery strings can still be parsed.
unsigned int num_device_global = 0;
if (result && counters.back() > 0) {
  result =
      read_uint_counters(config_str, curr_pos, num_device_global, counters);
  devdef.num_device_global = num_device_global;

  // read total number of fields in device global
  int total_fields_device_global = 0;
  if (result) {
    result = read_int_counters(config_str, curr_pos,
                               total_fields_device_global, counters);
  }

  for (auto i = 0U; result && (i < num_device_global);
       i++) { // device_global_memories
    // Each entry gets its own counter so leftover fields can be skipped.
    counters.emplace_back(total_fields_device_global);

    // read device global name
    std::string device_global_name;
    if (result && counters.back() > 0) {
      result = read_string_counters(config_str, curr_pos, device_global_name,
                                    counters);
    }

    // read device global address
    unsigned int dev_global_addr = 0; // Default
    if (result && counters.back() > 0) {
      result =
          read_uint_counters(config_str, curr_pos, dev_global_addr, counters);
    }
    // read device global size
    unsigned int dev_global_size = 0; // Default
    if (result && counters.back() > 0) {
      result =
          read_uint_counters(config_str, curr_pos, dev_global_size, counters);
    }

    // Only record the definition when all of its fields parsed cleanly;
    // otherwise a partially read (possibly empty-named) entry would be
    // stored even though the load is about to fail.
    if (result) {
      acl_device_global_mem_def_t dev_global_def = {
          device_global_name, dev_global_addr, dev_global_size};
      devdef.device_global_mem_defs[device_global_name] = dev_global_def;
    }

    // forward compatibility: bypassing remaining fields at the end of device
    // global
    while (result && counters.size() > 0 &&
           counters.back() > 0) { // total_fields_device_global>0
      std::string tmp;
      result =
          result && read_string_counters(config_str, curr_pos, tmp, counters);
      check_section_counters(counters);
    }
    counters.pop_back(); // removing total_fields_device_global
  } // device_global_memories
}

// forward compatibility: bypassing remaining fields at the end of device
// description section
while (result && counters.size() > 0 &&
Expand Down
2 changes: 2 additions & 0 deletions src/acl_icd_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ clGetExtensionFunctionAddressIntelFPGA(const char *func_name) {
ADDFUNCTIONLOOKUP(clResetKernelsIntelFPGA);
ADDFUNCTIONLOOKUP(clSetBoardLibraryIntelFPGA);
ADDFUNCTIONLOOKUP(clCreateBufferWithPropertiesINTEL);
ADDFUNCTIONLOOKUP(clEnqueueReadGlobalVariableINTEL);
ADDFUNCTIONLOOKUP(clEnqueueWriteGlobalVariableINTEL);

// USM APIs are not currently supported on 32bit devices
#ifndef __arm__
Expand Down
51 changes: 51 additions & 0 deletions src/acl_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,57 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer(
return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value);
}

/**
 * Set an arbitrary pointer value as a kernel argument.
 *
 * The caller is responsible for providing a valid device address (for
 * example the address of a device global) that the kernel can dereference in
 * the intended address space.
 *
 * This mirrors `clSetKernelArgMemPointerINTEL` with the pointer validity
 * checks removed: the provided pointer is not necessarily a USM pointer and
 * therefore may not belong to the context, which is what
 * `clSetKernelArgMemPointerINTEL` verifies.
 *
 * @param kernel the kernel that accepts the pointer argument
 * @param arg_index index of the kernel argument that receives the value
 * @param arg_value pointer value to store as the argument
 * @return CL_INVALID_KERNEL if the kernel is not valid,
 *         CL_INVALID_ARG_INDEX if arg_index is out of range,
 *         CL_SUCCESS otherwise.
 */
cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
                                                 cl_uint arg_index,
                                                 void *arg_value) {
  acl_lock();
  if (!acl_kernel_is_valid(kernel)) {
    UNLOCK_RETURN(CL_INVALID_KERNEL);
  }

  cl_context context = kernel->program->context;

  if (arg_index >= kernel->accel_def->iface.args.size()) {
    UNLOCK_ERR_RET(CL_INVALID_ARG_INDEX, context,
                   "Argument index is too large");
  }

  // Determine where to write the value.
  size_t start_idx = 0;
  size_t iface_arg_size = 0;
  l_get_arg_offset_and_size(kernel, arg_index, &start_idx, &iface_arg_size);
  // The pointer value itself (not the memory it points to) is copied into the
  // kernel argument storage.
  safe_memcpy(&(kernel->arg_value[start_idx]), &arg_value, iface_arg_size,
              kernel->arg_value_size - start_idx, iface_arg_size);
  kernel->arg_is_svm[arg_index] = CL_FALSE;
  kernel->arg_is_ptr[arg_index] = CL_TRUE;

  kernel->arg_defined[arg_index] = 1;

  // Grow the pointer table by doubling until arg_index fits. The target size
  // is computed up front starting from at least 1, because doubling an empty
  // vector in place (0 * 2 == 0) would never terminate.
  size_t new_size = kernel->ptr_arg_vector.size();
  if (new_size <= arg_index) {
    if (new_size == 0) {
      new_size = 1;
    }
    while (new_size <= arg_index) {
      new_size *= 2;
    }
    kernel->ptr_arg_vector.resize(new_size);
  }
  kernel->ptr_arg_vector[arg_index] = arg_value;

  UNLOCK_RETURN(CL_SUCCESS);
}

ACL_EXPORT
CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL(
cl_kernel kernel, cl_uint arg_index, const void *arg_value) {
Expand Down
Loading