Skip to content

Implement runtime support for device global with init_mode reprogram #161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion include/acl.h
Original file line number Diff line number Diff line change
Expand Up @@ -498,10 +498,39 @@ typedef class acl_device_program_info_t *acl_device_program_info;
*/
#define ACL_MEM_CAPABILITY_P2P (1 << 3)

// Host access modes of a device global. The numeric values must match the
// SPIRV spec for device global in
// https://github.com/intel/llvm/blob/44c6437684d64aba82d5a3de0e4bbe21d2b1f7ce/sycl/doc/design/spirv-extensions/SPV_INTEL_global_variable_decorations.asciidoc
// and are therefore written out explicitly.
// ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT is not an access mode itself: it
// is used for validation in autodiscovery string parsing and should remain
// the last constant in the enum.
typedef enum {
  ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_ONLY = 0,
  ACL_DEVICE_GLOBAL_HOST_ACCESS_WRITE_ONLY = 1,
  ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE = 2,
  ACL_DEVICE_GLOBAL_HOST_ACCESS_NONE = 3,

  // Number of valid access modes; keep last.
  ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT
} acl_device_global_host_access_t;

// Init modes of a device global. As with acl_device_global_host_access_t,
// the numeric values need to match the SPIRV spec for device global (see
// the link given for that enum), so they are written out explicitly.
// ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT is not an init mode itself: it is
// used for validation in autodiscovery string parsing and should remain the
// last constant in the enum.
typedef enum {
  ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM = 0,
  ACL_DEVICE_GLOBAL_INIT_MODE_RESET = 1,

  // Number of valid init modes; keep last.
  ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT
} acl_device_global_init_mode_t;

// Definition of a device global, as described by one device global entry of
// the autodiscovery string.
// The struct is kept a plain aggregate: the autodiscovery parser fills it in
// with brace initialization in member order (address, size, host_access,
// init_mode, implement_in_csr), so the order of the members must not change.
struct acl_device_global_mem_def_t {
  // Address of the device global. 64-bit wide; the parser reads it with
  // the 64-bit integer reader.
  uint64_t address;
  // Size of the device global (unit not stated here — presumably bytes,
  // TODO confirm).
  uint32_t size;
  // Host access mode; one of the acl_device_global_host_access_t values
  // other than the *_TYPE_COUNT sentinel.
  acl_device_global_host_access_t host_access;
  // Init mode; one of the acl_device_global_init_mode_t values other than
  // the *_TYPE_COUNT sentinel.
  acl_device_global_init_mode_t init_mode;
  // Flag parsed from the "implement_in_csr" autodiscovery field.
  bool implement_in_csr;
};

// Part of acl_device_def_t where members are populated from the information
Expand Down
7 changes: 7 additions & 0 deletions include/acl_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ void acl_receive_kernel_update(int activation_id, cl_int status);
// safe to submit a kernel with subbuffers to the device_op_queue
int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);

// Checks if the program currently loaded on the passed-in device contains
// any device globals with reprogram init mode
// (ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM).
//
// The check is made against the device global definitions stored in the
// device's autodiscovery definition.
//
// When a kernel is submitted for the first time and this function returns
// true, a force reprogram will be scheduled even when the kernel binary hash
// matches the hash of the currently loaded program.
//
// device: the device whose device global definitions are inspected.
// Returns true if at least one such device global exists, false otherwise.
bool acl_device_has_reprogram_device_globals(cl_device_id device);

#if defined(__cplusplus)
} /* extern "C" */
#endif
Expand Down
59 changes: 55 additions & 4 deletions src/acl_auto_configure.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,28 @@ static bool read_uint32_counters(const std::string &str,
return true;
}

// Reads the next word in str and converts it into an unsigned 64-bit
// fixed-length integer. The conversion uses std::stoull, so compilation
// fails (static_assert) on platforms where unsigned long long is not
// 64 bits wide.
// The word must be a plain decimal number. A leading sign or trailing
// non-numeric characters are rejected explicitly, because std::stoull
// would otherwise accept them silently ("-1" wraps around to the largest
// 64-bit value and "12abc" yields 12).
// Returns true if a valid integer was read or false if an error occurred;
// val is left unmodified on failure.
// pos is updated to the position immediately following the parsed word
// even if an error occurs, and counters is passed to
// decrement_section_counters exactly once per call.
static bool read_uint64_counters(const std::string &str,
                                 std::string::size_type &pos, uint64_t &val,
                                 std::vector<int> &counters) noexcept {
  static_assert(sizeof(uint64_t) == sizeof(unsigned long long),
                "std::stoull must yield a 64-bit value");

  std::string word;
  pos = read_word(str, pos, word);
  decrement_section_counters(counters);

  // Only accept words that start with a digit: this rules out the empty
  // word as well as the '+'/'-' signs and leading whitespace std::stoull
  // would tolerate.
  if (word.empty() || word.front() < '0' || word.front() > '9') {
    return false;
  }

  try {
    std::size_t consumed = 0;
    const unsigned long long parsed = std::stoull(word, &consumed);
    // Reject partially parsed words such as "12abc".
    if (consumed != word.size()) {
      return false;
    }
    val = static_cast<uint64_t>(parsed);
  } catch (const std::exception &) {
    // std::invalid_argument or std::out_of_range from std::stoull.
    return false;
  }
  return true;
}

// Reads the next word in str and converts it into an unsigned.
// Returns true if a valid integer was read or false if an error occurred.
// pos is updated to the position immediately following the parsed word
Expand Down Expand Up @@ -470,6 +492,9 @@ static bool read_device_global_mem_defs(
total_fields_device_global, counters);
}

// Clean up any residual information first
device_global_mem_defs.clear();

for (auto i = 0U; result && (i < num_device_global); i++) {
counters.emplace_back(total_fields_device_global);

Expand All @@ -481,10 +506,10 @@ static bool read_device_global_mem_defs(
}

// read device global address
uint32_t dev_global_addr = 0; // Default
uint64_t dev_global_addr = 0; // Default
if (result && counters.back() > 0) {
result =
read_uint32_counters(config_str, curr_pos, dev_global_addr, counters);
read_uint64_counters(config_str, curr_pos, dev_global_addr, counters);
}
// read device global address size
uint32_t dev_global_size = 0; // Default
Expand All @@ -493,8 +518,34 @@ static bool read_device_global_mem_defs(
read_uint32_counters(config_str, curr_pos, dev_global_size, counters);
}

acl_device_global_mem_def_t dev_global_def = {dev_global_addr,
dev_global_size};
// read device global properties
auto host_access =
static_cast<unsigned>(ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE);
if (result && counters.back() > 0) {
result = read_uint_counters(config_str, curr_pos, host_access, counters);
if (host_access >=
static_cast<unsigned>(ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT))
result = false;
}
auto init_mode =
static_cast<unsigned>(ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM);
if (result && counters.back() > 0) {
result = read_uint_counters(config_str, curr_pos, init_mode, counters);
if (init_mode >=
static_cast<unsigned>(ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT))
result = false;
}
bool implement_in_csr = false;
if (result && counters.back() > 0) {
result =
read_bool_counters(config_str, curr_pos, implement_in_csr, counters);
}

acl_device_global_mem_def_t dev_global_def = {
dev_global_addr, dev_global_size,
static_cast<acl_device_global_host_access_t>(host_access),
static_cast<acl_device_global_init_mode_t>(init_mode),
implement_in_csr};
bool ok =
device_global_mem_defs.insert({device_global_name, dev_global_def})
.second;
Expand Down
8 changes: 7 additions & 1 deletion src/acl_device_binary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <acl_device_binary.h>
#include <acl_globals.h>
#include <acl_hal.h>
#include <acl_kernel.h>
#include <acl_support.h>
#include <acl_util.h>

Expand Down Expand Up @@ -262,8 +263,13 @@ cl_int acl_device_binary_t::load_binary_pkg(int validate_compile_options,
AND_CHECK(acl_pkg_read_section(pkg, ".acl.rand_hash", pkg_rand_hash.data(),
data_len + 1),
CL_INVALID_BINARY, FAILREAD_MSG " (rand_hash)");
// Note that we use dev_prog->device when checking for device globals.
// Having the same binary hash suggests that the aocx currently on the
// device is the same as the aocx used to create the program, so we can peek
// at the device global setup now instead of later, after
// acl_load_device_def_from_str.
if (dev_prog->device->def.autodiscovery_def.binary_rand_hash ==
std::string(pkg_rand_hash.data())) {
std::string(pkg_rand_hash.data()) &&
(!acl_device_has_reprogram_device_globals(dev_prog->device))) {
dev_prog->device->last_bin = this;
dev_prog->device->loaded_bin = this;
}
Expand Down
23 changes: 19 additions & 4 deletions src/acl_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3011,6 +3011,18 @@ int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration) {
return 0;
}

// Returns true if at least one device global listed in the autodiscovery
// definition of the given device has its init mode set to
// ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM, and false otherwise (including
// when the device has no device globals at all).
bool acl_device_has_reprogram_device_globals(cl_device_id device) {
  for (const auto &entry :
       device->def.autodiscovery_def.device_global_mem_defs) {
    // entry.first is the device global name, entry.second its definition.
    if (entry.second.init_mode == ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM) {
      return true;
    }
  }
  return false;
}

int acl_submit_kernel_device_op(cl_event event) {
// No user-level scheduling blocks this kernel enqueue from running.
// So submit it to the device op queue.
Expand Down Expand Up @@ -3049,15 +3061,18 @@ int acl_submit_kernel_device_op(cl_event event) {
need_reprogram =
device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
} else {
// compare hash of program that is on the device and the program required by
// kernel
} else if (!acl_device_has_reprogram_device_globals(device)) {
// last_bin being null suggests that no reprogram has been scheduled at
// this point, so if the target device contains a device global with
// reprogram init mode we force a reprogram; otherwise we compare the
// random hash of the program that is on the device with that of the
// program required by the kernel.
need_reprogram = device->def.autodiscovery_def.binary_rand_hash !=
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
}

// Always reprogram in split kernel mode. This is a temporary workaround.
if (event->context->split_kernel) {
// Always reprogram in split kernel mode. This is a temporary workaround.
need_reprogram = true;
}

Expand Down
47 changes: 30 additions & 17 deletions test/acl_auto_configure_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ TEST(auto_configure, simple) {
#define VERSIONIDSTRINGIFY(x) #x
#define VERSIONIDTOSTR(x) VERSIONIDSTRINGIFY(x)
#define DEVICE_FIELDS " 23"
#define DEVICE_FIELDS_DEV_GLOBAL " 30"
#define DEVICE_FIELDS_DEV_GLOBAL " 36"
#define DEVICE_FIELDS_OLD " 18"
#define BOARDNAME "de4_gen2x4_swdimm"
#define BOARDNAME2 "pcie385_a7"
Expand Down Expand Up @@ -99,10 +99,11 @@ TEST(auto_configure, simple) {

// Device global autodiscovery entries
#define NUM_DEV_GLOBAL " 2"
#define NUM_DEV_GLOBAL_FIELD " 3" // containing dev_globa_name, address, size
#define DEV_GLOBAL_1 \
" kernel15_dev_global 4096 2048" // in format of dev_globa_name, address, size
#define DEV_GLOBAL_2 " kernel15_dev_global2 2048 1024"
#define NUM_DEV_GLOBAL_FIELD \
" 6" // contains dev_globa_name, address, size, host_access, init_mode,
// implement_in_csr with the above format
#define DEV_GLOBAL_1 " kernel15_dev_global 4096 2048 3 1 0"
#define DEV_GLOBAL_2 " kernel15_dev_global2 2048 1024 1 0 1"

int parsed;
std::string err_str;
Expand Down Expand Up @@ -283,8 +284,18 @@ TEST(auto_configure, simple) {
m_device_def.autodiscovery_def.device_global_mem_defs.end());
CHECK_EQUAL(4096, kernel15_dev_global->second.address);
CHECK_EQUAL(2048, kernel15_dev_global->second.size);
CHECK_EQUAL(ACL_DEVICE_GLOBAL_HOST_ACCESS_NONE,
kernel15_dev_global->second.host_access);
CHECK_EQUAL(ACL_DEVICE_GLOBAL_INIT_MODE_RESET,
kernel15_dev_global->second.init_mode);
CHECK_EQUAL(false, kernel15_dev_global->second.implement_in_csr);
CHECK_EQUAL(2048, kernel15_dev_global2->second.address);
CHECK_EQUAL(1024, kernel15_dev_global2->second.size);
CHECK_EQUAL(ACL_DEVICE_GLOBAL_HOST_ACCESS_WRITE_ONLY,
kernel15_dev_global2->second.host_access);
CHECK_EQUAL(ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
kernel15_dev_global2->second.init_mode);
CHECK_EQUAL(true, kernel15_dev_global2->second.implement_in_csr);

// Check a second parsing.
// It should allocate a new string for the name.
Expand Down Expand Up @@ -482,11 +493,13 @@ TEST(auto_configure, many_ok_forward_compatibility) {
// sections and subsections to check forward compatibility

std::string str(VERSIONIDTOSTR(
ACL_AUTO_CONFIGURE_VERSIONID) " 29 "
ACL_AUTO_CONFIGURE_VERSIONID) " 49 "
"sample40byterandomhash000000000000000000 "
"a10gx 0 1 15 DDR 2 1 6 0 2147483648 100 "
"100 100 100 200 200 200 200 0 0 0 0 2 "
"1 name1 name2 0 0 47 "
"a10gx 0 1 17 DDR 2 1 6 0 2147483648 100 "
"100 100 100 0 - 0 200 200 200 200 0 0 0 "
"2 9 ms_dev_global1 2048 1024 3 0 0 300 "
"300 300 ms_dev_global2 4096 1024 1 1 1 "
"300 300 300 0 0 400 400 47 "
"40 external_sort_stage_0 0 128 1 0 0 1 0 "
"1 0 1 10 0 0 4 1 0 0 0 500 500 500 0 0 "
"0 0 1 1 1 3 1 1 1 3 1 0 0 800 800 800 "
Expand Down Expand Up @@ -677,10 +690,10 @@ TEST(auto_configure, many_ok_forward_compatibility) {

TEST(auto_configure, many_limit_check) {
std::string str(VERSIONIDTOSTR(
ACL_AUTO_CONFIGURE_VERSIONID) " 15 "
ACL_AUTO_CONFIGURE_VERSIONID) " 19 "
"sample40byterandomhash000000000000000000 "
"a10gx 0 1 7 DDR 2 1 2 0 2147483648 0 0 0 "
"0 75 "
"a10gx 0 1 9 DDR 2 1 2 0 2147483648 0 - 0 "
"0 0 0 0 0 75 " // 75 kernels
"31 external_sort_stage_0 0 128 1 0 0 1 0 "
"1 0 1 6 0 0 4 1 0 0 0 0 0 0 1 1 1 3 1 1 1 "
"3 1 "
Expand Down Expand Up @@ -1193,14 +1206,14 @@ TEST(auto_configure, kernel_arg_info) {

TEST(auto_configure, hostpipe) {
std::string str(VERSIONIDTOSTR(
ACL_AUTO_CONFIGURE_VERSIONID) " 46 "
ACL_AUTO_CONFIGURE_VERSIONID) " 49 "
"sample40byterandomhash000000000000000000 "
"a10gx_hostpipe 0 1 15 DDR 2 1 6 0 "
"2147483648 0 100 100 100 100 200 200 200 "
"200 "
"2 9 host_to_dev 1 0 32 32768 300 300 300 "
"300 dev_to_host 0 1 32 32768 300 300 300 "
"300 400 1 3 name3 400 0 "
"300 400 1 6 dev_global_3 1024 2048 0 0 0 "
"1 29 foo 0 128 1 0 0 1 0 1 0 0 0 0 0 0 1 "
"1 1 3 1 1 1 3 1 0 0 800 800 800 900 "
"900"
Expand Down Expand Up @@ -1230,10 +1243,10 @@ TEST(auto_configure, hostpipe) {

TEST(auto_configure, streaming) {
const std::string config_str{
"23 26 " RANDOM_HASH
"23 29 " RANDOM_HASH
" pac_a10 0 1 13 DDR 2 2 24 1 2 0 4294967296 4294967296 8589934592 0 - 0 "
"0 0 0 1 3 device_global_name 256 128 1 105 _ZTS3CRCILi0EE 0 256 1 0 0 1 "
"0 1 0 9 8 0 0 8 1 0 0 1 k0_ZTS3CRCILi0EE_arg0 8 2 1 8 1024 0 3 1 "
"0 0 0 1 6 device_global_name 256 128 0 0 0 1 105 _ZTS3CRCILi0EE 0 256 1 "
"0 0 1 0 1 0 9 8 0 0 8 1 0 0 1 k0_ZTS3CRCILi0EE_arg0 8 2 1 8 1024 0 3 1 "
"k0_ZTS3CRCILi0EE_arg1 8 0 0 8 1 0 0 1 k0_ZTS3CRCILi0EE_arg2 7 0 0 8 1 0 "
"0 0 7 0 0 8 1 0 0 0 7 2 1 8 1024 0 2 0 7 0 0 8 1 0 0 0 7 0 0 8 1 0 0 0 "
"7 0 0 8 1 0 0 0 0 0 1 2 64 4096 1 1 1 3 1 1 1 3 1 0 1 "
Expand Down
Loading