Skip to content

Commit 2ad0b64

Browse files
committed
kernel/device_binary: force reprogram if kernel device has device global with init_mode reprogram
There are two places that check whether we can skip the initial reprogram of the device: 1) when attempting an eager device reprogram, and 2) when launching a kernel (for the first time). Changes to acl_device_binary.cpp tackle 1) and changes to acl_kernel.cpp tackle 2).
1 parent 06958e9 commit 2ad0b64

File tree

4 files changed

+313
-18
lines changed

4 files changed

+313
-18
lines changed

include/acl_kernel.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ void acl_receive_kernel_update(int activation_id, cl_int status);
5454
// safe to submit a kernel with subbuffers to the device_op_queue
5555
int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);
5656

57+
// Checks if the program currently loaded on the passed-in device contains
58+
// any device globals with reprogram init mode. When a kernel is submitted
59+
// for the first time and this function returns true, a force reprogram will
60+
// be scheduled even when the kernel binary hash matches the hash of the
61+
// currently loaded program.
62+
bool acl_device_has_reprogram_device_globals(cl_device_id device);
63+
5764
#if defined(__cplusplus)
5865
} /* extern "C" */
5966
#endif

src/acl_device_binary.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <acl_device_binary.h>
1818
#include <acl_globals.h>
1919
#include <acl_hal.h>
20+
#include <acl_kernel.h>
2021
#include <acl_support.h>
2122
#include <acl_util.h>
2223

@@ -262,8 +263,13 @@ cl_int acl_device_binary_t::load_binary_pkg(int validate_compile_options,
262263
AND_CHECK(acl_pkg_read_section(pkg, ".acl.rand_hash", pkg_rand_hash.data(),
263264
data_len + 1),
264265
CL_INVALID_BINARY, FAILREAD_MSG " (rand_hash)");
266+
// Note that we use dev_prog->device when checking for device globals.
267+
// Having the same binary suggests that the aocx currently on the device is
268+
// the same as the aocx used to create program, so we can peek the device
269+
// global setup now instead of later after acl_load_device_def_from_str
265270
if (dev_prog->device->def.autodiscovery_def.binary_rand_hash ==
266-
std::string(pkg_rand_hash.data())) {
271+
std::string(pkg_rand_hash.data()) &&
272+
(!acl_device_has_reprogram_device_globals(dev_prog->device))) {
267273
dev_prog->device->last_bin = this;
268274
dev_prog->device->loaded_bin = this;
269275
}

src/acl_kernel.cpp

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3010,6 +3010,18 @@ int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration) {
30103010
return 0;
30113011
}
30123012

3013+
// Reports whether the program description recorded for `device`
// (device->def.autodiscovery_def) declares at least one device global whose
// init_mode is ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM.
//
// Returns true if such a device global exists, false otherwise (including
// when the device has no device globals at all).
bool acl_device_has_reprogram_device_globals(cl_device_id device) {
  // Bind by const reference: the definitions are only inspected, so copying
  // the whole map on every call would be wasted work.
  const std::unordered_map<std::string, acl_device_global_mem_def_t>
      &device_global_mem_defs =
          device->def.autodiscovery_def.device_global_mem_defs;
  return std::any_of(
      device_global_mem_defs.begin(), device_global_mem_defs.end(),
      [](const auto &name_and_def) {
        return name_and_def.second.init_mode ==
               ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM;
      });
}
3024+
30133025
int acl_submit_kernel_device_op(cl_event event) {
30143026
// No user-level scheduling blocks this kernel enqueue from running.
30153027
// So submit it to the device op queue.
@@ -3042,22 +3054,26 @@ int acl_submit_kernel_device_op(cl_event event) {
30423054
acl_forget_proposed_device_ops(doq);
30433055

30443056
bool need_reprogram = true;
3045-
if (device->last_bin) {
3046-
// compare hash of last program that went through device op queue and the
3047-
// program required by kernel
3048-
need_reprogram =
3049-
device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
3050-
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3051-
} else {
3052-
// compare hash of program that is on the device and the program required by
3053-
// kernel
3054-
need_reprogram = device->def.autodiscovery_def.binary_rand_hash !=
3055-
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3056-
}
3057-
3058-
if (event->context->split_kernel) {
3059-
// Always reprogram in split kernel mode. This is a temporary workaround.
3060-
need_reprogram = true;
3057+
// Force reprogram if split kernel, otherwise check random hash
3058+
if (!(event->context->split_kernel)) {
3059+
if (device->last_bin) {
3060+
// compare hash of last program that went through device op queue and the
3061+
// program required by kernel
3062+
need_reprogram =
3063+
device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
3064+
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3065+
} else {
3066+
// A null last_bin suggests that no reprogram has been scheduled at this
3067+
// point so if the target device contains device global with reprogram
3068+
// init mode we force a reprogram, otherwise check random hash
3069+
if (!acl_device_has_reprogram_device_globals(device)) {
3070+
// compare hash of program that is on the device and the program
3071+
// required by kernel
3072+
need_reprogram =
3073+
device->def.autodiscovery_def.binary_rand_hash !=
3074+
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3075+
}
3076+
}
30613077
}
30623078

30633079
if (need_reprogram) {

test/acl_kernel_test.cpp

Lines changed: 267 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3968,7 +3968,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
39683968
// set MEM_MIGRATE2.1 to COMPLETE +
39693969
// set MEM_MIGRATE2.2 to RUNNING +
39703970
// set MEM_MIGRATE2.2 to COMPLETE +
3971-
// submit KERNEL2 to device = 5
3971+
// submit KERNEL2 to device = 10
39723972
CHECK_EQUAL(offset + 15, m_devlog.num_ops);
39733973

39743974
// Should have copied the memory over.
@@ -4332,6 +4332,272 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
43324332
CHECK_EQUAL(CL_SUCCESS, clReleaseCommandQueue(cq2));
43334333
}
43344334

4335+
TEST(acl_kernel_reprogram_scheduler, device_global_reprogram) {
4336+
// In this test, we will force the device to contain reprogram
4337+
// device global. The device will be first reprogrammed eagerly
4338+
// due to the clCreateProgramWithBinary call which will set the
4339+
// last_bin and loaded_bin. We revert that by setting them to
4340+
// null again to emulate a hw device with binary on the board
4341+
// but not yet reprogrammed in execution.
4342+
// The kernel will be launched two times, the first time should
4343+
// trigger a reprogram even though the random hash matches due
4344+
// to the device global, the second time shouldn't as the device
4345+
// has been reprogrammed in the execution.
4346+
4347+
// Force device to contain device global
4348+
m_device->def.autodiscovery_def.device_global_mem_defs.insert(
4349+
{"dev_glob1",
4350+
{/* address */ 1024,
4351+
/* size */ 1024,
4352+
/* host_access */ ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE,
4353+
/* init_mode */ ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
4354+
/* implement_in_csr */ false}});
4355+
4356+
// Initial eager reprogram
4357+
int offset = m_devlog.num_ops;
4358+
CHECK_EQUAL(3, offset);
4359+
// Just the initial program load.
4360+
CHECK_EQUAL(m_first_dev_bin, m_device->last_bin);
4361+
CHECK_EQUAL(m_first_dev_bin, m_device->loaded_bin);
4362+
4363+
// Pretend execution starts now
4364+
m_device->last_bin->unload_content();
4365+
m_device->last_bin = nullptr;
4366+
m_device->loaded_bin->unload_content();
4367+
m_device->loaded_bin = nullptr;
4368+
4369+
acl_device_program_info_t *dp0 = check_dev_prog(m_program0);
4370+
m_context->reprogram_buf_read_callback = read_mem_callback;
4371+
m_context->reprogram_buf_write_callback = write_mem_callback;
4372+
4373+
// A device side buffer
4374+
cl_int status = CL_INVALID_VALUE;
4375+
cl_mem mem = clCreateBuffer(m_context, CL_MEM_READ_WRITE, 2048, 0, &status);
4376+
CHECK_EQUAL(CL_SUCCESS, status);
4377+
CHECK(mem);
4378+
memset(mem->host_mem.aligned_ptr, 'X', mem->size);
4379+
memset(mem->block_allocation->range.begin, 'x', mem->size);
4380+
4381+
CHECK_EQUAL(1, m_context->device_buffers_have_backing_store);
4382+
CHECK_EQUAL(0, mem->block_allocation->region->is_host_accessible);
4383+
CHECK_EQUAL(0, mem->writable_copy_on_host);
4384+
4385+
cl_kernel k = get_kernel(m_program0);
4386+
cl_event ue1 = get_user_event();
4387+
cl_event ue2 = get_user_event();
4388+
cl_event k_e1 = 0;
4389+
cl_event k_e2 = 0;
4390+
4391+
// Launch the kernel for the first time
4392+
CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 0, sizeof(cl_mem), &mem));
4393+
CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 1, sizeof(cl_mem), &mem));
4394+
CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 1, &ue1, &k_e1));
4395+
CHECK_EQUAL(CL_COMMAND_TASK, k_e1->cmd.type);
4396+
CHECK(m_device->def.autodiscovery_def.binary_rand_hash ==
4397+
k_e1->cmd.info.ndrange_kernel.dev_bin->get_devdef()
4398+
.autodiscovery_def.binary_rand_hash);
4399+
4400+
// last_bin and loaded_bin should still be in a reset state
4401+
CHECK(m_device->last_bin == nullptr);
4402+
CHECK(m_device->loaded_bin == nullptr);
4403+
4404+
acl_print_debug_msg("Forcing user event completion for first kernel\n");
4405+
CHECK_EQUAL(CL_SUCCESS, clSetUserEventStatus(ue1, CL_COMPLETE));
4406+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(ue1));
4407+
4408+
// Should have recorded that we loaded the program.
4409+
CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
4410+
CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
4411+
4412+
// submit device global forced REPROGRAM +
4413+
// set REPROGRAM to RUNNING +
4414+
// set REPROGRAM to COMPLETE +
4415+
// set MEM_MIGRATE 1 to RUNNING +
4416+
// set MEM_MIGRATE 1 to COMPLETE +
4417+
// set MEM_MIGRATE 2 to RUNNING +
4418+
// set MEM_MIGRATE 2 to COMPLETE +
4419+
// submit KERNEL = 8
4420+
CHECK_EQUAL(offset + 8, m_devlog.num_ops);
4421+
const acl_device_op_t *op0submit = &(m_devlog.before[3]);
4422+
const acl_device_op_t *op0running = &(m_devlog.before[4]);
4423+
const acl_device_op_t *op0complete = &(m_devlog.before[5]);
4424+
4425+
// Device global forced reprogram
4426+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0submit->info.type);
4427+
CHECK_EQUAL(0, op0submit->id);
4428+
CHECK(op0submit->info.event);
4429+
CHECK_EQUAL(CL_SUBMITTED, op0submit->status);
4430+
CHECK_EQUAL(0, op0submit->info.num_printf_bytes_pending);
4431+
CHECK_EQUAL(1, op0submit->first_in_group);
4432+
CHECK_EQUAL(0, op0submit->last_in_group);
4433+
4434+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0running->info.type);
4435+
CHECK_EQUAL(0, op0running->id);
4436+
CHECK(op0running->info.event);
4437+
CHECK_EQUAL(CL_RUNNING, op0running->status);
4438+
CHECK_EQUAL(0, op0running->info.num_printf_bytes_pending);
4439+
CHECK_EQUAL(1, op0running->first_in_group);
4440+
CHECK_EQUAL(0, op0running->last_in_group);
4441+
4442+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0complete->info.type);
4443+
CHECK_EQUAL(0, op0complete->id);
4444+
CHECK(op0complete->info.event);
4445+
CHECK_EQUAL(CL_COMPLETE, op0complete->status);
4446+
CHECK_EQUAL(0, op0complete->info.num_printf_bytes_pending);
4447+
CHECK_EQUAL(1, op0complete->first_in_group);
4448+
CHECK_EQUAL(0, op0complete->last_in_group);
4449+
4450+
// The device is still programmed with the same program.
4451+
CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
4452+
CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
4453+
4454+
const acl_device_op_t *op1submit = &(m_devlog.before[10]);
4455+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1submit->info.type);
4456+
CHECK_EQUAL(k_e1, op1submit->info.event);
4457+
CHECK_EQUAL(CL_SUBMITTED, op1submit->status);
4458+
CHECK_EQUAL(0, op1submit->info.num_printf_bytes_pending);
4459+
CHECK_EQUAL(0, op1submit->first_in_group); // reprogram is first
4460+
CHECK_EQUAL(1, op1submit->last_in_group);
4461+
4462+
// The user-level event is linked to the kernel device op now.
4463+
CHECK_EQUAL(op1submit->id, k_e1->current_device_op->id);
4464+
4465+
// Pretend to start the kernel
4466+
acl_print_debug_msg("Say kernel is running\n");
4467+
ACL_LOCKED(
4468+
acl_receive_kernel_update(k_e1->current_device_op->id, CL_RUNNING));
4469+
CHECK_EQUAL(CL_RUNNING, k_e1->current_device_op->execution_status);
4470+
4471+
ACL_LOCKED(acl_idle_update(m_context));
4472+
4473+
// Now we have a "running" transition
4474+
CHECK_EQUAL(offset + 9, m_devlog.num_ops);
4475+
const acl_device_op_t *op1running = &(m_devlog.after[11]);
4476+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1running->info.type);
4477+
CHECK_EQUAL(k_e1, op1running->info.event);
4478+
CHECK_EQUAL(CL_RUNNING, op1running->status);
4479+
CHECK_EQUAL(0, op1running->info.num_printf_bytes_pending);
4480+
CHECK_EQUAL(0, op1running->first_in_group);
4481+
CHECK_EQUAL(1, op1running->last_in_group);
4482+
4483+
// The running status was propagated up to the user-level event.
4484+
CHECK_EQUAL(CL_RUNNING, k_e1->execution_status);
4485+
4486+
acl_print_debug_msg("Say kernel is complete\n");
4487+
ACL_LOCKED(
4488+
acl_receive_kernel_update(k_e1->current_device_op->id, CL_COMPLETE));
4489+
CHECK_EQUAL(CL_COMPLETE, k_e1->current_device_op->execution_status);
4490+
4491+
ACL_LOCKED(acl_idle_update(m_context));
4492+
// Now we have a "complete" transition
4493+
CHECK_EQUAL(offset + 10, m_devlog.num_ops);
4494+
const acl_device_op_t *op1complete = &(m_devlog.after[12]);
4495+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1complete->info.type);
4496+
CHECK_EQUAL(k_e1, op1complete->info.event);
4497+
CHECK_EQUAL(CL_COMPLETE, op1complete->status);
4498+
CHECK_EQUAL(0, op1complete->info.num_printf_bytes_pending);
4499+
CHECK_EQUAL(0, op1complete->first_in_group);
4500+
CHECK_EQUAL(1, op1complete->last_in_group);
4501+
4502+
// Completion timestamp has propagated up to the user level event.
4503+
CHECK_EQUAL(
4504+
acl_platform.device_op_queue.op[op1complete->id].timestamp[CL_COMPLETE],
4505+
k_e1->timestamp[CL_COMPLETE]);
4506+
4507+
// Completion wipes out the downlink.
4508+
CHECK_EQUAL(0, k_e1->current_device_op);
4509+
4510+
// Launch the kernel for the second time
4511+
CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 1, &ue2, &k_e2));
4512+
CHECK_EQUAL(CL_COMMAND_TASK, k_e2->cmd.type);
4513+
CHECK(m_device->def.autodiscovery_def.binary_rand_hash ==
4514+
k_e2->cmd.info.ndrange_kernel.dev_bin->get_devdef()
4515+
.autodiscovery_def.binary_rand_hash);
4516+
4517+
acl_print_debug_msg("Forcing user event completion for second kernel\n");
4518+
CHECK_EQUAL(CL_SUCCESS, clSetUserEventStatus(ue2, CL_COMPLETE));
4519+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(ue2));
4520+
4521+
// Should still have the same program loaded
4522+
CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
4523+
CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
4524+
4525+
// set MEM_MIGRATE 1 to RUNNING +
4526+
// set MEM_MIGRATE 1 to COMPLETE +
4527+
// set MEM_MIGRATE 2 to RUNNING +
4528+
// set MEM_MIGRATE 2 to COMPLETE +
4529+
// submit KERNEL = 5
4530+
CHECK_EQUAL(offset + 15, m_devlog.num_ops);
4531+
const acl_device_op_t *op2submit = &(m_devlog.before[17]);
4532+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2submit->info.type);
4533+
CHECK_EQUAL(k_e2, op2submit->info.event);
4534+
CHECK_EQUAL(CL_SUBMITTED, op2submit->status);
4535+
CHECK_EQUAL(0, op2submit->info.num_printf_bytes_pending);
4536+
CHECK_EQUAL(0, op2submit->first_in_group); // mem migration is first
4537+
CHECK_EQUAL(1, op2submit->last_in_group);
4538+
4539+
// The user-level event is linked to the kernel device op now.
4540+
CHECK_EQUAL(op2submit->id, k_e2->current_device_op->id);
4541+
4542+
// Pretend to start the kernel
4543+
acl_print_debug_msg("Say kernel is running\n");
4544+
ACL_LOCKED(
4545+
acl_receive_kernel_update(k_e2->current_device_op->id, CL_RUNNING));
4546+
CHECK_EQUAL(CL_RUNNING, k_e2->current_device_op->execution_status);
4547+
4548+
ACL_LOCKED(acl_idle_update(m_context));
4549+
4550+
// Now we have a "running" transition
4551+
CHECK_EQUAL(offset + 16, m_devlog.num_ops);
4552+
const acl_device_op_t *op2running = &(m_devlog.after[18]);
4553+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2running->info.type);
4554+
CHECK_EQUAL(k_e2, op2running->info.event);
4555+
CHECK_EQUAL(CL_RUNNING, op2running->status);
4556+
CHECK_EQUAL(0, op2running->info.num_printf_bytes_pending);
4557+
CHECK_EQUAL(0, op2running->first_in_group);
4558+
CHECK_EQUAL(1, op2running->last_in_group);
4559+
4560+
// The running status was propagated up to the user-level event.
4561+
CHECK_EQUAL(CL_RUNNING, k_e2->execution_status);
4562+
4563+
acl_print_debug_msg("Say kernel is complete\n");
4564+
ACL_LOCKED(
4565+
acl_receive_kernel_update(k_e2->current_device_op->id, CL_COMPLETE));
4566+
CHECK_EQUAL(CL_COMPLETE, k_e2->current_device_op->execution_status);
4567+
4568+
ACL_LOCKED(acl_idle_update(m_context));
4569+
// Now we have a "complete" transition
4570+
CHECK_EQUAL(offset + 17, m_devlog.num_ops);
4571+
const acl_device_op_t *op2complete = &(m_devlog.after[19]);
4572+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2complete->info.type);
4573+
CHECK_EQUAL(k_e2, op2complete->info.event);
4574+
CHECK_EQUAL(CL_COMPLETE, op2complete->status);
4575+
CHECK_EQUAL(0, op2complete->info.num_printf_bytes_pending);
4576+
CHECK_EQUAL(0, op2complete->first_in_group);
4577+
CHECK_EQUAL(1, op2complete->last_in_group);
4578+
4579+
// Completion timestamp has propagated up to the user level event.
4580+
CHECK_EQUAL(
4581+
acl_platform.device_op_queue.op[op2complete->id].timestamp[CL_COMPLETE],
4582+
k_e2->timestamp[CL_COMPLETE]);
4583+
4584+
// Completion wipes out the downlink.
4585+
CHECK_EQUAL(0, k_e2->current_device_op);
4586+
4587+
// And let go.
4588+
// (Don't check for CL_INVALID_EVENT on a second release of each of
4589+
// these events because the events might be reused.)
4590+
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(mem));
4591+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e1));
4592+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e2));
4593+
CHECK_EQUAL(CL_SUCCESS, clReleaseKernel(k));
4594+
4595+
// Clean up device global
4596+
m_device->def.autodiscovery_def.device_global_mem_defs.clear();
4597+
4598+
acl_print_debug_msg("DONE!\n");
4599+
}
4600+
43354601
TEST(acl_kernel_reprogram_scheduler, use_host_buf_as_arg) {
43364602
// Must be able to use a host-side buffer as a kernel argument.
43374603
cl_int status = 0;

0 commit comments

Comments
 (0)