Skip to content

Commit fc1446f

Browse files
committed
kernel: force reprogram if kernel device has device global with init_mode reprogram and extend unit test
1 parent 385bd9c commit fc1446f

File tree

2 files changed

+201
-17
lines changed

2 files changed

+201
-17
lines changed

src/acl_kernel.cpp

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3030,23 +3030,33 @@ int acl_submit_kernel_device_op(cl_event event) {
30303030
// to free up old operation slots.
30313031
acl_forget_proposed_device_ops(doq);
30323032

3033+
// Force reprogram if there is device global with init_mode:reprogram
3034+
// or if this is the split kernel mode (split kernel workaround)
30333035
bool need_reprogram = true;
3034-
if (device->last_bin) {
3035-
// compare hash of last program that went through device op queue and the
3036-
// program required by kernel
3037-
need_reprogram =
3038-
device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
3039-
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3040-
} else {
3041-
// compare hash of program that is on the device and the program required by
3042-
// kernel
3043-
need_reprogram = device->def.autodiscovery_def.binary_rand_hash !=
3044-
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3045-
}
3046-
3047-
if (event->context->split_kernel) {
3048-
// Always reprogram in split kernel mode. This is a temporary workaround.
3049-
need_reprogram = true;
3036+
// Else check if reprogram is needed based on hash of loaded binary
3037+
// First try to find if there are any reprogram device globals
3038+
std::unordered_map<std::string, acl_device_global_mem_def_t>
3039+
device_global_mem_defs =
3040+
device->def.autodiscovery_def.device_global_mem_defs;
3041+
auto reprogram_it = std::find_if(
3042+
device_global_mem_defs.begin(), device_global_mem_defs.end(),
3043+
[](const std::pair<std::string, acl_device_global_mem_def_t> &it) {
3044+
return it.second.init_mode == ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM;
3045+
});
3046+
if ((reprogram_it == device_global_mem_defs.end()) &&
3047+
!event->context->split_kernel) {
3048+
if (device->last_bin) {
3049+
// compare hash of last program that went through device op queue and the
3050+
// program required by kernel
3051+
need_reprogram =
3052+
device->last_bin->get_devdef().autodiscovery_def.binary_rand_hash !=
3053+
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3054+
} else {
3055+
// compare hash of program that is on the device and the program required
3056+
// by kernel
3057+
need_reprogram = device->def.autodiscovery_def.binary_rand_hash !=
3058+
dev_bin->get_devdef().autodiscovery_def.binary_rand_hash;
3059+
}
30503060
}
30513061

30523062
if (need_reprogram) {

test/acl_kernel_test.cpp

Lines changed: 175 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3962,7 +3962,7 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
39623962
// set MEM_MIGRATE2.1 to COMPLETE +
39633963
// set MEM_MIGRATE2.2 to RUNNING +
39643964
// set MEM_MIGRATE2.2 to COMPLETE +
3965-
// submit KERNEL2 to device = 5
3965+
// submit KERNEL2 to device = 10
39663966
CHECK_EQUAL(offset + 15, m_devlog.num_ops);
39673967

39683968
// Should have copied the memory over.
@@ -4326,6 +4326,180 @@ TEST(acl_kernel_reprogram_scheduler, switch_prog) {
43264326
CHECK_EQUAL(CL_SUCCESS, clReleaseCommandQueue(cq2));
43274327
}
43284328

4329+
TEST(acl_kernel_reprogram_scheduler, device_global_reprogram) {
4330+
// In this test, we force the device to contain a device global whose
4331+
// init_mode is reprogram. The device will be first reprogrammed eagerly
4332+
// due to the clCreateProgramWithBinary call, then when the
4333+
// kernel is enqueued, another reprogram should be scheduled
4334+
// even though the device is already programmed with the right
4335+
// binary, due to the presence of the device global.
4336+
4337+
// Force the device definition to contain a device global named "dev_glob1"
// whose init_mode is ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM.
4338+
m_device->def.autodiscovery_def.device_global_mem_defs.insert(
4339+
{"dev_glob1",
4340+
{/* address */ 1024,
4341+
/* size */ 1024,
4342+
/* host_access */ ACL_DEVICE_GLOBAL_HOST_ACCESS_READ_WRITE,
4343+
/* init_mode */ ACL_DEVICE_GLOBAL_INIT_MODE_REPROGRAM,
4344+
/* implement_in_csr */ false}});
4345+
4346+
// Initial eager reprogram: the device op log must already hold exactly
// 3 entries at this point. Every later log index in this test is
// relative to that starting offset.
4347+
int offset = m_devlog.num_ops;
4348+
CHECK_EQUAL(3, offset);
4349+
4350+
acl_device_program_info_t *dp0 = check_dev_prog(m_program0);
4351+
4352+
m_context->reprogram_buf_read_callback = read_mem_callback;
4353+
m_context->reprogram_buf_write_callback = write_mem_callback;
4354+
4355+
// A device side buffer
4356+
cl_int status = CL_INVALID_VALUE;
4357+
cl_mem mem = clCreateBuffer(m_context, CL_MEM_READ_WRITE, 2048, 0, &status);
4358+
CHECK_EQUAL(CL_SUCCESS, status);
4359+
CHECK(mem);
4360+
// Fill the host-side pointer with 'X' and the block allocation range with
// 'x' (presumably so the two copies can be told apart -- TODO confirm).
memset(mem->host_mem.aligned_ptr, 'X', mem->size);
4361+
memset(mem->block_allocation->range.begin, 'x', mem->size);
4362+
4363+
CHECK_EQUAL(1, m_context->device_buffers_have_backing_store);
4364+
CHECK_EQUAL(0, mem->block_allocation->region->is_host_accessible);
4365+
CHECK_EQUAL(0, mem->writable_copy_on_host);
4366+
4367+
cl_kernel k = get_kernel(m_program0);
4368+
4369+
// Just the initial program load.
4370+
CHECK_EQUAL(m_first_dev_bin, m_device->last_bin);
4371+
CHECK_EQUAL(m_first_dev_bin, m_device->loaded_bin);
4372+
4373+
cl_event ue = get_user_event();
4374+
cl_event k_e = 0;
4375+
4376+
// Both kernel arguments are bound to the same buffer, and the task is
// enqueued with the user event `ue` as its only wait-list entry.
CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 0, sizeof(cl_mem), &mem));
4377+
CHECK_EQUAL(CL_SUCCESS, clSetKernelArg(k, 1, sizeof(cl_mem), &mem));
4378+
CHECK_EQUAL(CL_SUCCESS, clEnqueueTask(m_cq, k, 1, &ue, &k_e));
4379+
CHECK_EQUAL(CL_COMMAND_TASK, k_e->cmd.type);
4380+
4381+
// The user event has not been completed yet, so only the initial
// programming has occurred.
4382+
// Has 3 transitions logged: SUBMITTED, RUNNING, COMPLETE
4383+
CHECK_EQUAL(m_first_dev_bin, m_device->last_bin);
4384+
CHECK_EQUAL(m_first_dev_bin, m_device->loaded_bin);
4385+
4386+
acl_print_debug_msg("Forcing user event completion\n");
4387+
CHECK_EQUAL(CL_SUCCESS, clSetUserEventStatus(ue, CL_COMPLETE));
4388+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(ue));
4389+
4390+
// Should have recorded that we loaded the program.
4391+
CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
4392+
CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
4393+
4394+
// Expected new device op log entries after the user event completes:
// submit device global forced REPROGRAM +
4395+
// set REPROGRAM to RUNNING +
4396+
// set REPROGRAM to COMPLETE +
4397+
// set MEM_MIGRATE 1 to RUNNING +
4398+
// set MEM_MIGRATE 1 to COMPLETE +
4399+
// set MEM_MIGRATE 2 to RUNNING +
4400+
// set MEM_MIGRATE 2 to COMPLETE +
4401+
// submit KERNEL = 8
4402+
CHECK_EQUAL(offset+8, m_devlog.num_ops);
4403+
// Log entries 3, 4 and 5 are the SUBMITTED, RUNNING and COMPLETE
// snapshots of the forced reprogram operation.
const acl_device_op_t *op0submit = &(m_devlog.before[3]);
4404+
const acl_device_op_t *op0running = &(m_devlog.before[4]);
4405+
const acl_device_op_t *op0complete = &(m_devlog.before[5]);
4406+
4407+
// Device global forced reprogram: first op of its group, not the last.
4408+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0submit->info.type);
4409+
CHECK_EQUAL(0, op0submit->id);
4410+
CHECK(op0submit->info.event);
4411+
CHECK_EQUAL(CL_SUBMITTED, op0submit->status);
4412+
CHECK_EQUAL(0, op0submit->info.num_printf_bytes_pending);
4413+
CHECK_EQUAL(1, op0submit->first_in_group);
4414+
CHECK_EQUAL(0, op0submit->last_in_group);
4415+
4416+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0running->info.type);
4417+
CHECK_EQUAL(0, op0running->id);
4418+
CHECK(op0running->info.event);
4419+
CHECK_EQUAL(CL_RUNNING, op0running->status);
4420+
CHECK_EQUAL(0, op0running->info.num_printf_bytes_pending);
4421+
CHECK_EQUAL(1, op0running->first_in_group);
4422+
CHECK_EQUAL(0, op0running->last_in_group);
4423+
4424+
CHECK_EQUAL(ACL_DEVICE_OP_REPROGRAM, op0complete->info.type);
4425+
CHECK_EQUAL(0, op0complete->id);
4426+
CHECK(op0complete->info.event);
4427+
CHECK_EQUAL(CL_COMPLETE, op0complete->status);
4428+
CHECK_EQUAL(0, op0complete->info.num_printf_bytes_pending);
4429+
CHECK_EQUAL(1, op0complete->first_in_group);
4430+
CHECK_EQUAL(0, op0complete->last_in_group);
4431+
4432+
// The device is still programmed with the same program.
4433+
CHECK_EQUAL(&(dp0->device_binary), m_device->last_bin);
4434+
CHECK_EQUAL(&(dp0->device_binary), m_device->loaded_bin);
4435+
4436+
// Log entry 10 follows the three reprogram entries (3..5) and the four
// memory migration entries (6..9); it is the kernel submission.
const acl_device_op_t *op1submit = &(m_devlog.before[10]);
4437+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op1submit->info.type);
4438+
CHECK_EQUAL(k_e, op1submit->info.event);
4439+
CHECK_EQUAL(CL_SUBMITTED, op1submit->status);
4440+
CHECK_EQUAL(0, op1submit->info.num_printf_bytes_pending);
4441+
CHECK_EQUAL(0, op1submit->first_in_group); // reprogram is first
4442+
CHECK_EQUAL(1, op1submit->last_in_group);
4443+
4444+
// The user-level event is linked to the kernel device op now.
4445+
CHECK_EQUAL(op1submit->id, k_e->current_device_op->id);
4446+
4447+
// Pretend to start the kernel
4448+
acl_print_debug_msg("Say kernel is running\n");
4449+
ACL_LOCKED(acl_receive_kernel_update(k_e->current_device_op->id, CL_RUNNING));
4450+
CHECK_EQUAL(CL_RUNNING, k_e->current_device_op->execution_status);
4451+
4452+
ACL_LOCKED(acl_idle_update(m_context));
4453+
4454+
// Now we have a "running" transition, logged as entry 11.
4455+
CHECK_EQUAL(offset+9, m_devlog.num_ops);
4456+
const acl_device_op_t *op2a = &(m_devlog.after[11]);
4457+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op2a->info.type);
4458+
CHECK_EQUAL(k_e, op2a->info.event);
4459+
CHECK_EQUAL(CL_RUNNING, op2a->status);
4460+
CHECK_EQUAL(0, op2a->info.num_printf_bytes_pending);
4461+
CHECK_EQUAL(0, op2a->first_in_group);
4462+
CHECK_EQUAL(1, op2a->last_in_group);
4463+
4464+
// The running status was propagated up to the user-level event.
4465+
CHECK_EQUAL(CL_RUNNING, k_e->execution_status);
4466+
4467+
acl_print_debug_msg("Say kernel is complete\n");
4468+
ACL_LOCKED(
4469+
acl_receive_kernel_update(k_e->current_device_op->id, CL_COMPLETE));
4470+
CHECK_EQUAL(CL_COMPLETE, k_e->current_device_op->execution_status);
4471+
4472+
ACL_LOCKED(acl_idle_update(m_context));
4473+
// Now we have a "complete" transition, logged as entry 12.
4474+
CHECK_EQUAL(offset+10, m_devlog.num_ops);
4475+
const acl_device_op_t *op3a = &(m_devlog.after[12]);
4476+
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, op3a->info.type);
4477+
CHECK_EQUAL(k_e, op3a->info.event);
4478+
CHECK_EQUAL(CL_COMPLETE, op3a->status);
4479+
CHECK_EQUAL(0, op3a->info.num_printf_bytes_pending);
4480+
CHECK_EQUAL(0, op3a->first_in_group);
4481+
CHECK_EQUAL(1, op3a->last_in_group);
4482+
4483+
// Completion timestamp has propagated up to the user level event.
4484+
CHECK_EQUAL(acl_platform.device_op_queue.op[op3a->id].timestamp[CL_COMPLETE],
4485+
k_e->timestamp[CL_COMPLETE]);
4486+
4487+
// Completion wipes out the downlink.
4488+
CHECK_EQUAL(0, k_e->current_device_op);
4489+
4490+
// And let go.
4491+
// (Don't check for CL_INVALID_EVENT on a second release of each of
4492+
// these events because the events might be reused.)
4493+
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(mem));
4494+
CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(k_e));
4495+
CHECK_EQUAL(CL_SUCCESS, clReleaseKernel(k));
4496+
4497+
// Clean up the device global so it does not affect other tests.
// NOTE(review): clear() removes every device global definition, not only
// "dev_glob1"; that is only equivalent if the map was empty before this
// test ran -- confirm against the fixture setup.
4498+
m_device->def.autodiscovery_def.device_global_mem_defs.clear();
4499+
4500+
acl_print_debug_msg("DONE!\n");
4501+
}
4502+
43294503
TEST(acl_kernel_reprogram_scheduler, use_host_buf_as_arg) {
43304504
// Must be able to use a host-side buffer as a kernel argument.
43314505
cl_int status = 0;

0 commit comments

Comments
 (0)