Skip to content

Perform a memory copy for simulation buffer with buffer location #214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions include/acl_mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ int acl_submit_mem_transfer_device_op(cl_event event);

int acl_submit_migrate_mem_device_op(cl_event event);

// Simulator-only helper: if `mem` is a global-memory buffer that was bound to
// a device address lying outside the usable range of global memory `mem_id`
// on device `physical_device_id`, reallocate it and copy its contents to the
// new location.
// Returns 1 on success (including when no move was needed), 0 if the
// reallocation failed.
int acl_realloc_buffer_for_simulator(cl_mem mem,
const unsigned int physical_device_id,
const unsigned int mem_id);

// Actually execute the memory transfer device operation.
// In the normal case source and destination are different, in which case
// the HAL is called and the transfer is non-blocking.
Expand Down
8 changes: 8 additions & 0 deletions src/acl_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2917,6 +2917,14 @@ static cl_int l_copy_and_adjust_arguments_for_device(
[needed_mem_id]);
#endif

int env_override = 0;
(void)acl_get_offline_device_user_setting(&env_override);
if (env_override == ACL_CONTEXT_MPSIM) {
if (!acl_realloc_buffer_for_simulator(mem_obj, needed_physical_id,
needed_mem_id)) {
return CL_MEM_OBJECT_ALLOCATION_FAILURE;
}
}
// copy the address of the reserved allocation into the invocation
// image:
const void *mem_addr =
Expand Down
99 changes: 95 additions & 4 deletions src/acl_mem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4224,8 +4224,6 @@ static void l_get_working_range(const acl_block_allocation_t *block_allocation,
acl_assert_locked();

if (block_allocation->region == &(acl_platform.global_mem)) {
int env_override = 0;
(void)acl_get_offline_device_user_setting(&env_override);
const auto *global_mem_defs = &(acl_platform.device[physical_device_id]
.def.autodiscovery_def.global_mem_defs);

Expand Down Expand Up @@ -4725,7 +4723,7 @@ cl_int l_enqueue_mem_transfer(cl_command_queue command_queue, cl_bool blocking,
if (src_buffer->flags & CL_MEM_HOST_WRITE_ONLY ||
src_buffer->flags & CL_MEM_HOST_NO_ACCESS) {
ERR_RET(CL_INVALID_OPERATION, context,
"clEnqeueueReadBuffer cannot be called on a buffer "
"clEnqueueReadBuffer cannot be called on a buffer "
"created with CL_MEM_HOST_WRITE_ONLY or CL_MEM_HOST_NO_ACCESS");
}
break;
Expand All @@ -4743,7 +4741,7 @@ cl_int l_enqueue_mem_transfer(cl_command_queue command_queue, cl_bool blocking,
if (dst_buffer->flags & CL_MEM_HOST_READ_ONLY ||
dst_buffer->flags & CL_MEM_HOST_NO_ACCESS) {
ERR_RET(CL_INVALID_OPERATION, context,
"clEnqeueueWriteBuffer cannot be called on a buffer "
"clEnqueueWriteBuffer cannot be called on a buffer "
"created with CL_MEM_HOST_READ_ONLY or CL_MEM_HOST_NO_ACCESS");
}
break;
Expand Down Expand Up @@ -6583,6 +6581,99 @@ void acl_copy_device_buffers_from_host_after_programming(
}
}

// Move a simulator buffer into the correct global memory address range.
//
// The simulator does not have any global memory interface information before
// reprogram, so the runtime initializes the device def with the global memory
// address range obtained from an autodiscovery string predefined in
// acl_shipped_board_cfgs.h.
// When a buffer is created with the buffer location property specifying a
// global memory whose address range lies beyond the range defined in the
// default autodiscovery string, and is written before the device reprogram,
// the write binds the buffer to the wrong address range, causing issues when
// running the kernel.
// This function copies a buffer that was bound to the wrong address range
// over to the right one. It is meant to be called after the global memory
// information becomes available and before the kernel launch.
//
// Parameters:
//   mem                 buffer to check and, if needed, relocate
//   physical_device_id  device whose global memory definition is consulted
//   mem_id              index of the global memory the buffer should live in
//
// Returns 1 on success (including when nothing had to be moved) and 0 when
// the reallocation failed.
int acl_realloc_buffer_for_simulator(cl_mem mem,
                                     const unsigned int physical_device_id,
                                     const unsigned int mem_id) {
  // Same lock precondition the other functions in this file assert; this
  // function rewrites the region's block list.
  acl_assert_locked();

  // Only reallocate and migrate if mem resides in global memory
  if (mem->block_allocation->region != &(acl_platform.global_mem)) {
    return 1;
  }

  const acl_addr_range_t global_mem_range =
      acl_platform.device[physical_device_id]
          .def.autodiscovery_def.global_mem_defs[mem_id]
          .get_usable_range();

  // Save old address; it is the source of the copy once the block has been
  // given a new range
  int mem_on_host;
  void *const old_mem_address = l_get_address_of_writable_copy(
      mem, physical_device_id, &mem_on_host, CL_FALSE);

  // The mem copy is only needed if the buffer is bound to the device
  // before global memory range is confirmed (i.e., before reprogram), and
  // assumed address range before reprogram is different from actual.
  // Therefore, only migrate when ALL of the following hold:
  // 1. allocation is NOT deferred (if it is, auto migration will happen)
  // 2. buffer is NOT on host and has no pending host pointer copy
  // 3. buffer appears to be "at the destination", i.e. its block allocation
  //    is the reserved allocation for this device and mem_id
  // 4. block allocation lies entirely outside the usable global memory
  //    range. Ranges are half-open [begin, next), so a block whose `next`
  //    equals the range's `begin` is already outside of it.
  if (!mem->allocation_deferred &&
      !(mem->mem_cpy_host_ptr_pending || mem_on_host) &&
      (mem->block_allocation ==
       mem->reserved_allocations[physical_device_id][mem_id]) &&
      (ACL_STRIP_PHYSICAL_ID(mem->block_allocation->range.begin) >=
           global_mem_range.next ||
       ACL_STRIP_PHYSICAL_ID(mem->block_allocation->range.next) <=
           global_mem_range.begin)) {

    // mem_id should align if block allocation is the same as reserved
    // allocation
    assert(mem->mem_id == mem_id);

    // Okay to set this to NULL, memory tracked in mem->block_allocation
    mem->reserved_allocations[physical_device_id][mem_id] = NULL;
    // We will reallocate block, so remove it from linked list first
    acl_block_allocation_t **block_ptr =
        &(mem->block_allocation->region->first_block);
    // try to find the mem->block_allocation in the linked list, error if
    // the block is not found before reaching the end of list
    while (true) {
      acl_block_allocation_t *const block = *block_ptr;
      assert(block != NULL);
      if (block == mem->block_allocation) {
        // Unlink by pointing the previous link past this block
        *block_ptr = block->next_block_in_region;
        break;
      }
      // Advance to the next block in the region
      block_ptr = &(block->next_block_in_region);
    }
    // Reallocate buffer range
    // NOTE(review): on failure the block has already been unlinked and the
    // reserved slot cleared; confirm callers treat a 0 return as fatal for
    // this buffer.
    if (!acl_do_physical_buffer_allocation(physical_device_id, mem)) {
      return 0;
    }

    void *const new_mem_address =
        mem->reserved_allocations[physical_device_id][mem_id]->range.begin;
    const acl_hal_t *const hal = acl_get_hal();

#ifdef MEM_DEBUG_MSG
    printf("reallocating mem obj for simulation after getting global mem "
           "info, device %u ([0]%zx -> [0]%zx) ",
           physical_device_id, (size_t)(ACL_STRIP_PHYSICAL_ID(old_mem_address)),
           (size_t)(ACL_STRIP_PHYSICAL_ID(new_mem_address)));
#endif

    // do blocking copy, this is for simulation only so performance is
    // probably not a huge concern
    hal->copy_globalmem_to_globalmem(0, old_mem_address, new_mem_address,
                                     mem->size);
  }
  return 1;
}

static void acl_print_all_mem_in_region(acl_mem_region_t *region);
void acl_print_all_mem(void) {
acl_assert_locked();
Expand Down
119 changes: 119 additions & 0 deletions test/acl_mem_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2844,6 +2844,125 @@ TEST(acl_mem, buffer_location_property) {
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(a));
}

TEST(acl_mem, simulation_copy_buffer) {
  // Scenario: a simulation run starts from a predefined autodiscovery
  // string whose default global memory layout does not match the real one.
  // A buffer is written before the (fake) reprogram, so it gets bound using
  // the default layout. After the reprogram installs the actual layout,
  // acl_realloc_buffer_for_simulator must move the buffer into the address
  // range of the global memory it was created for, keeping its contents.
  cl_int status = CL_SUCCESS;
  int input_data = 0xaaaaaaaa;
  int output_data = 0x55555555;

  // Definition currently installed on device 0; every update below goes
  // through this reference.
  acl_device_def_autodiscovery_t &live_def = m_device[0]->def.autodiscovery_def;

  size_t total_size = ACL_RANGE_SIZE(live_def.global_mem_defs[0].range);
  size_t global_mem_size = total_size / 2;

  // Keep a copy of the original definition so it can be put back at the end
  acl_device_def_autodiscovery_t orig_def = live_def;

  // Build the "actual" layout: a fake system with 2 global memories, each
  // owning one half of the unit test global memory
  acl_device_def_autodiscovery_t actual_def = live_def;
  actual_def.num_global_mem_systems = 2;
  {
    auto &lower_half = actual_def.global_mem_defs[0];
    auto &upper_half = actual_def.global_mem_defs[1];
    char *const split_point = (char *)lower_half.range.begin + global_mem_size;
    upper_half.range.next = lower_half.range.next;
    lower_half.range.next = split_point;
    upper_half.range.begin = lower_half.range.next;
  }

  // Mimic what loading the predefined autodiscovery string from
  // acl_shipped_board_cfgs.h does: every global memory gets the same range
  live_def.num_global_mem_systems = ACL_MAX_GLOBAL_MEM;
  for (int mem_idx = 0; mem_idx < ACL_MAX_GLOBAL_MEM; ++mem_idx) {
    live_def.global_mem_defs[mem_idx] = actual_def.global_mem_defs[0];
  }

  // Create a buffer that asks for buffer location 1
  cl_mem_properties_intel props[] = {CL_MEM_ALLOC_BUFFER_LOCATION_INTEL, 1, 0};
  cl_mem buffer = clCreateBufferWithPropertiesINTEL(m_context, props, 0,
                                                    sizeof(int), 0, &status);
  ACL_LOCKED(CHECK(acl_mem_is_valid(buffer)));
  CHECK_EQUAL(CL_SUCCESS, status);
  assert(buffer);
  CHECK_EQUAL(1, acl_ref_count(buffer));

  // The buffer must report the requested mem id
  cl_uint read_mem_id = 4; // dummy value, must be overwritten by the query
  size_t size_ret;
  CHECK_EQUAL(CL_SUCCESS,
              clGetMemObjectInfo(buffer, CL_MEM_ALLOC_BUFFER_LOCATION_INTEL,
                                 sizeof(cl_uint), &read_mem_id, &size_ret));
  CHECK_EQUAL(1, read_mem_id);

  // A blocking write binds the buffer using the (wrong) default layout
  status = clEnqueueWriteBuffer(m_cq, buffer, CL_TRUE, 0, sizeof(int),
                                &input_data, 0, NULL, NULL);
  CHECK_EQUAL(CL_SUCCESS, status);
  {
    // Under the default layout the buffer sits inside memory 1's range
    const acl_addr_range_t assumed_window =
        live_def.global_mem_defs[1].get_usable_range();
    CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) >=
          assumed_window.begin);
    CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.next) <
          assumed_window.next);
  }

  // Pretend a reprogram happened for simulation: install the actual layout
  live_def = actual_def;
  CHECK_EQUAL(2, live_def.num_global_mem_systems);
  {
    const acl_addr_range_t window0 =
        live_def.global_mem_defs[0].get_usable_range();
    const acl_addr_range_t window1 =
        live_def.global_mem_defs[1].get_usable_range();
    // The two memories now cover different ranges ...
    CHECK(window0.begin != window1.begin);
    CHECK(window0.next != window1.next);
    // ... and the buffer is still sitting below memory 1's range
    CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) <
          window1.begin);
  }

  // Run the migration and verify the buffer landed inside memory 1's range
  ACL_LOCKED(CHECK_EQUAL(acl_realloc_buffer_for_simulator(buffer, 0, 1), 1));
  {
    const acl_addr_range_t relocated_window =
        live_def.global_mem_defs[1].get_usable_range();
    CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) >=
          relocated_window.begin);
    CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.next) <
          relocated_window.next);
  }

  // A blocking read from the new location must return the written data
  status = clEnqueueReadBuffer(m_cq, buffer, CL_TRUE, 0, sizeof(int),
                               &output_data, 0, NULL, NULL);
  CHECK_EQUAL(CL_SUCCESS, status);
  CHECK_EQUAL(input_data, output_data);

  // Put the original definition back and release the buffer
  live_def = orig_def;
  CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(buffer));
}

MT_TEST(acl_mem, map_buf_bad_flags) {
ACL_LOCKED(acl_print_debug_msg("begin buf_bad_flags\n"));
cl_int status = CL_SUCCESS;
Expand Down