Skip to content

Commit cd2164e

Browse files
committed
Perform a memory copy for simulation buffer located in incorrect global memory address range
1 parent 08e9cd0 commit cd2164e

File tree

4 files changed

+226
-4
lines changed

4 files changed

+226
-4
lines changed

include/acl_mem.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ int acl_submit_mem_transfer_device_op(cl_event event);
3232

3333
int acl_submit_migrate_mem_device_op(cl_event event);
3434

35+
int acl_realloc_buffer_for_simulator(cl_mem mem,
36+
const unsigned int physical_device_id,
37+
const unsigned int mem_id);
38+
3539
// Actually execute the memory transfer device operation.
3640
// In the normal case source and destination are different, in which case
3741
// the HAL is called and the transfer is non-blocking.

src/acl_kernel.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2917,6 +2917,14 @@ static cl_int l_copy_and_adjust_arguments_for_device(
29172917
[needed_mem_id]);
29182918
#endif
29192919

2920+
int env_override = 0;
2921+
(void)acl_get_offline_device_user_setting(&env_override);
2922+
if (env_override == ACL_CONTEXT_MPSIM) {
2923+
if (!acl_realloc_buffer_for_simulator(mem_obj, needed_physical_id,
2924+
needed_mem_id)) {
2925+
return CL_MEM_OBJECT_ALLOCATION_FAILURE;
2926+
}
2927+
}
29202928
// copy the address of the reserved allocation into the invocation
29212929
// image:
29222930
const void *mem_addr =

src/acl_mem.cpp

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4224,8 +4224,6 @@ static void l_get_working_range(const acl_block_allocation_t *block_allocation,
42244224
acl_assert_locked();
42254225

42264226
if (block_allocation->region == &(acl_platform.global_mem)) {
4227-
int env_override = 0;
4228-
(void)acl_get_offline_device_user_setting(&env_override);
42294227
const auto *global_mem_defs = &(acl_platform.device[physical_device_id]
42304228
.def.autodiscovery_def.global_mem_defs);
42314229

@@ -4725,7 +4723,7 @@ cl_int l_enqueue_mem_transfer(cl_command_queue command_queue, cl_bool blocking,
47254723
if (src_buffer->flags & CL_MEM_HOST_WRITE_ONLY ||
47264724
src_buffer->flags & CL_MEM_HOST_NO_ACCESS) {
47274725
ERR_RET(CL_INVALID_OPERATION, context,
4728-
"clEnqeueueReadBuffer cannot be called on a buffer "
4726+
"clEnqueueReadBuffer cannot be called on a buffer "
47294727
"created with CL_MEM_HOST_WRITE_ONLY or CL_MEM_HOST_NO_ACCESS");
47304728
}
47314729
break;
@@ -4743,7 +4741,7 @@ cl_int l_enqueue_mem_transfer(cl_command_queue command_queue, cl_bool blocking,
47434741
if (dst_buffer->flags & CL_MEM_HOST_READ_ONLY ||
47444742
dst_buffer->flags & CL_MEM_HOST_NO_ACCESS) {
47454743
ERR_RET(CL_INVALID_OPERATION, context,
4746-
"clEnqeueueWriteBuffer cannot be called on a buffer "
4744+
"clEnqueueWriteBuffer cannot be called on a buffer "
47474745
"created with CL_MEM_HOST_READ_ONLY or CL_MEM_HOST_NO_ACCESS");
47484746
}
47494747
break;
@@ -6583,6 +6581,99 @@ void acl_copy_device_buffers_from_host_after_programming(
65836581
}
65846582
}
65856583

6584+
// Simulator does not have any global memory interface information before
6585+
// reprogram, the runtime initializes device def to have the same global
6586+
// memory address range obtained from an autodiscovery string predefined in
6587+
// acl_shipped_board_cfgs.h
6588+
// When a buffer is created with the buffer location property specifying a
6589+
// global memory whose address range lies beyond the range defined in the
6590+
// default autodiscovery string, and is written before the device reprogram,
6591+
// the write will bind the buffer to the wrong address range, causing issues
6592+
// when running the kernel
6593+
// The following function does a memory copy for the buffers bound to the
6594+
// wrong address range to the right one after the global memory information
6595+
// becomes available and before the kernel launch
6596+
// Returns 1 on success and 0 on failure
6597+
int acl_realloc_buffer_for_simulator(cl_mem mem,
6598+
const unsigned int physical_device_id,
6599+
const unsigned int mem_id) {
6600+
// Only reallocate and migrate if mem resides in global memory
6601+
if (mem->block_allocation->region != &(acl_platform.global_mem)) {
6602+
return 1;
6603+
}
6604+
6605+
const acl_addr_range_t global_mem_range =
6606+
acl_platform.device[physical_device_id]
6607+
.def.autodiscovery_def.global_mem_defs[mem_id]
6608+
.get_usable_range();
6609+
6610+
// Save old address
6611+
int mem_on_host;
6612+
void *const old_mem_address = l_get_address_of_writable_copy(
6613+
mem, physical_device_id, &mem_on_host, CL_FALSE);
6614+
6615+
// The mem copy is only needed if the buffer is bound to the device
6616+
// before global memory range is confirmed (i.e., before reprogram), and
6617+
// the address range assumed before reprogram differs from the actual one.
6618+
// Therefore, check if:
6619+
// 1. allocation is not deferred (if deferred, auto migration will happen)
6620+
// 2. buffer is not on host (and no host pointer copy is pending)
6621+
// 3. buffer appears to be "at the destination"
6622+
// 4. block allocation is outside the global memory range
6623+
if (!mem->allocation_deferred &&
6624+
!(mem->mem_cpy_host_ptr_pending || mem_on_host) &&
6625+
(mem->block_allocation ==
6626+
mem->reserved_allocations[physical_device_id][mem_id]) &&
6627+
(ACL_STRIP_PHYSICAL_ID(mem->block_allocation->range.begin) >=
6628+
global_mem_range.next ||
6629+
ACL_STRIP_PHYSICAL_ID(mem->block_allocation->range.next) <
6630+
global_mem_range.begin)) {
6631+
6632+
// mem_id should align if block allocation is the same as reserved
6633+
// allocation
6634+
assert(mem->mem_id == mem_id);
6635+
6636+
// Okay to set this to NULL; the block is still tracked in mem->block_allocation
6637+
mem->reserved_allocations[physical_device_id][mem_id] = NULL;
6638+
// We will reallocate block, so remove it from linked list first
6639+
acl_block_allocation_t **block_ptr =
6640+
&(mem->block_allocation->region->first_block);
6641+
// try to find the mem->block_allocation in the linked list, error if
6642+
// the block is not found before reaching the end of list
6643+
while (true) {
6644+
acl_block_allocation_t *const block = *block_ptr;
6645+
assert(block != NULL);
6646+
if (block == mem->block_allocation) {
6647+
*block_ptr = block->next_block_in_region;
6648+
break;
6649+
}
6650+
// Advance to the next block in the region
6651+
block_ptr = &(block->next_block_in_region);
6652+
}
6653+
// Reallocate buffer range
6654+
if (!acl_do_physical_buffer_allocation(physical_device_id, mem)) {
6655+
return 0;
6656+
}
6657+
6658+
void *const new_mem_address =
6659+
mem->reserved_allocations[physical_device_id][mem_id]->range.begin;
6660+
const acl_hal_t *const hal = acl_get_hal();
6661+
6662+
#ifdef MEM_DEBUG_MSG
6663+
printf("reallocating mem obj for simulation after getting global mem "
6664+
"info, device %u ([0]%zx -> [0]%zx) ",
6665+
physical_device_id, (size_t)(ACL_STRIP_PHYSICAL_ID(old_mem_address)),
6666+
(size_t)(ACL_STRIP_PHYSICAL_ID(new_mem_address)));
6667+
#endif
6668+
6669+
// do blocking copy, this is for simulation only so performance is
6670+
// probably not a huge concern
6671+
hal->copy_globalmem_to_globalmem(0, old_mem_address, new_mem_address,
6672+
mem->size);
6673+
}
6674+
return 1;
6675+
}
6676+
65866677
static void acl_print_all_mem_in_region(acl_mem_region_t *region);
65876678
void acl_print_all_mem(void) {
65886679
acl_assert_locked();

test/acl_mem_test.cpp

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2844,6 +2844,125 @@ TEST(acl_mem, buffer_location_property) {
28442844
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(a));
28452845
}
28462846

2847+
TEST(acl_mem, simulation_copy_buffer) {
2848+
// Test mocks a simulation run where a predefined autodiscovery string
2849+
// is loaded at the beginning of the run with default global memory
2850+
// set-up that doesn't match actual. It checks whether the function
2851+
// acl_realloc_buffer_for_simulator moves buffer to the right global
2852+
// memory address range after a fake reprogram updates the global
2853+
// memory configuration.
2854+
cl_mem buffer;
2855+
cl_int status = CL_SUCCESS;
2856+
int input_data = 0xaaaaaaaa;
2857+
int output_data = 0x55555555;
2858+
size_t total_size = ACL_RANGE_SIZE(
2859+
m_device[0]->def.autodiscovery_def.global_mem_defs[0].range);
2860+
size_t global_mem_size = total_size / 2;
2861+
2862+
// save original autodiscovery def
2863+
acl_device_def_autodiscovery_t orig_def = m_device[0]->def.autodiscovery_def;
2864+
// create a fake multi global memory system where unit test global
2865+
// memory is split into 2 halves for the 2 global memories
2866+
acl_device_def_autodiscovery_t actual_def =
2867+
m_device[0]->def.autodiscovery_def;
2868+
actual_def.num_global_mem_systems = 2;
2869+
actual_def.global_mem_defs[1].range.next =
2870+
actual_def.global_mem_defs[0].range.next;
2871+
actual_def.global_mem_defs[0].range.next =
2872+
(char *)actual_def.global_mem_defs[0].range.begin + global_mem_size;
2873+
actual_def.global_mem_defs[1].range.begin =
2874+
actual_def.global_mem_defs[0].range.next;
2875+
2876+
// simulate loading from a predefined autodiscovery string in
2877+
// acl_shipped_board_cfgs.h
2878+
m_device[0]->def.autodiscovery_def.num_global_mem_systems =
2879+
ACL_MAX_GLOBAL_MEM;
2880+
for (int i = 0; i < ACL_MAX_GLOBAL_MEM; i++) {
2881+
m_device[0]->def.autodiscovery_def.global_mem_defs[i] =
2882+
actual_def.global_mem_defs[0];
2883+
}
2884+
2885+
// Create memory with buffer location property
2886+
cl_mem_properties_intel props[] = {CL_MEM_ALLOC_BUFFER_LOCATION_INTEL, 1, 0};
2887+
buffer = clCreateBufferWithPropertiesINTEL(m_context, props, 0, sizeof(int),
2888+
0, &status);
2889+
ACL_LOCKED(CHECK(acl_mem_is_valid(buffer)));
2890+
CHECK_EQUAL(CL_SUCCESS, status);
2891+
assert(buffer);
2892+
CHECK_EQUAL(1, acl_ref_count(buffer));
2893+
2894+
// Check if the buffer has the right mem id
2895+
cl_uint read_mem_id = 4; // set to a dummy value
2896+
size_t size_ret;
2897+
CHECK_EQUAL(CL_SUCCESS,
2898+
clGetMemObjectInfo(buffer, CL_MEM_ALLOC_BUFFER_LOCATION_INTEL,
2899+
sizeof(cl_uint), &read_mem_id, &size_ret));
2900+
CHECK_EQUAL(1, read_mem_id);
2901+
2902+
// Enqueue write binds buffer to wrong global memory address range
2903+
status = clEnqueueWriteBuffer(m_cq, buffer, CL_TRUE, 0, sizeof(int),
2904+
&input_data, 0, NULL, NULL);
2905+
CHECK_EQUAL(CL_SUCCESS, status);
2906+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) >=
2907+
m_device[0]
2908+
->def.autodiscovery_def.global_mem_defs[1]
2909+
.get_usable_range()
2910+
.begin);
2911+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.next) <
2912+
m_device[0]
2913+
->def.autodiscovery_def.global_mem_defs[1]
2914+
.get_usable_range()
2915+
.next);
2916+
2917+
// Pretend a reprogram happened for simulation, update global memory info
2918+
m_device[0]->def.autodiscovery_def = actual_def;
2919+
CHECK_EQUAL(2, m_device[0]->def.autodiscovery_def.num_global_mem_systems);
2920+
CHECK(m_device[0]
2921+
->def.autodiscovery_def.global_mem_defs[0]
2922+
.get_usable_range()
2923+
.begin != m_device[0]
2924+
->def.autodiscovery_def.global_mem_defs[1]
2925+
.get_usable_range()
2926+
.begin);
2927+
CHECK(m_device[0]
2928+
->def.autodiscovery_def.global_mem_defs[0]
2929+
.get_usable_range()
2930+
.next != m_device[0]
2931+
->def.autodiscovery_def.global_mem_defs[1]
2932+
.get_usable_range()
2933+
.next);
2934+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) <
2935+
m_device[0]
2936+
->def.autodiscovery_def.global_mem_defs[1]
2937+
.get_usable_range()
2938+
.begin);
2939+
2940+
// Now call the migration function
2941+
ACL_LOCKED(CHECK_EQUAL(acl_realloc_buffer_for_simulator(buffer, 0, 1), 1));
2942+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) >=
2943+
m_device[0]
2944+
->def.autodiscovery_def.global_mem_defs[1]
2945+
.get_usable_range()
2946+
.begin);
2947+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.next) <
2948+
m_device[0]
2949+
->def.autodiscovery_def.global_mem_defs[1]
2950+
.get_usable_range()
2951+
.next);
2952+
2953+
// Enqueue a blocking read to the right location and check data
2954+
status = clEnqueueReadBuffer(m_cq, buffer, CL_TRUE, 0, sizeof(int),
2955+
&output_data, 0, NULL, NULL);
2956+
CHECK_EQUAL(CL_SUCCESS, status);
2957+
2958+
// Check data preservation
2959+
CHECK_EQUAL(input_data, output_data);
2960+
2961+
// restore and clean up
2962+
m_device[0]->def.autodiscovery_def = orig_def;
2963+
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(buffer));
2964+
}
2965+
28472966
MT_TEST(acl_mem, map_buf_bad_flags) {
28482967
ACL_LOCKED(acl_print_debug_msg("begin buf_bad_flags\n"));
28492968
cl_int status = CL_SUCCESS;

0 commit comments

Comments
 (0)