Skip to content

Commit f4c11e2

Browse files
committed
Address comment and add unit test for the new function
1 parent 22cbb66 commit f4c11e2

File tree

2 files changed

+199
-66
lines changed

2 files changed

+199
-66
lines changed

src/acl_mem.cpp

Lines changed: 84 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -4723,7 +4723,7 @@ cl_int l_enqueue_mem_transfer(cl_command_queue command_queue, cl_bool blocking,
47234723
if (src_buffer->flags & CL_MEM_HOST_WRITE_ONLY ||
47244724
src_buffer->flags & CL_MEM_HOST_NO_ACCESS) {
47254725
ERR_RET(CL_INVALID_OPERATION, context,
4726-
"clEnqeueueReadBuffer cannot be called on a buffer "
4726+
"clEnqueueReadBuffer cannot be called on a buffer "
47274727
"created with CL_MEM_HOST_WRITE_ONLY or CL_MEM_HOST_NO_ACCESS");
47284728
}
47294729
break;
@@ -4741,7 +4741,7 @@ cl_int l_enqueue_mem_transfer(cl_command_queue command_queue, cl_bool blocking,
47414741
if (dst_buffer->flags & CL_MEM_HOST_READ_ONLY ||
47424742
dst_buffer->flags & CL_MEM_HOST_NO_ACCESS) {
47434743
ERR_RET(CL_INVALID_OPERATION, context,
4744-
"clEnqeueueWriteBuffer cannot be called on a buffer "
4744+
"clEnqueueWriteBuffer cannot be called on a buffer "
47454745
"created with CL_MEM_HOST_READ_ONLY or CL_MEM_HOST_NO_ACCESS");
47464746
}
47474747
break;
@@ -6581,80 +6581,98 @@ void acl_copy_device_buffers_from_host_after_programming(
65816581
}
65826582
}
65836583

6584+
// Simulator does not have any global memory interface information before
6585+
// reprogram, the runtime initializes device def to have the same global
6586+
// memory address range obtained from an autodiscovery string predefined in
6587+
// acl_shipped_board_cfgs.h
6588+
// When a buffer is created with the buffer location property specifying a
6589+
// global memory whose address range lies beyond the range defined in the
6590+
// default autodiscovery string, and is written before the device reprogram,
6591+
// the write will bind the buffer to the wrong address range, causing issues
6592+
// when running the kernel
6593+
// The following function does a memory copy for the buffers bound to the
6594+
// wrong address range to the right one after the global memory information
6595+
// becomes available and before the kernel launch
6596+
// Returns 1 on success and 0 on failure
65846597
int acl_realloc_buffer_for_simulator(cl_mem mem,
65856598
const unsigned int physical_device_id,
65866599
const unsigned int mem_id) {
65876600
// Only reallocate and migrate if mem resides in global memory
6588-
if (mem->block_allocation->region == &(acl_platform.global_mem)) {
6589-
int mem_on_host;
6590-
void *old_mem_address;
6591-
void *new_mem_address;
6592-
6593-
acl_addr_range_t global_mem_range =
6594-
acl_platform.device[physical_device_id]
6595-
.def.autodiscovery_def.global_mem_defs[mem_id]
6596-
.get_usable_range();
6597-
6598-
// Save old address
6599-
old_mem_address = l_get_address_of_writable_copy(mem, physical_device_id,
6600-
&mem_on_host, CL_FALSE);
6601-
6602-
// The mem migration is only needed if the buffer is binded to the device
6603-
// before global memory range is confirmed (i.e., before reprogram), and
6604-
// assumed address range before reprogram is different from actual
6605-
// Therefore, check if:
6606-
// 1. allocation is deferred (if so auto migration will happen)
6607-
// 2. buffer is on host
6608-
// 3. buffer appears to be "at the destination"
6609-
// 4. block allocation is outside the global memory range
6610-
if (!mem->allocation_deferred &&
6611-
!(mem->mem_cpy_host_ptr_pending || mem_on_host) &&
6612-
(mem->block_allocation ==
6613-
mem->reserved_allocations[physical_device_id][mem_id]) &&
6614-
(mem->block_allocation->range.begin >= global_mem_range.next ||
6615-
mem->block_allocation->range.next < global_mem_range.begin)) {
6616-
6617-
// mem_id should align if block allocation is the same as reserved
6618-
// allocation
6619-
assert(mem->mem_id == mem_id);
6620-
6621-
// Okay to set this to NULL, memory tracked in mem->block_allocation
6622-
mem->reserved_allocations[physical_device_id][mem_id] = NULL;
6623-
// We will reallocate block, so remove it from linked list first
6624-
acl_block_allocation_t **block_ptr =
6625-
&(mem->block_allocation->region->first_block);
6626-
acl_block_allocation_t *block = *block_ptr;
6627-
assert(block != NULL); // Should be at least one block
6628-
while (*block_ptr) {
6629-
if (block == mem->block_allocation) {
6630-
*block_ptr = block->next_block_in_region;
6631-
break;
6632-
}
6633-
// Advance to the next block in the region
6634-
block_ptr = &(block->next_block_in_region);
6635-
block = *block_ptr;
6636-
}
6637-
if (!acl_do_physical_buffer_allocation(physical_device_id, mem)) {
6638-
return 0;
6601+
if (mem->block_allocation->region != &(acl_platform.global_mem)) {
6602+
return 1;
6603+
}
6604+
6605+
int mem_on_host;
6606+
void *old_mem_address;
6607+
void *new_mem_address;
6608+
6609+
acl_addr_range_t global_mem_range =
6610+
acl_platform.device[physical_device_id]
6611+
.def.autodiscovery_def.global_mem_defs[mem_id]
6612+
.get_usable_range();
6613+
6614+
// Save old address
6615+
old_mem_address = l_get_address_of_writable_copy(mem, physical_device_id,
6616+
&mem_on_host, CL_FALSE);
6617+
6618+
// The mem copy is only needed if the buffer is bound to the device
6619+
// before global memory range is confirmed (i.e., before reprogram), and
6620+
// assumed address range before reprogram is different from actual
6621+
// Therefore, check if:
6622+
// 1. allocation is not deferred (if deferred, auto migration will happen)
6623+
// 2. buffer is not on host and no host pointer copy is pending
6624+
// 3. buffer appears to be "at the destination"
6625+
// 4. block allocation is outside the global memory range
6626+
if (!mem->allocation_deferred &&
6627+
!(mem->mem_cpy_host_ptr_pending || mem_on_host) &&
6628+
(mem->block_allocation ==
6629+
mem->reserved_allocations[physical_device_id][mem_id]) &&
6630+
(ACL_STRIP_PHYSICAL_ID(mem->block_allocation->range.begin) >=
6631+
global_mem_range.next ||
6632+
ACL_STRIP_PHYSICAL_ID(mem->block_allocation->range.next) <
6633+
global_mem_range.begin)) {
6634+
6635+
// mem_id should align if block allocation is the same as reserved
6636+
// allocation
6637+
assert(mem->mem_id == mem_id);
6638+
6639+
// Okay to set this to NULL, memory tracked in mem->block_allocation
6640+
mem->reserved_allocations[physical_device_id][mem_id] = NULL;
6641+
// We will reallocate block, so remove it from linked list first
6642+
acl_block_allocation_t **block_ptr =
6643+
&(mem->block_allocation->region->first_block);
6644+
// try to find the mem->block_allocation in the linked list, error if
6645+
// the block is not found before reaching the end of list
6646+
while (true) {
6647+
acl_block_allocation_t *const block = *block_ptr;
6648+
assert(block != NULL);
6649+
if (block == mem->block_allocation) {
6650+
*block_ptr = block->next_block_in_region;
6651+
break;
66396652
}
6653+
// Advance to the next block in the region
6654+
block_ptr = &(block->next_block_in_region);
6655+
}
6656+
// Reallocate buffer range
6657+
if (!acl_do_physical_buffer_allocation(physical_device_id, mem)) {
6658+
return 0;
6659+
}
66406660

6641-
new_mem_address =
6642-
mem->reserved_allocations[physical_device_id][mem_id]->range.begin;
6643-
const acl_hal_t *const hal = acl_get_hal();
6661+
new_mem_address =
6662+
mem->reserved_allocations[physical_device_id][mem_id]->range.begin;
6663+
const acl_hal_t *const hal = acl_get_hal();
66446664

66456665
#ifdef MEM_DEBUG_MSG
6646-
printf("reallocating mem obj for simulation after getting global mem "
6647-
"info, device %u ([0]%zx -> [0]%zx) ",
6648-
physical_device_id,
6649-
(size_t)(ACL_STRIP_PHYSICAL_ID(old_mem_address)),
6650-
(size_t)(ACL_STRIP_PHYSICAL_ID(new_mem_address)));
6666+
printf("reallocating mem obj for simulation after getting global mem "
6667+
"info, device %u ([0]%zx -> [0]%zx) ",
6668+
physical_device_id, (size_t)(ACL_STRIP_PHYSICAL_ID(old_mem_address)),
6669+
(size_t)(ACL_STRIP_PHYSICAL_ID(new_mem_address)));
66516670
#endif
66526671

6653-
// do blocking copy, this is for simulation only so performance is
6654-
// probably not a huge concern
6655-
hal->copy_globalmem_to_globalmem(0, old_mem_address, new_mem_address,
6656-
mem->size);
6657-
}
6672+
// do blocking copy, this is for simulation only so performance is
6673+
// probably not a huge concern
6674+
hal->copy_globalmem_to_globalmem(0, old_mem_address, new_mem_address,
6675+
mem->size);
66586676
}
66596677
return 1;
66606678
}

test/acl_mem_test.cpp

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2844,6 +2844,121 @@ TEST(acl_mem, buffer_location_property) {
28442844
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(a));
28452845
}
28462846

2847+
TEST(acl_mem, simulation_copy_buffer) {
2848+
ACL_LOCKED(acl_print_debug_msg("begin simulation_copy_buffer\n"));
2849+
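// Checks that a buffer bound to the wrong global memory address range before a (pretend) reprogram is moved into the right range by acl_realloc_buffer_for_simulator, and that its data is preserved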
2850+
cl_mem buffer;
2851+
cl_int status = CL_SUCCESS;
2852+
int input_data = 0xaaaaaaaa;
2853+
int output_data = 0x55555555;
2854+
size_t total_size = ACL_RANGE_SIZE(
2855+
m_device[0]->def.autodiscovery_def.global_mem_defs[0].range);
2856+
size_t global_mem_size = total_size / 2;
2857+
2858+
// save original autodiscovery def
2859+
acl_device_def_autodiscovery_t orig_def = m_device[0]->def.autodiscovery_def;
2860+
// create a fake multi global memory system where acltest_global is
2861+
// split into 2 halves for the 2 global memories
2862+
acl_device_def_autodiscovery_t actual_def =
2863+
m_device[0]->def.autodiscovery_def;
2864+
actual_def.num_global_mem_systems = 2;
2865+
actual_def.global_mem_defs[1].range.next =
2866+
actual_def.global_mem_defs[0].range.next;
2867+
actual_def.global_mem_defs[0].range.next =
2868+
(char *)actual_def.global_mem_defs[0].range.begin + global_mem_size;
2869+
actual_def.global_mem_defs[1].range.begin =
2870+
actual_def.global_mem_defs[0].range.next;
2871+
2872+
// simulate loading from a predefined autodiscovery string in
2873+
// acl_shipped_board_cfgs.h
2874+
m_device[0]->def.autodiscovery_def.num_global_mem_systems =
2875+
ACL_MAX_GLOBAL_MEM;
2876+
for (int i = 0; i < ACL_MAX_GLOBAL_MEM; i++) {
2877+
m_device[0]->def.autodiscovery_def.global_mem_defs[i] =
2878+
actual_def.global_mem_defs[0];
2879+
}
2880+
2881+
// Create memory with buffer location property
2882+
cl_mem_properties_intel props[] = {CL_MEM_ALLOC_BUFFER_LOCATION_INTEL, 1, 0};
2883+
buffer = clCreateBufferWithPropertiesINTEL(m_context, props, 0, sizeof(int),
2884+
0, &status);
2885+
ACL_LOCKED(CHECK(acl_mem_is_valid(buffer)));
2886+
CHECK_EQUAL(CL_SUCCESS, status);
2887+
assert(buffer);
2888+
CHECK_EQUAL(1, acl_ref_count(buffer));
2889+
2890+
// Check if the buffer has the right mem id
2891+
cl_uint read_mem_id = 4; // set to a dummy value
2892+
size_t size_ret;
2893+
CHECK_EQUAL(CL_SUCCESS,
2894+
clGetMemObjectInfo(buffer, CL_MEM_ALLOC_BUFFER_LOCATION_INTEL,
2895+
sizeof(cl_uint), &read_mem_id, &size_ret));
2896+
CHECK_EQUAL(1, read_mem_id);
2897+
2898+
// Enqueue write binds buffer to wrong global memory address range
2899+
status = clEnqueueWriteBuffer(m_cq, buffer, CL_TRUE, 0, sizeof(int),
2900+
&input_data, 0, NULL, NULL);
2901+
CHECK_EQUAL(CL_SUCCESS, status);
2902+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) >=
2903+
m_device[0]
2904+
->def.autodiscovery_def.global_mem_defs[1]
2905+
.get_usable_range()
2906+
.begin);
2907+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.next) <
2908+
m_device[0]
2909+
->def.autodiscovery_def.global_mem_defs[1]
2910+
.get_usable_range()
2911+
.next);
2912+
2913+
// Pretend a reprogram happened for simulation, update global memory info
2914+
m_device[0]->def.autodiscovery_def = actual_def;
2915+
CHECK_EQUAL(2, m_device[0]->def.autodiscovery_def.num_global_mem_systems);
2916+
CHECK(m_device[0]
2917+
->def.autodiscovery_def.global_mem_defs[0]
2918+
.get_usable_range()
2919+
.begin != m_device[0]
2920+
->def.autodiscovery_def.global_mem_defs[1]
2921+
.get_usable_range()
2922+
.begin);
2923+
CHECK(m_device[0]
2924+
->def.autodiscovery_def.global_mem_defs[0]
2925+
.get_usable_range()
2926+
.next != m_device[0]
2927+
->def.autodiscovery_def.global_mem_defs[1]
2928+
.get_usable_range()
2929+
.next);
2930+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) <
2931+
m_device[0]
2932+
->def.autodiscovery_def.global_mem_defs[1]
2933+
.get_usable_range()
2934+
.begin);
2935+
2936+
// Now call the migration function
2937+
ACL_LOCKED(CHECK_EQUAL(acl_realloc_buffer_for_simulator(buffer, 0, 1), 1));
2938+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.begin) >=
2939+
m_device[0]
2940+
->def.autodiscovery_def.global_mem_defs[1]
2941+
.get_usable_range()
2942+
.begin);
2943+
CHECK(ACL_STRIP_PHYSICAL_ID(buffer->block_allocation->range.next) <
2944+
m_device[0]
2945+
->def.autodiscovery_def.global_mem_defs[1]
2946+
.get_usable_range()
2947+
.next);
2948+
2949+
// Enqueue a blocking read to the right location and check data
2950+
status = clEnqueueReadBuffer(m_cq, buffer, CL_TRUE, 0, sizeof(int),
2951+
&output_data, 0, NULL, NULL);
2952+
CHECK_EQUAL(CL_SUCCESS, status);
2953+
2954+
// Check data preservation
2955+
CHECK_EQUAL(input_data, output_data);
2956+
2957+
// restore and clean up
2958+
m_device[0]->def.autodiscovery_def = orig_def;
2959+
CHECK_EQUAL(CL_SUCCESS, clReleaseMemObject(buffer));
2960+
}
2961+
28472962
MT_TEST(acl_mem, map_buf_bad_flags) {
28482963
ACL_LOCKED(acl_print_debug_msg("begin buf_bad_flags\n"));
28492964
cl_int status = CL_SUCCESS;

0 commit comments

Comments
 (0)