Skip to content

Commit 9ff42de

Browse files
nawrinsusaiislam
authored and committed
[OpenMP] Add memory allocation using hwloc (llvm#132843)
This patch adds support for memory allocation using hwloc. To enable memory allocation using hwloc, the environment variable KMP_TOPOLOGY_METHOD=hwloc needs to be set. If hwloc is not supported/available, allocation will fall back to the default path.
1 parent 3b95899 commit 9ff42de

File tree

6 files changed

+159
-41
lines changed

6 files changed

+159
-41
lines changed

openmp/runtime/src/kmp.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1111,6 +1111,7 @@ extern omp_allocator_handle_t __kmp_def_allocator;
11111111
#endif
11121112

11131113
extern int __kmp_memkind_available;
1114+
extern bool __kmp_hwloc_available;
11141115

11151116
typedef struct kmp_memspace_t {
11161117
omp_memspace_handle_t memspace;
@@ -1127,6 +1128,9 @@ typedef struct kmp_allocator_t {
11271128
kmp_uint64 pool_size;
11281129
kmp_uint64 pool_used;
11291130
bool pinned;
1131+
#if KMP_USE_HWLOC
1132+
omp_alloctrait_value_t membind;
1133+
#endif
11301134
} kmp_allocator_t;
11311135

11321136
extern omp_memspace_handle_t

openmp/runtime/src/kmp_affinity.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1444,6 +1444,7 @@ void KMPAffinity::pick_api() {
14441444
if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
14451445
__kmp_affinity.type != affinity_disabled) {
14461446
affinity_dispatch = new KMPHwlocAffinity();
1447+
__kmp_hwloc_available = true;
14471448
} else
14481449
#endif
14491450
{

openmp/runtime/src/kmp_alloc.cpp

Lines changed: 126 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,20 @@
1414
#include "kmp_io.h"
1515
#include "kmp_wrapper_malloc.h"
1616

17+
#if KMP_USE_HWLOC
18+
#if HWLOC_API_VERSION > 0x00020300
19+
#define KMP_HWLOC_LOCATION_TYPE_CPUSET HWLOC_LOCATION_TYPE_CPUSET
20+
#elif HWLOC_API_VERSION == 0x00020300
21+
#define KMP_HWLOC_LOCATION_TYPE_CPUSET \
22+
hwloc_location::HWLOC_LOCATION_TYPE_CPUSET
23+
#else
24+
enum hwloc_memattr_id_e {
25+
HWLOC_MEMATTR_ID_BANDWIDTH,
26+
HWLOC_MEMATTR_ID_CAPACITY
27+
};
28+
#endif
29+
#endif // KMP_USE_HWLOC
30+
1731
// Disable bget when it is not used
1832
#if KMP_USE_BGET
1933

@@ -1358,6 +1372,74 @@ void __kmp_fini_memkind() {
13581372
#endif
13591373
}
13601374

1375+
#if KMP_USE_HWLOC
1376+
static bool __kmp_is_hwloc_membind_supported(hwloc_membind_policy_t policy) {
1377+
#if HWLOC_API_VERSION >= 0x00020300
1378+
const hwloc_topology_support *support;
1379+
support = hwloc_topology_get_support(__kmp_hwloc_topology);
1380+
if (support) {
1381+
if (policy == HWLOC_MEMBIND_BIND)
1382+
return (support->membind->alloc_membind &&
1383+
support->membind->bind_membind);
1384+
if (policy == HWLOC_MEMBIND_INTERLEAVE)
1385+
return (support->membind->alloc_membind &&
1386+
support->membind->interleave_membind);
1387+
}
1388+
return false;
1389+
#else
1390+
return false;
1391+
#endif
1392+
}
1393+
1394+
void *__kmp_hwloc_alloc_membind(hwloc_memattr_id_e attr, size_t size,
1395+
hwloc_membind_policy_t policy) {
1396+
#if HWLOC_API_VERSION >= 0x00020300
1397+
void *ptr = NULL;
1398+
hwloc_obj_t node;
1399+
struct hwloc_location initiator;
1400+
int ret;
1401+
// TODO: We should make this more efficient by getting rid of the OS syscall
1402+
// 'hwloc_bitmap_alloc' and 'hwloc_get_cpubind' to get affinity and instead
1403+
// use th_affin_mask field when it's capable of getting the underlying
1404+
// mask implementation.
1405+
hwloc_cpuset_t mask = hwloc_bitmap_alloc();
1406+
ret = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
1407+
if (ret < 0) {
1408+
hwloc_bitmap_free(mask);
1409+
return ptr;
1410+
}
1411+
initiator.type = KMP_HWLOC_LOCATION_TYPE_CPUSET;
1412+
initiator.location.cpuset = mask;
1413+
ret = hwloc_memattr_get_best_target(__kmp_hwloc_topology, attr, &initiator, 0,
1414+
&node, NULL);
1415+
if (ret < 0) {
1416+
return ptr;
1417+
}
1418+
return hwloc_alloc_membind(__kmp_hwloc_topology, size, node->nodeset, policy,
1419+
HWLOC_MEMBIND_BYNODESET);
1420+
#else
1421+
return NULL;
1422+
#endif
1423+
}
1424+
1425+
void *__kmp_hwloc_membind_policy(omp_memspace_handle_t ms, size_t size,
1426+
hwloc_membind_policy_t policy) {
1427+
#if HWLOC_API_VERSION >= 0x00020300
1428+
void *ptr = NULL;
1429+
if (ms == omp_high_bw_mem_space) {
1430+
ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH, size, policy);
1431+
} else if (ms == omp_large_cap_mem_space) {
1432+
ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY, size, policy);
1433+
} else {
1434+
ptr = hwloc_alloc(__kmp_hwloc_topology, size);
1435+
}
1436+
return ptr;
1437+
#else
1438+
return NULL;
1439+
#endif
1440+
}
1441+
#endif // KMP_USE_HWLOC
1442+
13611443
void __kmp_init_target_mem() {
13621444
*(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");
13631445
*(void **)(&kmp_target_alloc_shared) =
@@ -1470,6 +1552,13 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
14701552
al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);
14711553
break;
14721554
case omp_atk_partition:
1555+
#if KMP_USE_HWLOC
1556+
al->membind = (omp_alloctrait_value_t)traits[i].value;
1557+
KMP_DEBUG_ASSERT(al->membind == omp_atv_environment ||
1558+
al->membind == omp_atv_nearest ||
1559+
al->membind == omp_atv_blocked ||
1560+
al->membind == omp_atv_interleaved);
1561+
#endif
14731562
al->memkind = RCAST(void **, traits[i].value);
14741563
break;
14751564
default:
@@ -1524,7 +1613,8 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
15241613
__kmp_free(al);
15251614
return omp_null_allocator;
15261615
} else {
1527-
if (ms == omp_high_bw_mem_space) {
1616+
if (!__kmp_hwloc_available &&
1617+
(ms == omp_high_bw_mem_space || ms == omp_large_cap_mem_space)) {
15281618
// cannot detect HBW memory presence without memkind library
15291619
__kmp_free(al);
15301620
return omp_null_allocator;
@@ -1634,8 +1724,9 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
16341724
else if (allocator == ompx_pinned_mem_alloc)
16351725
is_pinned = true;
16361726

1637-
// Use default allocator if libmemkind is not available
1638-
int use_default_allocator = (__kmp_memkind_available) ? false : true;
1727+
// Use default allocator if hwloc and libmemkind are not available
1728+
int use_default_allocator =
1729+
(!__kmp_hwloc_available && !__kmp_memkind_available);
16391730

16401731
if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
16411732
// Use size input directly as the memory may not be accessible on host.
@@ -1740,24 +1831,6 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
17401831
}
17411832
}
17421833
} else if (allocator < kmp_max_mem_alloc) {
1743-
if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
1744-
// Use size input directly as the memory may not be accessible on host.
1745-
// Use default device for now.
1746-
if (__kmp_target_mem_available) {
1747-
kmp_int32 device =
1748-
__kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1749-
if (allocator == llvm_omp_target_host_mem_alloc)
1750-
ptr = kmp_target_alloc_host(size, device);
1751-
else if (allocator == llvm_omp_target_shared_mem_alloc)
1752-
ptr = kmp_target_alloc_shared(size, device);
1753-
else // allocator == llvm_omp_target_device_mem_alloc
1754-
ptr = kmp_target_alloc_device(size, device);
1755-
if (is_pinned && kmp_target_lock_mem)
1756-
kmp_target_lock_mem(ptr, size, device);
1757-
}
1758-
return ptr;
1759-
}
1760-
17611834
// pre-defined allocator
17621835
if (allocator == omp_high_bw_mem_alloc) {
17631836
KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");
@@ -1953,34 +2026,48 @@ void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
19532026
kmp_target_unlock_mem(desc.ptr_alloc, device);
19542027
}
19552028

1956-
if (__kmp_memkind_available) {
1957-
if (oal < kmp_max_mem_alloc) {
1958-
// pre-defined allocator
1959-
if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
1960-
kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
1961-
} else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
1962-
kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
2029+
#if KMP_USE_HWLOC
2030+
if (__kmp_hwloc_available) {
2031+
if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
2032+
kmp_uint64 used =
2033+
KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2034+
(void)used; // to suppress compiler warning
2035+
KMP_DEBUG_ASSERT(used >= desc.size_a);
2036+
}
2037+
hwloc_free(__kmp_hwloc_topology, desc.ptr_alloc, desc.size_a);
2038+
} else {
2039+
#endif
2040+
if (__kmp_memkind_available) {
2041+
if (oal < kmp_max_mem_alloc) {
2042+
// pre-defined allocator
2043+
if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
2044+
kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
2045+
} else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
2046+
kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
2047+
} else {
2048+
kmp_mk_free(*mk_default, desc.ptr_alloc);
2049+
}
19632050
} else {
1964-
kmp_mk_free(*mk_default, desc.ptr_alloc);
2051+
if (al->pool_size > 0) { // custom allocator with pool size requested
2052+
kmp_uint64 used =
2053+
KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2054+
(void)used; // to suppress compiler warning
2055+
KMP_DEBUG_ASSERT(used >= desc.size_a);
2056+
}
2057+
kmp_mk_free(*al->memkind, desc.ptr_alloc);
19652058
}
19662059
} else {
1967-
if (al->pool_size > 0) { // custom allocator with pool size requested
2060+
if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
19682061
kmp_uint64 used =
19692062
KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
19702063
(void)used; // to suppress compiler warning
19712064
KMP_DEBUG_ASSERT(used >= desc.size_a);
19722065
}
1973-
kmp_mk_free(*al->memkind, desc.ptr_alloc);
2066+
__kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
19742067
}
1975-
} else {
1976-
if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
1977-
kmp_uint64 used =
1978-
KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1979-
(void)used; // to suppress compiler warning
1980-
KMP_DEBUG_ASSERT(used >= desc.size_a);
1981-
}
1982-
__kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
2068+
#if KMP_USE_HWLOC
19832069
}
2070+
#endif
19842071
}
19852072

19862073
/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes

openmp/runtime/src/kmp_global.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ kmp_int32 __kmp_max_task_priority = 0;
296296
kmp_uint64 __kmp_taskloop_min_tasks = 0;
297297

298298
int __kmp_memkind_available = 0;
299+
bool __kmp_hwloc_available = false;
299300
omp_allocator_handle_t const omp_null_allocator = NULL;
300301
omp_allocator_handle_t const omp_default_mem_alloc =
301302
(omp_allocator_handle_t const)1;

openmp/runtime/src/kmp_settings.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3773,7 +3773,7 @@ static void __kmp_stg_parse_allocator(char const *name, char const *value,
37733773
if (__kmp_match_str("omp_high_bw_mem_alloc", scan, &next)) {
37743774
SKIP_WS(next);
37753775
if (is_memalloc) {
3776-
if (__kmp_memkind_available) {
3776+
if (__kmp_hwloc_available || __kmp_memkind_available) {
37773777
__kmp_def_allocator = omp_high_bw_mem_alloc;
37783778
return;
37793779
} else {
@@ -3786,7 +3786,7 @@ static void __kmp_stg_parse_allocator(char const *name, char const *value,
37863786
} else if (__kmp_match_str("omp_large_cap_mem_alloc", scan, &next)) {
37873787
SKIP_WS(next);
37883788
if (is_memalloc) {
3789-
if (__kmp_memkind_available) {
3789+
if (__kmp_hwloc_available || __kmp_memkind_available) {
37903790
__kmp_def_allocator = omp_large_cap_mem_alloc;
37913791
return;
37923792
} else {
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// RUN: %libomp-compile && env KMP_TOPOLOGY_METHOD=hwloc %libomp-run
2+
// REQUIRES: hwloc
3+
4+
#include <stdio.h>
5+
#include <omp.h>
6+
7+
int main() {
8+
void *p[2];
9+
#pragma omp parallel num_threads(2)
10+
{
11+
int i = omp_get_thread_num();
12+
p[i] = omp_alloc(1024 * 1024, omp_get_default_allocator());
13+
#pragma omp barrier
14+
printf("th %d, ptr %p\n", i, p[i]);
15+
omp_free(p[i], omp_get_default_allocator());
16+
}
17+
// Both pointers should be non-NULL
18+
if (p[0] != NULL && p[1] != NULL) {
19+
printf("passed\n");
20+
return 0;
21+
} else {
22+
printf("failed: pointers %p %p\n", p[0], p[1]);
23+
return 1;
24+
}
25+
}

0 commit comments

Comments
 (0)