@@ -14,6 +14,20 @@
 #include "kmp_io.h"
 #include "kmp_wrapper_malloc.h"
 
+#if KMP_USE_HWLOC
+#if HWLOC_API_VERSION > 0x00020300
+#define KMP_HWLOC_LOCATION_TYPE_CPUSET HWLOC_LOCATION_TYPE_CPUSET
+#elif HWLOC_API_VERSION == 0x00020300
+#define KMP_HWLOC_LOCATION_TYPE_CPUSET \
+  hwloc_location::HWLOC_LOCATION_TYPE_CPUSET
+#else
+enum hwloc_memattr_id_e {
+  HWLOC_MEMATTR_ID_BANDWIDTH,
+  HWLOC_MEMATTR_ID_CAPACITY
+};
+#endif
+#endif // KMP_USE_HWLOC
+
 // Disable bget when it is not used
 #if KMP_USE_BGET
 
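A note on the guards above: hwloc encodes HWLOC_API_VERSION as 0xXXYYZZ for release XX.YY.ZZ, so 0x00020300 is hwloc 2.3.0, the release that introduced the memory-attributes API. The `hwloc_location::` scoping appears to work around the 2.3.0 headers declaring the location-type enumerators inside `struct hwloc_location`, and the fallback `enum hwloc_memattr_id_e` exists only so the file still compiles against pre-2.3 headers (every function that uses it is compiled out there). Below is a minimal sketch of the compile-time/runtime version check that hwloc documents, assuming only a standard hwloc install; it is not code from this patch:

```cpp
#include <hwloc.h>
#include <cstdio>

int main() {
  // hwloc_get_api_version() reports the version of the loaded library,
  // which can differ from the compile-time HWLOC_API_VERSION; hwloc
  // recommends comparing the major/minor halves before using it.
  unsigned lib = hwloc_get_api_version();
  if ((lib >> 16) != (HWLOC_API_VERSION >> 16)) {
    std::fprintf(stderr, "hwloc ABI mismatch: built 0x%x, running 0x%x\n",
                 HWLOC_API_VERSION, lib);
    return 1;
  }
  std::printf("hwloc %u.%u.%u\n", lib >> 16, (lib >> 8) & 0xff, lib & 0xff);
  return 0;
}
```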
@@ -1358,6 +1372,74 @@ void __kmp_fini_memkind() {
 #endif
 }
 
+#if KMP_USE_HWLOC
+static bool __kmp_is_hwloc_membind_supported(hwloc_membind_policy_t policy) {
+#if HWLOC_API_VERSION >= 0x00020300
+  const hwloc_topology_support *support;
+  support = hwloc_topology_get_support(__kmp_hwloc_topology);
+  if (support) {
+    if (policy == HWLOC_MEMBIND_BIND)
+      return (support->membind->alloc_membind &&
+              support->membind->bind_membind);
+    if (policy == HWLOC_MEMBIND_INTERLEAVE)
+      return (support->membind->alloc_membind &&
+              support->membind->interleave_membind);
+  }
+  return false;
+#else
+  return false;
+#endif
+}
+
+void *__kmp_hwloc_alloc_membind(hwloc_memattr_id_e attr, size_t size,
+                                hwloc_membind_policy_t policy) {
+#if HWLOC_API_VERSION >= 0x00020300
+  void *ptr = NULL;
+  hwloc_obj_t node;
+  struct hwloc_location initiator;
+  int ret;
+  // TODO: We should make this more efficient by getting rid of the OS
+  // syscalls 'hwloc_bitmap_alloc' and 'hwloc_get_cpubind' used to get the
+  // affinity, and instead use the th_affin_mask field once it can expose
+  // the underlying mask implementation.
+  hwloc_cpuset_t mask = hwloc_bitmap_alloc();
+  ret = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
+  if (ret < 0) {
+    hwloc_bitmap_free(mask);
+    return ptr;
+  }
+  initiator.type = KMP_HWLOC_LOCATION_TYPE_CPUSET;
+  initiator.location.cpuset = mask;
+  ret = hwloc_memattr_get_best_target(__kmp_hwloc_topology, attr, &initiator,
+                                      0, &node, NULL);
+  if (ret < 0) {
+    return ptr;
+  }
+  return hwloc_alloc_membind(__kmp_hwloc_topology, size, node->nodeset, policy,
+                             HWLOC_MEMBIND_BYNODESET);
+#else
+  return NULL;
+#endif
+}
+
+void *__kmp_hwloc_membind_policy(omp_memspace_handle_t ms, size_t size,
+                                 hwloc_membind_policy_t policy) {
+#if HWLOC_API_VERSION >= 0x00020300
+  void *ptr = NULL;
+  if (ms == omp_high_bw_mem_space) {
+    ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH, size, policy);
+  } else if (ms == omp_large_cap_mem_space) {
+    ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY, size, policy);
+  } else {
+    ptr = hwloc_alloc(__kmp_hwloc_topology, size);
+  }
+  return ptr;
+#else
+  return NULL;
+#endif
+}
+#endif // KMP_USE_HWLOC
+
 void __kmp_init_target_mem() {
   *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");
   *(void **)(&kmp_target_alloc_shared) =
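One caveat in `__kmp_hwloc_alloc_membind` above: the cpuset returned by `hwloc_bitmap_alloc` is freed on the `hwloc_get_cpubind` failure path, but not when `hwloc_memattr_get_best_target` fails, nor after a successful allocation, even though the mask is no longer needed once the best-target lookup has run. Below is a standalone sketch of the same lookup that releases the bitmap on every path; it assumes hwloc >= 2.3 (2.3.0 itself would need the `hwloc_location::` scoping from the shim above), and `alloc_near_me` is an illustrative name, not runtime code:

```cpp
#include <hwloc.h>

// Illustrative only: allocate near the calling thread according to a
// memory attribute (e.g. HWLOC_MEMATTR_ID_BANDWIDTH), freeing the
// cpuset on every exit path.
static void *alloc_near_me(hwloc_topology_t topo, hwloc_memattr_id_t attr,
                           size_t size, hwloc_membind_policy_t policy) {
  hwloc_cpuset_t mask = hwloc_bitmap_alloc();
  if (!mask)
    return NULL;
  void *ptr = NULL;
  if (hwloc_get_cpubind(topo, mask, HWLOC_CPUBIND_THREAD) == 0) {
    struct hwloc_location initiator;
    initiator.type = HWLOC_LOCATION_TYPE_CPUSET;
    initiator.location.cpuset = mask;
    hwloc_obj_t node = NULL;
    if (hwloc_memattr_get_best_target(topo, attr, &initiator, 0, &node,
                                      NULL) == 0 &&
        node)
      ptr = hwloc_alloc_membind(topo, size, node->nodeset, policy,
                                HWLOC_MEMBIND_BYNODESET);
  }
  hwloc_bitmap_free(mask); // freed on success and failure alike
  return ptr;
}

int main() {
  hwloc_topology_t topo;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);
  void *p = alloc_near_me(topo, HWLOC_MEMATTR_ID_BANDWIDTH, 1 << 20,
                          HWLOC_MEMBIND_BIND);
  if (p)
    hwloc_free(topo, p, 1 << 20);
  hwloc_topology_destroy(topo);
  return 0;
}
```

`hwloc_alloc_membind` returns NULL on failure, so the only extra obligation for the caller here is the bitmap's lifetime.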
@@ -1470,6 +1552,13 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
       al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);
       break;
     case omp_atk_partition:
+#if KMP_USE_HWLOC
+      al->membind = (omp_alloctrait_value_t)traits[i].value;
+      KMP_DEBUG_ASSERT(al->membind == omp_atv_environment ||
+                       al->membind == omp_atv_nearest ||
+                       al->membind == omp_atv_blocked ||
+                       al->membind == omp_atv_interleaved);
+#endif
       al->memkind = RCAST(void **, traits[i].value);
       break;
     default:
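For reference, the partition trait validated above reaches `__kmpc_init_allocator` from user code through the standard OpenMP 5.x allocator API; a minimal caller might look like this (standard API only, nothing specific to this patch):

```cpp
#include <omp.h>

int main() {
  // Ask for an interleaved partition of the high-bandwidth memspace;
  // the runtime records this trait in al->membind (per the hunk above).
  omp_alloctrait_t traits[] = {{omp_atk_partition, omp_atv_interleaved}};
  omp_allocator_handle_t al =
      omp_init_allocator(omp_high_bw_mem_space, 1, traits);
  if (al == omp_null_allocator)
    return 1; // memspace unavailable (no hwloc/memkind support detected)
  double *buf = (double *)omp_alloc(1024 * sizeof(double), al);
  omp_free(buf, al);
  omp_destroy_allocator(al);
  return 0;
}
```

When neither hwloc nor memkind is available, the hunk at line 1613 below makes this call return omp_null_allocator for both the high-bandwidth and large-capacity memspaces rather than silently handing back default memory.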
@@ -1524,7 +1613,8 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
     __kmp_free(al);
     return omp_null_allocator;
   } else {
-    if (ms == omp_high_bw_mem_space) {
+    if (!__kmp_hwloc_available &&
+        (ms == omp_high_bw_mem_space || ms == omp_large_cap_mem_space)) {
       // cannot detect HBW memory presence without memkind library
       __kmp_free(al);
       return omp_null_allocator;
@@ -1634,8 +1724,9 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
   else if (allocator == ompx_pinned_mem_alloc)
     is_pinned = true;
 
-  // Use default allocator if libmemkind is not available
-  int use_default_allocator = (__kmp_memkind_available) ? false : true;
+  // Use default allocator if hwloc and libmemkind are not available
+  int use_default_allocator =
+      (!__kmp_hwloc_available && !__kmp_memkind_available);
 
   if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
     // Use size input directly as the memory may not be accessible on host.
@@ -1740,24 +1831,6 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size,
       }
     }
   } else if (allocator < kmp_max_mem_alloc) {
-    if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
-      // Use size input directly as the memory may not be accessible on host.
-      // Use default device for now.
-      if (__kmp_target_mem_available) {
-        kmp_int32 device =
-            __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
-        if (allocator == llvm_omp_target_host_mem_alloc)
-          ptr = kmp_target_alloc_host(size, device);
-        else if (allocator == llvm_omp_target_shared_mem_alloc)
-          ptr = kmp_target_alloc_shared(size, device);
-        else // allocator == llvm_omp_target_device_mem_alloc
-          ptr = kmp_target_alloc_device(size, device);
-        if (is_pinned && kmp_target_lock_mem)
-          kmp_target_lock_mem(ptr, size, device);
-      }
-      return ptr;
-    }
-
     // pre-defined allocator
     if (allocator == omp_high_bw_mem_alloc) {
       KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");
@@ -1953,34 +2026,48 @@ void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
       kmp_target_unlock_mem(desc.ptr_alloc, device);
   }
 
-  if (__kmp_memkind_available) {
-    if (oal < kmp_max_mem_alloc) {
-      // pre-defined allocator
-      if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
-        kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
-      } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
-        kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
+#if KMP_USE_HWLOC
+  if (__kmp_hwloc_available) {
+    if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
+      kmp_uint64 used =
+          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
+      (void)used; // to suppress compiler warning
+      KMP_DEBUG_ASSERT(used >= desc.size_a);
+    }
+    hwloc_free(__kmp_hwloc_topology, desc.ptr_alloc, desc.size_a);
+  } else {
+#endif
+    if (__kmp_memkind_available) {
+      if (oal < kmp_max_mem_alloc) {
+        // pre-defined allocator
+        if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
+          kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
+        } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
+          kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
+        } else {
+          kmp_mk_free(*mk_default, desc.ptr_alloc);
+        }
       } else {
-        kmp_mk_free(*mk_default, desc.ptr_alloc);
+        if (al->pool_size > 0) { // custom allocator with pool size requested
+          kmp_uint64 used =
+              KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
+          (void)used; // to suppress compiler warning
+          KMP_DEBUG_ASSERT(used >= desc.size_a);
+        }
+        kmp_mk_free(*al->memkind, desc.ptr_alloc);
       }
     } else {
-      if (al->pool_size > 0) { // custom allocator with pool size requested
+      if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
        kmp_uint64 used =
            KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
        (void)used; // to suppress compiler warning
        KMP_DEBUG_ASSERT(used >= desc.size_a);
      }
-      kmp_mk_free(*al->memkind, desc.ptr_alloc);
+      __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
    }
-  } else {
-    if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
-      kmp_uint64 used =
-          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
-      (void)used; // to suppress compiler warning
-      KMP_DEBUG_ASSERT(used >= desc.size_a);
-    }
-    __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
+#if KMP_USE_HWLOC
   }
+#endif
 }
 
 /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
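Two details in the free path above are easy to miss: `hwloc_free`, unlike plain `free`, requires the allocation length, which is why `desc.size_a` is passed along with the pointer; and `KMP_TEST_THEN_ADD64` is a fetch-and-add that returns the counter's value *before* the addition, which is what makes `KMP_DEBUG_ASSERT(used >= desc.size_a)` the right underflow check after the decrement. A minimal sketch of that reserve/release accounting pattern with standard C++ atomics (type and function names hypothetical, not runtime code):

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

struct Pool {
  std::atomic<std::uint64_t> used{0};
  std::uint64_t capacity = 0;
};

// Reserve sz bytes; roll back and report failure if the pool is full.
bool pool_reserve(Pool &p, std::uint64_t sz) {
  std::uint64_t before = p.used.fetch_add(sz); // value *before* the add
  if (before + sz > p.capacity) {
    p.used.fetch_sub(sz); // undo; caller falls back to another allocator
    return false;
  }
  return true;
}

// Release sz bytes; mirrors KMP_DEBUG_ASSERT(used >= desc.size_a) above.
void pool_release(Pool &p, std::uint64_t sz) {
  std::uint64_t before = p.used.fetch_sub(sz);
  assert(before >= sz); // underflow means a mismatched free
  (void)before;         // silence unused-variable warning in release builds
}
```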