Skip to content

Commit 745800f

Browse files
committed
implement interleave mode with customizable size
Signed-off-by: Łukasz Plewa <[email protected]>
1 parent b8d0883 commit 745800f

File tree

6 files changed

+209
-70
lines changed

6 files changed

+209
-70
lines changed

include/umf/providers/provider_os_memory.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ typedef struct umf_os_memory_provider_params_t {
7070

7171
/// Describes how node list is interpreted
7272
umf_numa_mode_t numa_mode;
73+
/// part size for interleave mode - 0 means default (system specific)
74+
/// It might be rounded up because of HW constraints
75+
size_t part_size;
7376
} umf_os_memory_provider_params_t;
7477

7578
/// @brief OS Memory Provider operation results
@@ -94,6 +97,7 @@ umfOsMemoryProviderParamsDefault(void) {
9497
NULL, /* numa_list */
9598
0, /* numa_list_len */
9699
UMF_NUMA_MODE_DEFAULT, /* numa_mode */
100+
0 /* part_size */
97101
};
98102

99103
return params;

src/provider/provider_os_memory.c

Lines changed: 150 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include "base_alloc_global.h"
1818
#include "provider_os_memory_internal.h"
19+
#include "utils_concurrency.h"
1920
#include "utils_log.h"
2021

2122
#include <umf.h>
@@ -28,11 +29,14 @@ typedef struct os_memory_provider_t {
2829
unsigned protection; // combination of OS-specific protection flags
2930

3031
// NUMA config
31-
hwloc_bitmap_t nodeset;
32+
hwloc_bitmap_t *nodeset;
33+
unsigned nodeset_len;
3234
char *nodeset_str_buf;
3335
hwloc_membind_policy_t numa_policy;
3436
int numa_flags; // combination of hwloc flags
3537

38+
size_t part_size;
39+
size_t alloc_sum; // sum of all allocations - used for manual interleaving
3640
hwloc_topology_t topo;
3741
} os_memory_provider_t;
3842

@@ -81,30 +85,67 @@ static void os_store_last_native_error(int32_t native_error, int errno_value) {
8185
TLS_last_native_error.errno_value = errno_value;
8286
}
8387

84-
static umf_result_t nodemask_to_hwloc_nodeset(const unsigned *nodelist,
85-
unsigned long listsize,
86-
hwloc_bitmap_t *out_nodeset) {
87-
if (out_nodeset == NULL) {
88-
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
89-
}
88+
/*
 * Builds the provider's array of hwloc nodesets from a list of NUMA node ids.
 *
 * os_provider       - provider whose nodeset / nodeset_len fields are filled;
 *                     its topo field is read when nodelist is empty
 * nodelist          - NUMA node ids (read only when listsize > 0)
 * listsize          - number of entries in nodelist
 * is_separate_nodes - non-zero: one single-node bitmap per entry of nodelist;
 *                     zero: one bitmap containing every listed node
 *
 * Returns UMF_RESULT_SUCCESS, or UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY when any
 * allocation or hwloc_bitmap_set() call fails. On failure everything
 * allocated here is released, nodeset is reset to NULL and nodeset_len to 0.
 */
static umf_result_t initialize_nodeset(os_memory_provider_t *os_provider,
                                       const unsigned *nodelist,
                                       unsigned long listsize,
                                       int is_separate_nodes) {
    unsigned long array_size = (listsize && is_separate_nodes) ? listsize : 1;
    // number of leading entries of the array that hold a live bitmap
    unsigned long allocated = 0;

    os_provider->nodeset =
        umf_ba_global_alloc(sizeof(*os_provider->nodeset) * array_size);
    if (!os_provider->nodeset) {
        return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
    }

    hwloc_bitmap_t *out_nodeset = os_provider->nodeset;
    os_provider->nodeset_len = array_size;

    if (listsize == 0) {
        // Hwloc_set_area_membind fails if empty nodeset is passed so
        // if no node is specified, just pass all available nodes.
        // For modes where no node is needed, they will be ignored anyway.
        out_nodeset[0] = hwloc_bitmap_dup(
            hwloc_topology_get_complete_nodeset(os_provider->topo));
        if (!out_nodeset[0]) {
            goto err_cleanup;
        }
        return UMF_RESULT_SUCCESS;
    }

    for (; allocated < array_size; allocated++) {
        out_nodeset[allocated] = hwloc_bitmap_alloc();
        if (!out_nodeset[allocated]) {
            goto err_cleanup;
        }
    }

    for (unsigned long i = 0; i < listsize; i++) {
        // separate mode: node i gets its own bitmap; otherwise all nodes
        // are accumulated in the single bitmap at index 0
        hwloc_bitmap_t target = is_separate_nodes ? out_nodeset[i] : out_nodeset[0];
        if (hwloc_bitmap_set(target, nodelist[i])) {
            goto err_cleanup;
        }
    }

    return UMF_RESULT_SUCCESS;

err_cleanup:
    for (unsigned long i = 0; i < allocated; i++) {
        hwloc_bitmap_free(out_nodeset[i]);
    }
    // Release the array itself (not one of the bitmaps stored in it) and
    // drop the now-dangling pointer from the provider.
    umf_ba_global_free(out_nodeset);
    os_provider->nodeset = NULL;
    os_provider->nodeset_len = 0;
    return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
109150

110151
umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
@@ -132,51 +173,73 @@ umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
132173
return UMF_RESULT_SUCCESS;
133174
}
134175

135-
static umf_result_t translate_numa_mode(umf_numa_mode_t mode, int nodemaskEmpty,
136-
hwloc_membind_policy_t *numa_policy) {
176+
/*
 * Checks that the requested NUMA mode is known and is compatible with the
 * presence or absence of a node list.
 *
 * mode          - NUMA mode requested by the caller
 * nodemaskEmpty - non-zero when no NUMA nodes were supplied
 *
 * Returns UMF_RESULT_SUCCESS when the combination is accepted and
 * UMF_RESULT_ERROR_INVALID_ARGUMENT otherwise (including an unknown mode).
 */
static umf_result_t validate_numa_mode(umf_numa_mode_t mode,
                                       int nodemaskEmpty) {
    switch (mode) {
    case UMF_NUMA_MODE_DEFAULT:
    case UMF_NUMA_MODE_LOCAL:
        // nodeset must be empty
        return nodemaskEmpty ? UMF_RESULT_SUCCESS
                             : UMF_RESULT_ERROR_INVALID_ARGUMENT;
    case UMF_NUMA_MODE_BIND:
    case UMF_NUMA_MODE_INTERLEAVE:
        // nodeset must not be empty
        return nodemaskEmpty ? UMF_RESULT_ERROR_INVALID_ARGUMENT
                             : UMF_RESULT_SUCCESS;
    case UMF_NUMA_MODE_PREFERRED:
        // accepted both with and without a node list
        return UMF_RESULT_SUCCESS;
    default:
        // The mode comes from caller-supplied parameters, so an unknown
        // value is reported as an error rather than asserted on.
        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
    }
}
200+
201+
/*
 * Maps a UMF NUMA mode to the hwloc memory-binding policy.
 *
 * mode                - UMF NUMA mode
 * dedicated_node_bind - non-zero when interleaving is implemented manually
 *                       by binding parts to specific nodes
 *
 * Returns the matching hwloc policy; a mode not handled by the switch trips
 * the assertion and yields -1.
 */
static hwloc_membind_policy_t translate_numa_mode(umf_numa_mode_t mode,
                                                  int dedicated_node_bind) {
    switch (mode) {
    case UMF_NUMA_MODE_DEFAULT:
        return HWLOC_MEMBIND_DEFAULT;
    case UMF_NUMA_MODE_INTERLEAVE:
        // In manual mode, we manually implement interleaving,
        // by binding memory to specific NUMA nodes.
        return dedicated_node_bind ? HWLOC_MEMBIND_BIND
                                   : HWLOC_MEMBIND_INTERLEAVE;
    case UMF_NUMA_MODE_BIND:
    case UMF_NUMA_MODE_PREFERRED:
    case UMF_NUMA_MODE_LOCAL:
        return HWLOC_MEMBIND_BIND;
    }
    assert(0);
    return -1;
}
172223

173-
static int getHwlocMembindFlags(umf_numa_mode_t mode) {
224+
//return 1 if umf will bind memory directly to single NUMA node, based on internal algorithm
225+
//return 0 if umf will just set numa memory policy, and kernel will decide where to allocate memory
226+
static int dedicated_node_bind(umf_os_memory_provider_params_t *in_params) {
227+
if (in_params->numa_mode == UMF_NUMA_MODE_INTERLEAVE) {
228+
return in_params->part_size > 0;
229+
}
230+
return 0;
231+
}
232+
233+
/*
 * Computes the hwloc membind flags for the given NUMA mode.
 *
 * mode                - UMF NUMA mode
 * dedicated_node_bind - non-zero when UMF binds parts to specific nodes itself
 *
 * Returns HWLOC_MEMBIND_BYNODESET, with HWLOC_MEMBIND_STRICT added for BIND
 * mode or when dedicated_node_bind is set.
 */
static int getHwlocMembindFlags(umf_numa_mode_t mode, int dedicated_node_bind) {
    /* UMF always operates on NUMA nodes */
    int membind_flags = HWLOC_MEMBIND_BYNODESET;

    /* HWLOC uses MPOL_PREFERRED[_MANY] unless HWLOC_MEMBIND_STRICT is specified */
    if (mode == UMF_NUMA_MODE_BIND || dedicated_node_bind) {
        membind_flags |= HWLOC_MEMBIND_STRICT;
    }

    return membind_flags;
}
182245

@@ -193,19 +256,22 @@ static umf_result_t translate_params(umf_os_memory_provider_params_t *in_params,
193256

194257
// NUMA config
195258
int emptyNodeset = in_params->numa_list_len == 0;
196-
result = translate_numa_mode(in_params->numa_mode, emptyNodeset,
197-
&provider->numa_policy);
259+
result = validate_numa_mode(in_params->numa_mode, emptyNodeset);
198260
if (result != UMF_RESULT_SUCCESS) {
199261
LOG_ERR("incorrect NUMA mode (%u) or wrong params",
200262
in_params->numa_mode);
201263
return result;
202264
}
203265
LOG_INFO("established HWLOC NUMA policy: %u", provider->numa_policy);
204266

205-
provider->numa_flags = getHwlocMembindFlags(in_params->numa_mode);
206-
207-
return nodemask_to_hwloc_nodeset(
208-
in_params->numa_list, in_params->numa_list_len, &provider->nodeset);
267+
int is_dedicated_node_bind = dedicated_node_bind(in_params);
268+
provider->numa_policy =
269+
translate_numa_mode(in_params->numa_mode, is_dedicated_node_bind);
270+
provider->numa_flags =
271+
getHwlocMembindFlags(in_params->numa_mode, is_dedicated_node_bind);
272+
provider->part_size = in_params->part_size;
273+
return initialize_nodeset(provider, in_params->numa_list,
274+
in_params->numa_list_len, is_dedicated_node_bind);
209275
}
210276

211277
static umf_result_t os_initialize(void *params, void **provider) {
@@ -251,13 +317,13 @@ static umf_result_t os_initialize(void *params, void **provider) {
251317
if (!os_provider->nodeset_str_buf) {
252318
LOG_INFO("allocating memory for printing NUMA nodes failed");
253319
} else {
254-
if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
255-
NODESET_STR_BUF_LEN,
256-
os_provider->nodeset)) {
257-
LOG_INFO("OS provider initialized with NUMA nodes: %s",
258-
os_provider->nodeset_str_buf);
259-
} else if (hwloc_bitmap_iszero(os_provider->nodeset)) {
260-
LOG_INFO("OS provider initialized with empty NUMA nodeset");
320+
LOG_INFO("OS provider initialized with NUMA nodes:");
321+
for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
322+
if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
323+
NODESET_STR_BUF_LEN,
324+
os_provider->nodeset[i])) {
325+
LOG_INFO("%s", os_provider->nodeset_str_buf);
326+
}
261327
}
262328
}
263329

@@ -283,7 +349,10 @@ static void os_finalize(void *provider) {
283349
umf_ba_global_free(os_provider->nodeset_str_buf);
284350
}
285351

286-
hwloc_bitmap_free(os_provider->nodeset);
352+
for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
353+
hwloc_bitmap_free(os_provider->nodeset[i]);
354+
}
355+
umf_ba_global_free(os_provider->nodeset);
287356
hwloc_topology_destroy(os_provider->topo);
288357
umf_ba_global_free(os_provider);
289358
}
@@ -390,6 +459,17 @@ static int os_mmap_aligned(void *hint_addr, size_t length, size_t alignment,
390459
return 0;
391460
}
392461

462+
/*
 * Picks the index of the nodeset at which binding of a new allocation starts.
 *
 * provider - OS memory provider; its alloc_sum counter is advanced by size
 *            whenever more than one nodeset is configured
 * size     - size added to the running allocation total
 *
 * Returns 0 when exactly one nodeset exists; otherwise the number of whole
 * parts (of part_size bytes) in the total recorded before this call, taken
 * modulo the number of nodesets.
 */
static int get_membind(os_memory_provider_t *provider, size_t size) {
    if (provider->nodeset_len != 1) {
        assert(provider->part_size != 0);

        // fetch-and-add yields the total as it was before this allocation
        size_t previous_total =
            util_fetch_and_add64(&provider->alloc_sum, size);
        size_t part_index = previous_total / provider->part_size;

        return part_index % provider->nodeset_len;
    }

    return 0;
}
472+
393473
static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
394474
void **resultPtr) {
395475
int ret;
@@ -437,32 +517,34 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
437517
}
438518

439519
errno = 0;
440-
if (hwloc_bitmap_iszero(os_provider->nodeset)) {
441-
// Hwloc_set_area_membind fails if empty nodeset is passed so if no node is specified,
442-
// just pass all available nodes. For modes where no node is needed, they will be
443-
// ignored anyway.
444-
hwloc_const_nodeset_t complete_nodeset =
445-
hwloc_topology_get_complete_nodeset(os_provider->topo);
446-
ret = hwloc_set_area_membind(os_provider->topo, addr, size,
447-
complete_nodeset, os_provider->numa_policy,
448-
os_provider->numa_flags);
449-
} else {
520+
unsigned membind = get_membind(os_provider, ALIGN_UP(size, page_size));
521+
size_t bind_size = os_provider->nodeset_len == 1
522+
? size
523+
: ALIGN_UP(os_provider->part_size, page_size);
524+
char *ptr_iter = addr;
525+
526+
do {
527+
size_t s = bind_size < size ? bind_size : size;
450528
ret = hwloc_set_area_membind(
451-
os_provider->topo, addr, size, os_provider->nodeset,
529+
os_provider->topo, ptr_iter, s, os_provider->nodeset[membind++],
452530
os_provider->numa_policy, os_provider->numa_flags);
453-
}
454531

455-
if (ret) {
456-
os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
457-
LOG_PERR("binding memory to NUMA node failed");
458-
// TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows - ignore this temporarily
459-
if (errno != ENOSYS &&
460-
errno != 0) { // ENOSYS - Function not implemented
461-
// Do not error out if memory binding is not implemented at all (like in case of WSL on Windows).
462-
goto err_unmap;
532+
size -= s;
533+
ptr_iter += s;
534+
membind %= os_provider->nodeset_len;
535+
if (ret) {
536+
os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
537+
LOG_PERR("binding memory to NUMA node failed");
538+
// TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows,
539+
// ignore this temporarily
540+
if (errno != ENOSYS &&
541+
errno != 0) { // ENOSYS - Function not implemented
542+
// Do not error out if memory binding is not implemented at all
543+
// (like in case of WSL on Windows).
544+
goto err_unmap;
545+
}
463546
}
464-
}
465-
547+
} while (size > 0);
466548
*resultPtr = addr;
467549

468550
return UMF_RESULT_SUCCESS;

src/utils/utils_concurrency.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ static __inline unsigned char util_mssb_index(long long value) {
7070
InterlockedExchange64((LONG64 volatile *)object, (LONG64)desired)
7171
#define util_atomic_increment(object) \
7272
InterlockedIncrement64((LONG64 volatile *)object)
73+
// Atomically adds value to the 64-bit integer at ptr via
// InterlockedExchangeAdd64 and evaluates to that call's result.
// The casts mirror the other Interlocked* wrappers in this block.
#define util_fetch_and_add64(ptr, value)                                       \
    InterlockedExchangeAdd64((LONG64 volatile *)(ptr), (LONG64)(value))
7375
#else
7476
#define util_lssb_index(x) ((unsigned char)__builtin_ctzll(x))
7577
#define util_mssb_index(x) ((unsigned char)(63 - __builtin_clzll(x)))
@@ -87,6 +89,7 @@ static __inline unsigned char util_mssb_index(long long value) {
8789

8890
#define util_atomic_increment(object) \
8991
__atomic_add_fetch(object, 1, __ATOMIC_ACQ_REL)
92+
// Atomically adds value to *ptr and evaluates to the value *ptr held before
// the addition. Sequentially consistent ordering is used.
#define util_fetch_and_add64(ptr, value)                                       \
    __atomic_fetch_add((ptr), (value), __ATOMIC_SEQ_CST)
9093
#endif
9194

9295
#ifdef __cplusplus

test/provider_os_memory.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ using providerCreateExtParams = std::tuple<umf_memory_provider_ops_t *, void *>;
4545

4646
umf::provider_unique_handle_t
4747
providerCreateExt(providerCreateExtParams params) {
48-
umf_memory_provider_handle_t hProvider;
48+
umf_memory_provider_handle_t hProvider = nullptr;
4949
auto [provider_ops, provider_params] = params;
5050

5151
auto ret =

0 commit comments

Comments
 (0)