Skip to content

Commit 7639963

Browse files
committed
implement interleave mode with customizable size
Signed-off-by: Łukasz Plewa <[email protected]>
1 parent 3298399 commit 7639963

File tree

6 files changed

+209
-69
lines changed

6 files changed

+209
-69
lines changed

include/umf/providers/provider_os_memory.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ typedef struct umf_os_memory_provider_params_t {
7878

7979
/// Describes how node list is interpreted
8080
umf_numa_mode_t numa_mode;
81+
/// Part size for interleave mode; 0 means the default (system-specific) size.
82+
/// The value might be rounded up because of hardware constraints.
83+
size_t part_size;
8184
} umf_os_memory_provider_params_t;
8285

8386
/// @brief OS Memory Provider operation results
@@ -103,6 +106,7 @@ umfOsMemoryProviderParamsDefault(void) {
103106
NULL, /* numa_list */
104107
0, /* numa_list_len */
105108
UMF_NUMA_MODE_DEFAULT, /* numa_mode */
109+
0 /* part_size */
106110
};
107111

108112
return params;

src/provider/provider_os_memory.c

Lines changed: 150 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "base_alloc_global.h"
1818
#include "critnib.h"
1919
#include "provider_os_memory_internal.h"
20+
#include "utils_concurrency.h"
2021
#include "utils_log.h"
2122

2223
#include <umf.h>
@@ -39,11 +40,14 @@ typedef struct os_memory_provider_t {
3940
critnib *fd_offset_map;
4041

4142
// NUMA config
42-
hwloc_bitmap_t nodeset;
43+
hwloc_bitmap_t *nodeset;
44+
unsigned nodeset_len;
4345
char *nodeset_str_buf;
4446
hwloc_membind_policy_t numa_policy;
4547
int numa_flags; // combination of hwloc flags
4648

49+
size_t part_size;
50+
size_t alloc_sum; // sum of all allocations - used for manual interleaving
4751
hwloc_topology_t topo;
4852
} os_memory_provider_t;
4953

@@ -92,30 +96,67 @@ static void os_store_last_native_error(int32_t native_error, int errno_value) {
9296
TLS_last_native_error.errno_value = errno_value;
9397
}
9498

95-
static umf_result_t nodemask_to_hwloc_nodeset(const unsigned *nodelist,
96-
unsigned long listsize,
97-
hwloc_bitmap_t *out_nodeset) {
98-
if (out_nodeset == NULL) {
99-
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
100-
}
99+
static umf_result_t initialize_nodeset(os_memory_provider_t *os_provider,
100+
const unsigned *nodelist,
101+
unsigned long listsize,
102+
int is_separate_nodes) {
101103

102-
*out_nodeset = hwloc_bitmap_alloc();
103-
if (!*out_nodeset) {
104+
unsigned long array_size = (listsize && is_separate_nodes) ? listsize : 1;
105+
os_provider->nodeset =
106+
umf_ba_global_alloc(sizeof(*os_provider->nodeset) * array_size);
107+
108+
if (!os_provider->nodeset) {
104109
return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
105110
}
106111

112+
hwloc_bitmap_t *out_nodeset = os_provider->nodeset;
113+
os_provider->nodeset_len = array_size;
107114
if (listsize == 0) {
115+
// hwloc_set_area_membind() fails if an empty nodeset is passed, so
116+
// if no node is specified, just pass all available nodes.
117+
// For modes that do not need a nodeset, it will be ignored anyway.
118+
out_nodeset[0] = hwloc_bitmap_dup(
119+
hwloc_topology_get_complete_nodeset(os_provider->topo));
120+
if (!out_nodeset[0]) {
121+
goto err_free_list;
122+
}
108123
return UMF_RESULT_SUCCESS;
109124
}
110125

111-
for (unsigned long i = 0; i < listsize; i++) {
112-
if (hwloc_bitmap_set(*out_nodeset, nodelist[i])) {
113-
hwloc_bitmap_free(*out_nodeset);
114-
return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
126+
for (unsigned long i = 0; i < array_size; i++) {
127+
out_nodeset[i] = hwloc_bitmap_alloc();
128+
if (!out_nodeset[i]) {
129+
for (unsigned long j = 0; j < i; j++) {
130+
hwloc_bitmap_free(out_nodeset[j]);
131+
}
132+
goto err_free_list;
133+
}
134+
}
135+
136+
if (is_separate_nodes) {
137+
for (unsigned long i = 0; i < listsize; i++) {
138+
if (hwloc_bitmap_set(out_nodeset[i], nodelist[i])) {
139+
goto err_free_bitmaps;
140+
}
141+
}
142+
} else {
143+
for (unsigned long i = 0; i < listsize; i++) {
144+
if (hwloc_bitmap_set(out_nodeset[0], nodelist[i])) {
145+
goto err_free_bitmaps;
146+
}
115147
}
116148
}
117149

118150
return UMF_RESULT_SUCCESS;
151+
152+
err_free_bitmaps:
153+
for (unsigned long i = 0; i < array_size; i++) {
154+
hwloc_bitmap_free(out_nodeset[i]);
155+
}
156+
err_free_list:
157+
umf_ba_global_free(*out_nodeset);
158+
os_provider->nodeset_len = 0;
159+
return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
119160
}
120161

121162
umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
@@ -143,51 +184,73 @@ umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
143184
return UMF_RESULT_SUCCESS;
144185
}
145186

146-
static umf_result_t translate_numa_mode(umf_numa_mode_t mode, int nodemaskEmpty,
147-
hwloc_membind_policy_t *numa_policy) {
187+
static umf_result_t validate_numa_mode(umf_numa_mode_t mode,
188+
int nodemaskEmpty) {
148189
switch (mode) {
149190
case UMF_NUMA_MODE_DEFAULT:
191+
case UMF_NUMA_MODE_LOCAL:
150192
if (!nodemaskEmpty) {
151193
// nodeset must be empty
152194
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
153195
}
154-
*numa_policy = HWLOC_MEMBIND_DEFAULT;
155196
return UMF_RESULT_SUCCESS;
156197
case UMF_NUMA_MODE_BIND:
157-
if (nodemaskEmpty) {
158-
// nodeset must not be empty
159-
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
160-
}
161-
*numa_policy = HWLOC_MEMBIND_BIND;
162-
return UMF_RESULT_SUCCESS;
163198
case UMF_NUMA_MODE_INTERLEAVE:
164199
if (nodemaskEmpty) {
165200
// nodeset must not be empty
166201
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
167202
}
168-
*numa_policy = HWLOC_MEMBIND_INTERLEAVE;
169203
return UMF_RESULT_SUCCESS;
170204
case UMF_NUMA_MODE_PREFERRED:
171-
*numa_policy = HWLOC_MEMBIND_BIND;
172205
return UMF_RESULT_SUCCESS;
173-
case UMF_NUMA_MODE_LOCAL:
174-
if (!nodemaskEmpty) {
175-
// nodeset must be empty
176-
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
206+
default:
207+
assert(0);
208+
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
209+
}
210+
}
211+
212+
static hwloc_membind_policy_t translate_numa_mode(umf_numa_mode_t mode,
213+
int dedicated_node_bind) {
214+
switch (mode) {
215+
case UMF_NUMA_MODE_DEFAULT:
216+
return HWLOC_MEMBIND_DEFAULT;
217+
case UMF_NUMA_MODE_BIND:
218+
return HWLOC_MEMBIND_BIND;
219+
case UMF_NUMA_MODE_INTERLEAVE:
220+
// In manual mode, we manually implement interleaving,
221+
// by binding memory to specific NUMA nodes.
222+
if (dedicated_node_bind) {
223+
return HWLOC_MEMBIND_BIND;
177224
}
178-
*numa_policy = HWLOC_MEMBIND_BIND;
179-
return UMF_RESULT_SUCCESS;
225+
return HWLOC_MEMBIND_INTERLEAVE;
226+
case UMF_NUMA_MODE_PREFERRED:
227+
return HWLOC_MEMBIND_BIND;
228+
case UMF_NUMA_MODE_LOCAL:
229+
return HWLOC_MEMBIND_BIND;
180230
}
181-
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
231+
assert(0);
232+
return -1;
182233
}
183234

184-
static int getHwlocMembindFlags(umf_numa_mode_t mode) {
235+
// Returns 1 if UMF will bind memory directly to a single NUMA node, based on its internal algorithm.
236+
// Returns 0 if UMF will only set the NUMA memory policy and let the kernel decide where to allocate memory.
237+
static int dedicated_node_bind(umf_os_memory_provider_params_t *in_params) {
238+
if (in_params->numa_mode == UMF_NUMA_MODE_INTERLEAVE) {
239+
return in_params->part_size > 0;
240+
}
241+
return 0;
242+
}
243+
244+
static int getHwlocMembindFlags(umf_numa_mode_t mode, int dedicated_node_bind) {
185245
/* UMF always operates on NUMA nodes */
186246
int flags = HWLOC_MEMBIND_BYNODESET;
187247
if (mode == UMF_NUMA_MODE_BIND) {
188248
/* HWLOC uses MPOL_PREFERRED[_MANY] unless HWLOC_MEMBIND_STRICT is specified */
189249
flags |= HWLOC_MEMBIND_STRICT;
190250
}
251+
if (dedicated_node_bind) {
252+
flags |= HWLOC_MEMBIND_STRICT;
253+
}
191254
return flags;
192255
}
193256

@@ -232,19 +295,22 @@ static umf_result_t translate_params(umf_os_memory_provider_params_t *in_params,
232295

233296
// NUMA config
234297
int emptyNodeset = in_params->numa_list_len == 0;
235-
result = translate_numa_mode(in_params->numa_mode, emptyNodeset,
236-
&provider->numa_policy);
298+
result = validate_numa_mode(in_params->numa_mode, emptyNodeset);
237299
if (result != UMF_RESULT_SUCCESS) {
238300
LOG_ERR("incorrect NUMA mode (%u) or wrong params",
239301
in_params->numa_mode);
240302
return result;
241303
}
242304
LOG_INFO("established HWLOC NUMA policy: %u", provider->numa_policy);
243305

244-
provider->numa_flags = getHwlocMembindFlags(in_params->numa_mode);
245-
246-
return nodemask_to_hwloc_nodeset(
247-
in_params->numa_list, in_params->numa_list_len, &provider->nodeset);
306+
int is_dedicated_node_bind = dedicated_node_bind(in_params);
307+
provider->numa_policy =
308+
translate_numa_mode(in_params->numa_mode, is_dedicated_node_bind);
309+
provider->numa_flags =
310+
getHwlocMembindFlags(in_params->numa_mode, is_dedicated_node_bind);
311+
provider->part_size = in_params->part_size;
312+
return initialize_nodeset(provider, in_params->numa_list,
313+
in_params->numa_list_len, is_dedicated_node_bind);
248314
}
249315

250316
static umf_result_t os_initialize(void *params, void **provider) {
@@ -298,13 +364,13 @@ static umf_result_t os_initialize(void *params, void **provider) {
298364
if (!os_provider->nodeset_str_buf) {
299365
LOG_INFO("allocating memory for printing NUMA nodes failed");
300366
} else {
301-
if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
302-
NODESET_STR_BUF_LEN,
303-
os_provider->nodeset)) {
304-
LOG_INFO("OS provider initialized with NUMA nodes: %s",
305-
os_provider->nodeset_str_buf);
306-
} else if (hwloc_bitmap_iszero(os_provider->nodeset)) {
307-
LOG_INFO("OS provider initialized with empty NUMA nodeset");
367+
LOG_INFO("OS provider initialized with NUMA nodes:");
368+
for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
369+
if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
370+
NODESET_STR_BUF_LEN,
371+
os_provider->nodeset[i])) {
372+
LOG_INFO("%s", os_provider->nodeset_str_buf);
373+
}
308374
}
309375
}
310376

@@ -342,7 +408,10 @@ static void os_finalize(void *provider) {
342408
umf_ba_global_free(os_provider->nodeset_str_buf);
343409
}
344410

345-
hwloc_bitmap_free(os_provider->nodeset);
411+
for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
412+
hwloc_bitmap_free(os_provider->nodeset[i]);
413+
}
414+
umf_ba_global_free(os_provider->nodeset);
346415
hwloc_topology_destroy(os_provider->topo);
347416
umf_ba_global_free(os_provider);
348417
}
@@ -464,6 +533,17 @@ static int os_mmap_aligned(void *hint_addr, size_t length, size_t alignment,
464533
return 0;
465534
}
466535

536+
static int get_membind(os_memory_provider_t *provider, size_t size) {
537+
if (provider->nodeset_len == 1) {
538+
return 0;
539+
}
540+
541+
assert(provider->part_size != 0);
542+
size_t s = util_fetch_and_add64(&provider->alloc_sum, size);
543+
544+
return (s / provider->part_size) % provider->nodeset_len;
545+
}
546+
467547
static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
468548
void **resultPtr) {
469549
int ret;
@@ -512,31 +592,34 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
512592
}
513593

514594
errno = 0;
515-
if (hwloc_bitmap_iszero(os_provider->nodeset)) {
516-
// Hwloc_set_area_membind fails if empty nodeset is passed so if no node is specified,
517-
// just pass all available nodes. For modes where no node is needed, they will be
518-
// ignored anyway.
519-
hwloc_const_nodeset_t complete_nodeset =
520-
hwloc_topology_get_complete_nodeset(os_provider->topo);
521-
ret = hwloc_set_area_membind(os_provider->topo, addr, size,
522-
complete_nodeset, os_provider->numa_policy,
523-
os_provider->numa_flags);
524-
} else {
595+
unsigned membind = get_membind(os_provider, ALIGN_UP(size, page_size));
596+
size_t bind_size = os_provider->nodeset_len == 1
597+
? size
598+
: ALIGN_UP(os_provider->part_size, page_size);
599+
char *ptr_iter = addr;
600+
601+
do {
602+
size_t s = bind_size < size ? bind_size : size;
525603
ret = hwloc_set_area_membind(
526-
os_provider->topo, addr, size, os_provider->nodeset,
604+
os_provider->topo, ptr_iter, s, os_provider->nodeset[membind++],
527605
os_provider->numa_policy, os_provider->numa_flags);
528-
}
529606

530-
if (ret) {
531-
os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
532-
LOG_PERR("binding memory to NUMA node failed");
533-
// TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows - ignore this temporarily
534-
if (errno != ENOSYS &&
535-
errno != 0) { // ENOSYS - Function not implemented
536-
// Do not error out if memory binding is not implemented at all (like in case of WSL on Windows).
537-
goto err_unmap;
607+
size -= s;
608+
ptr_iter += s;
609+
membind %= os_provider->nodeset_len;
610+
if (ret) {
611+
os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
612+
LOG_PERR("binding memory to NUMA node failed");
613+
// TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows,
614+
// ignore this temporarily
615+
if (errno != ENOSYS &&
616+
errno != 0) { // ENOSYS - Function not implemented
617+
// Do not error out if memory binding is not implemented at all
618+
// (like in case of WSL on Windows).
619+
goto err_unmap;
620+
}
538621
}
539-
}
622+
} while (size > 0);
540623

541624
if (os_provider->fd > 0) {
542625
// store (fd_offset + 1) to be able to store fd_offset == 0

src/utils/utils_concurrency.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ static __inline unsigned char util_mssb_index(long long value) {
7070
InterlockedExchange64((LONG64 volatile *)object, (LONG64)desired)
7171
#define util_atomic_increment(object) \
7272
InterlockedIncrement64((LONG64 volatile *)object)
73+
#define util_fetch_and_add64(ptr, value) \
74+
InterlockedExchangeAdd64((LONG64 *)(ptr), value)
7375
#else
7476
#define util_lssb_index(x) ((unsigned char)__builtin_ctzll(x))
7577
#define util_mssb_index(x) ((unsigned char)(63 - __builtin_clzll(x)))
@@ -87,6 +89,7 @@ static __inline unsigned char util_mssb_index(long long value) {
8789

8890
#define util_atomic_increment(object) \
8991
__atomic_add_fetch(object, 1, __ATOMIC_ACQ_REL)
92+
#define util_fetch_and_add64 __sync_fetch_and_add
9093
#endif
9194

9295
#ifdef __cplusplus

test/provider_os_memory.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ using providerCreateExtParams = std::tuple<umf_memory_provider_ops_t *, void *>;
4545

4646
umf::provider_unique_handle_t
4747
providerCreateExt(providerCreateExtParams params) {
48-
umf_memory_provider_handle_t hProvider;
48+
umf_memory_provider_handle_t hProvider = nullptr;
4949
auto [provider_ops, provider_params] = params;
5050

5151
auto ret =

0 commit comments

Comments
 (0)