#include "base_alloc_global.h"
#include "critnib.h"
#include "provider_os_memory_internal.h"
+#include "utils_concurrency.h"
#include "utils_log.h"

#include <umf.h>
@@ -39,11 +40,14 @@ typedef struct os_memory_provider_t {
    critnib *fd_offset_map;

    // NUMA config
-    hwloc_bitmap_t nodeset;
+    hwloc_bitmap_t *nodeset;
+    unsigned nodeset_len;
    char *nodeset_str_buf;
    hwloc_membind_policy_t numa_policy;
    int numa_flags; // combination of hwloc flags

+    size_t part_size;
+    size_t alloc_sum; // sum of all allocations - used for manual interleaving
    hwloc_topology_t topo;
} os_memory_provider_t;
@@ -92,30 +96,67 @@ static void os_store_last_native_error(int32_t native_error, int errno_value) {
    TLS_last_native_error.errno_value = errno_value;
}

-static umf_result_t nodemask_to_hwloc_nodeset(const unsigned *nodelist,
-                                              unsigned long listsize,
-                                              hwloc_bitmap_t *out_nodeset) {
-    if (out_nodeset == NULL) {
-        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
-    }
+static umf_result_t initialize_nodeset(os_memory_provider_t *os_provider,
+                                       const unsigned *nodelist,
+                                       unsigned long listsize,
+                                       int is_separate_nodes) {

-    *out_nodeset = hwloc_bitmap_alloc();
-    if (!*out_nodeset) {
+    unsigned long array_size = (listsize && is_separate_nodes) ? listsize : 1;
+    os_provider->nodeset =
+        umf_ba_global_alloc(sizeof(*os_provider->nodeset) * array_size);
+
+    if (!os_provider->nodeset) {
        return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
    }

+    hwloc_bitmap_t *out_nodeset = os_provider->nodeset;
+    os_provider->nodeset_len = array_size;
    if (listsize == 0) {
+        // hwloc_set_area_membind() fails if an empty nodeset is passed, so
+        // if no node is specified, just pass all available nodes.
+        // For modes where no node is needed, they will be ignored anyway.
+        out_nodeset[0] = hwloc_bitmap_dup(
+            hwloc_topology_get_complete_nodeset(os_provider->topo));
+        if (!out_nodeset[0]) {
+            goto err_free_list;
+        }
        return UMF_RESULT_SUCCESS;
    }

-    for (unsigned long i = 0; i < listsize; i++) {
-        if (hwloc_bitmap_set(*out_nodeset, nodelist[i])) {
-            hwloc_bitmap_free(*out_nodeset);
-            return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    for (unsigned long i = 0; i < array_size; i++) {
+        out_nodeset[i] = hwloc_bitmap_alloc();
+        if (!out_nodeset[i]) {
+            for (unsigned long j = 0; j < i; j++) {
+                hwloc_bitmap_free(out_nodeset[j]);
+            }
+            goto err_free_list;
+        }
+    }
+
+    if (is_separate_nodes) {
+        for (unsigned long i = 0; i < listsize; i++) {
+            if (hwloc_bitmap_set(out_nodeset[i], nodelist[i])) {
+                goto err_free_bitmaps;
+            }
+        }
+    } else {
+        for (unsigned long i = 0; i < listsize; i++) {
+            if (hwloc_bitmap_set(out_nodeset[0], nodelist[i])) {
+                goto err_free_bitmaps;
+            }
        }
    }

    return UMF_RESULT_SUCCESS;
+
+err_free_bitmaps:
+    for (unsigned long i = 0; i < array_size; i++) {
+        hwloc_bitmap_free(out_nodeset[i]);
+    }
+err_free_list:
+    // Free the bitmap array itself (allocated with umf_ba_global_alloc()),
+    // not *out_nodeset, which is a hwloc bitmap.
+    umf_ba_global_free(os_provider->nodeset);
+    os_provider->nodeset_len = 0;
+    return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}

umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
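
The reworked initialize_nodeset() above produces one of two layouts: with is_separate_nodes set, a node list {3, 5, 7} becomes three single-node bitmaps; otherwise it becomes one combined bitmap {3,5,7}. A minimal standalone sketch of that layout logic, using plain hwloc and malloc() in place of umf_ba_global_alloc() (build_nodesets() is a hypothetical demo helper, not UMF code):

#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

// Demo-only helper mirroring initialize_nodeset()'s two layouts
// (listsize must be > 0 here; the empty-list case is omitted).
static hwloc_bitmap_t *build_nodesets(const unsigned *nodelist,
                                      unsigned long listsize,
                                      int is_separate_nodes,
                                      unsigned long *out_len) {
    unsigned long array_size = is_separate_nodes ? listsize : 1;
    hwloc_bitmap_t *set = malloc(sizeof(*set) * array_size);
    if (!set) {
        return NULL;
    }
    for (unsigned long i = 0; i < array_size; i++) {
        set[i] = hwloc_bitmap_alloc(); // NULL checks omitted for brevity
    }
    for (unsigned long i = 0; i < listsize; i++) {
        // separate: one node per bitmap; combined: all nodes in bitmap 0
        hwloc_bitmap_set(set[is_separate_nodes ? i : 0], nodelist[i]);
    }
    *out_len = array_size;
    return set;
}

int main(void) {
    unsigned nodes[] = {3, 5, 7};
    unsigned long len;
    hwloc_bitmap_t *sets = build_nodesets(nodes, 3, 1, &len);
    char buf[64];
    for (unsigned long i = 0; i < len; i++) {
        hwloc_bitmap_list_snprintf(buf, sizeof(buf), sets[i]);
        printf("nodeset[%lu] = {%s}\n", i, buf); // prints {3}, {5}, {7}
        hwloc_bitmap_free(sets[i]);
    }
    free(sets);
    return 0;
}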
@@ -143,51 +184,73 @@ umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
    return UMF_RESULT_SUCCESS;
}

-static umf_result_t translate_numa_mode(umf_numa_mode_t mode, int nodemaskEmpty,
-                                        hwloc_membind_policy_t *numa_policy) {
+static umf_result_t validate_numa_mode(umf_numa_mode_t mode,
+                                       int nodemaskEmpty) {
    switch (mode) {
    case UMF_NUMA_MODE_DEFAULT:
+    case UMF_NUMA_MODE_LOCAL:
        if (!nodemaskEmpty) {
            // nodeset must be empty
            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
        }
-        *numa_policy = HWLOC_MEMBIND_DEFAULT;
        return UMF_RESULT_SUCCESS;
    case UMF_NUMA_MODE_BIND:
-        if (nodemaskEmpty) {
-            // nodeset must not be empty
-            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
-        }
-        *numa_policy = HWLOC_MEMBIND_BIND;
-        return UMF_RESULT_SUCCESS;
    case UMF_NUMA_MODE_INTERLEAVE:
        if (nodemaskEmpty) {
            // nodeset must not be empty
            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
        }
-        *numa_policy = HWLOC_MEMBIND_INTERLEAVE;
        return UMF_RESULT_SUCCESS;
    case UMF_NUMA_MODE_PREFERRED:
-        *numa_policy = HWLOC_MEMBIND_BIND;
        return UMF_RESULT_SUCCESS;
-    case UMF_NUMA_MODE_LOCAL:
-        if (!nodemaskEmpty) {
-            // nodeset must be empty
-            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    default:
+        assert(0);
+        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+}
+
+static hwloc_membind_policy_t translate_numa_mode(umf_numa_mode_t mode,
+                                                  int dedicated_node_bind) {
+    switch (mode) {
+    case UMF_NUMA_MODE_DEFAULT:
+        return HWLOC_MEMBIND_DEFAULT;
+    case UMF_NUMA_MODE_BIND:
+        return HWLOC_MEMBIND_BIND;
+    case UMF_NUMA_MODE_INTERLEAVE:
+        // In manual interleaving mode, we implement interleaving ourselves
+        // by binding memory to specific NUMA nodes.
+        if (dedicated_node_bind) {
+            return HWLOC_MEMBIND_BIND;
        }
-        *numa_policy = HWLOC_MEMBIND_BIND;
-        return UMF_RESULT_SUCCESS;
+        return HWLOC_MEMBIND_INTERLEAVE;
+    case UMF_NUMA_MODE_PREFERRED:
+        return HWLOC_MEMBIND_BIND;
+    case UMF_NUMA_MODE_LOCAL:
+        return HWLOC_MEMBIND_BIND;
    }
-    return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    assert(0);
+    return -1;
}

-static int getHwlocMembindFlags(umf_numa_mode_t mode) {
+// Returns 1 if UMF will bind memory directly to a single NUMA node, based on
+// its internal algorithm.
+// Returns 0 if UMF will just set the NUMA memory policy and let the kernel
+// decide where to allocate memory.
+static int dedicated_node_bind(umf_os_memory_provider_params_t *in_params) {
+    if (in_params->numa_mode == UMF_NUMA_MODE_INTERLEAVE) {
+        return in_params->part_size > 0;
+    }
+    return 0;
+}
+
+static int getHwlocMembindFlags(umf_numa_mode_t mode, int dedicated_node_bind) {
    /* UMF always operates on NUMA nodes */
    int flags = HWLOC_MEMBIND_BYNODESET;
    if (mode == UMF_NUMA_MODE_BIND) {
        /* HWLOC uses MPOL_PREFERRED[_MANY] unless HWLOC_MEMBIND_STRICT is specified */
        flags |= HWLOC_MEMBIND_STRICT;
    }
+    if (dedicated_node_bind) {
+        flags |= HWLOC_MEMBIND_STRICT;
+    }
    return flags;
}
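
The net effect of the three helpers above for UMF_NUMA_MODE_INTERLEAVE: part_size == 0 keeps hwloc's kernel-side page interleaving, while part_size > 0 switches to strict per-node binding driven by UMF itself. A hedged sketch of the resulting settings, assuming it sits in the same translation unit as the static helpers above (the zero-initialized params are illustrative only):

#include <assert.h>

// Illustrative check of the two interleave variants; other params
// fields are left zeroed for brevity.
static void check_interleave_settings(void) {
    umf_os_memory_provider_params_t p = {0};
    p.numa_mode = UMF_NUMA_MODE_INTERLEAVE;

    p.part_size = 0; // kernel interleaving, page by page
    assert(dedicated_node_bind(&p) == 0);
    assert(translate_numa_mode(p.numa_mode, 0) == HWLOC_MEMBIND_INTERLEAVE);

    p.part_size = 2 * 1024 * 1024; // manual interleaving in 2 MiB parts
    assert(dedicated_node_bind(&p) == 1);
    assert(translate_numa_mode(p.numa_mode, 1) == HWLOC_MEMBIND_BIND);
    assert(getHwlocMembindFlags(p.numa_mode, 1) & HWLOC_MEMBIND_STRICT);
}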
@@ -232,19 +295,22 @@ static umf_result_t translate_params(umf_os_memory_provider_params_t *in_params,
    // NUMA config
    int emptyNodeset = in_params->numa_list_len == 0;
-    result = translate_numa_mode(in_params->numa_mode, emptyNodeset,
-                                 &provider->numa_policy);
+    result = validate_numa_mode(in_params->numa_mode, emptyNodeset);
    if (result != UMF_RESULT_SUCCESS) {
        LOG_ERR("incorrect NUMA mode (%u) or wrong params",
                in_params->numa_mode);
        return result;
    }
    LOG_INFO("established HWLOC NUMA policy: %u", provider->numa_policy);

-    provider->numa_flags = getHwlocMembindFlags(in_params->numa_mode);
-
-    return nodemask_to_hwloc_nodeset(
-        in_params->numa_list, in_params->numa_list_len, &provider->nodeset);
+    int is_dedicated_node_bind = dedicated_node_bind(in_params);
+    provider->numa_policy =
+        translate_numa_mode(in_params->numa_mode, is_dedicated_node_bind);
+    provider->numa_flags =
+        getHwlocMembindFlags(in_params->numa_mode, is_dedicated_node_bind);
+    provider->part_size = in_params->part_size;
+    return initialize_nodeset(provider, in_params->numa_list,
+                              in_params->numa_list_len, is_dedicated_node_bind);
}

static umf_result_t os_initialize(void *params, void **provider) {
@@ -298,13 +364,13 @@ static umf_result_t os_initialize(void *params, void **provider) {
    if (!os_provider->nodeset_str_buf) {
        LOG_INFO("allocating memory for printing NUMA nodes failed");
    } else {
-        if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
-                                       NODESET_STR_BUF_LEN,
-                                       os_provider->nodeset)) {
-            LOG_INFO("OS provider initialized with NUMA nodes: %s",
-                     os_provider->nodeset_str_buf);
-        } else if (hwloc_bitmap_iszero(os_provider->nodeset)) {
-            LOG_INFO("OS provider initialized with empty NUMA nodeset");
+        LOG_INFO("OS provider initialized with NUMA nodes:");
+        for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
+            if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
+                                           NODESET_STR_BUF_LEN,
+                                           os_provider->nodeset[i])) {
+                LOG_INFO("%s", os_provider->nodeset_str_buf);
+            }
        }
    }
@@ -342,7 +408,10 @@ static void os_finalize(void *provider) {
        umf_ba_global_free(os_provider->nodeset_str_buf);
    }

-    hwloc_bitmap_free(os_provider->nodeset);
+    for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
+        hwloc_bitmap_free(os_provider->nodeset[i]);
+    }
+    umf_ba_global_free(os_provider->nodeset);
    hwloc_topology_destroy(os_provider->topo);
    umf_ba_global_free(os_provider);
}
@@ -464,6 +533,17 @@ static int os_mmap_aligned(void *hint_addr, size_t length, size_t alignment,
    return 0;
}

+static int get_membind(os_memory_provider_t *provider, size_t size) {
+    if (provider->nodeset_len == 1) {
+        return 0;
+    }
+
+    assert(provider->part_size != 0);
+    size_t s = util_fetch_and_add64(&provider->alloc_sum, size);
+
+    return (s / provider->part_size) % provider->nodeset_len;
+}
+
static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
                             void **resultPtr) {
    int ret;
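
get_membind() turns the running byte total into a starting partition index: the pre-add value of alloc_sum, divided by part_size, modulo nodeset_len, selects where binding starts, so consecutive part_size-sized chunks rotate across the per-node bitmaps. A single-threaded stand-in showing the arithmetic (a plain counter replaces the util_fetch_and_add64() atomic used in the diff):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

static size_t alloc_sum = 0; // running total of allocation sizes

static int get_membind_demo(size_t part_size, unsigned nodeset_len,
                            size_t size) {
    assert(part_size != 0);
    size_t s = alloc_sum; // value before this allocation
    alloc_sum += size;
    return (int)((s / part_size) % nodeset_len);
}

int main(void) {
    size_t sizes[] = {4096, 4096, 8192, 4096};
    for (int i = 0; i < 4; i++) {
        // With part_size 4096 and 3 node sets this prints 0, 1, 2, 1.
        printf("start partition: %d\n", get_membind_demo(4096, 3, sizes[i]));
    }
    return 0;
}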
@@ -512,31 +592,34 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
    }

    errno = 0;
-    if (hwloc_bitmap_iszero(os_provider->nodeset)) {
-        // Hwloc_set_area_membind fails if empty nodeset is passed so if no node is specified,
-        // just pass all available nodes. For modes where no node is needed, they will be
-        // ignored anyway.
-        hwloc_const_nodeset_t complete_nodeset =
-            hwloc_topology_get_complete_nodeset(os_provider->topo);
-        ret = hwloc_set_area_membind(os_provider->topo, addr, size,
-                                     complete_nodeset, os_provider->numa_policy,
-                                     os_provider->numa_flags);
-    } else {
+    unsigned membind = get_membind(os_provider, ALIGN_UP(size, page_size));
+    size_t bind_size = os_provider->nodeset_len == 1
+                           ? size
+                           : ALIGN_UP(os_provider->part_size, page_size);
+    char *ptr_iter = addr;
+
+    do {
+        size_t s = bind_size < size ? bind_size : size;
        ret = hwloc_set_area_membind(
-            os_provider->topo, addr, size, os_provider->nodeset,
+            os_provider->topo, ptr_iter, s, os_provider->nodeset[membind++],
            os_provider->numa_policy, os_provider->numa_flags);
-    }

-    if (ret) {
-        os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
-        LOG_PERR("binding memory to NUMA node failed");
-        // TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows - ignore this temporarily
-        if (errno != ENOSYS &&
-            errno != 0) { // ENOSYS - Function not implemented
-            // Do not error out if memory binding is not implemented at all (like in case of WSL on Windows).
-            goto err_unmap;
+        size -= s;
+        ptr_iter += s;
+        membind %= os_provider->nodeset_len;
+        if (ret) {
+            os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
+            LOG_PERR("binding memory to NUMA node failed");
+            // TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows,
+            // ignore this temporarily
+            if (errno != ENOSYS &&
+                errno != 0) { // ENOSYS - Function not implemented
+                // Do not error out if memory binding is not implemented at all
+                // (like in case of WSL on Windows).
+                goto err_unmap;
+            }
        }
-    }
+    } while (size > 0);

    if (os_provider->fd > 0) {
        // store (fd_offset + 1) to be able to store fd_offset == 0
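
With the loop above, os_alloc() binds the mapping in part_size-sized chunks, chunk k going to nodeset[(start + k) % nodeset_len]; for example, two nodes, a 2 MiB part_size, and a 5 MiB first allocation yield 2 MiB on node set 0, 2 MiB on node set 1, and 1 MiB back on node set 0. An end-to-end usage sketch; umfOsMemoryProviderParamsDefault() and the field names are assumptions based on UMF's public headers at the time of this change, and error handling is trimmed:

#include <umf/memory_provider.h>
#include <umf/providers/provider_os_memory.h>

// Sketch: create an OS provider that manually interleaves 2 MiB parts
// across NUMA nodes 0 and 1, then allocate from it. The params-default
// helper and field names are assumptions, not verified API.
int demo_manual_interleave(void) {
    static unsigned nodes[] = {0, 1};
    umf_os_memory_provider_params_t params = umfOsMemoryProviderParamsDefault();
    params.numa_list = nodes;
    params.numa_list_len = 2;
    params.numa_mode = UMF_NUMA_MODE_INTERLEAVE;
    params.part_size = 2 * 1024 * 1024; // > 0 selects the manual path

    umf_memory_provider_handle_t provider;
    if (umfMemoryProviderCreate(umfOsMemoryProviderOps(), &params, &provider) !=
        UMF_RESULT_SUCCESS) {
        return -1;
    }

    void *ptr;
    // 5 MiB: bound as 2 MiB -> nodeset[0], 2 MiB -> nodeset[1],
    // 1 MiB -> nodeset[0] (assuming this is the first allocation,
    // so alloc_sum starts at 0)
    if (umfMemoryProviderAlloc(provider, 5 * 1024 * 1024, 0, &ptr) !=
        UMF_RESULT_SUCCESS) {
        umfMemoryProviderDestroy(provider);
        return -1;
    }

    umfMemoryProviderFree(provider, ptr, 5 * 1024 * 1024);
    umfMemoryProviderDestroy(provider);
    return 0;
}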