@@ -16,6 +16,7 @@
 
 #include "base_alloc_global.h"
 #include "provider_os_memory_internal.h"
+#include "utils_concurrency.h"
 #include "utils_log.h"
 
 #include <umf.h>
@@ -28,11 +29,14 @@ typedef struct os_memory_provider_t {
     unsigned protection; // combination of OS-specific protection flags
 
     // NUMA config
-    hwloc_bitmap_t nodeset;
+    hwloc_bitmap_t *nodeset;
+    unsigned nodeset_len;
     char *nodeset_str_buf;
     hwloc_membind_policy_t numa_policy;
     int numa_flags; // combination of hwloc flags
 
+    size_t part_size;
+    size_t alloc_sum; // sum of all allocations - used for manual interleaving
    hwloc_topology_t topo;
 } os_memory_provider_t;
 
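For orientation (my own annotation, not part of the patch): with manual interleaving enabled, the provider is expected to hold one bitmap per requested NUMA node, while `part_size` and `alloc_sum` drive the round-robin arithmetic shown later in `get_membind()`. A hedged sketch of that invariant, assuming the field semantics established by `initialize_nodeset()` below:

```c
#include <assert.h>

// Hypothetical invariant check (illustration only): when interleaving
// manually, there is one singleton bitmap per requested node; otherwise
// a single bitmap holds the whole node list.
static void check_nodeset_invariant(const os_memory_provider_t *p,
                                    unsigned numa_list_len,
                                    int manual_interleave) {
    if (manual_interleave && numa_list_len) {
        assert(p->nodeset_len == numa_list_len);
    } else {
        assert(p->nodeset_len == 1);
    }
}
```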
@@ -81,30 +85,67 @@ static void os_store_last_native_error(int32_t native_error, int errno_value) {
     TLS_last_native_error.errno_value = errno_value;
 }
 
-static umf_result_t nodemask_to_hwloc_nodeset(const unsigned *nodelist,
-                                              unsigned long listsize,
-                                              hwloc_bitmap_t *out_nodeset) {
-    if (out_nodeset == NULL) {
-        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
-    }
+static umf_result_t initialize_nodeset(os_memory_provider_t *os_provider,
+                                       const unsigned *nodelist,
+                                       unsigned long listsize,
+                                       int is_separate_nodes) {
 
-    *out_nodeset = hwloc_bitmap_alloc();
-    if (!*out_nodeset) {
+    unsigned long array_size = (listsize && is_separate_nodes) ? listsize : 1;
+    os_provider->nodeset =
+        umf_ba_global_alloc(sizeof(*os_provider->nodeset) * array_size);
+
+    if (!os_provider->nodeset) {
         return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
     }
 
+    hwloc_bitmap_t *out_nodeset = os_provider->nodeset;
+    os_provider->nodeset_len = array_size;
     if (listsize == 0) {
+        // hwloc_set_area_membind() fails if an empty nodeset is passed, so
+        // if no node is specified, just pass all available nodes.
+        // For modes where no node is needed, they will be ignored anyway.
+        out_nodeset[0] = hwloc_bitmap_dup(
+            hwloc_topology_get_complete_nodeset(os_provider->topo));
+        if (!out_nodeset[0]) {
+            goto err_free_list;
+        }
         return UMF_RESULT_SUCCESS;
     }
 
-    for (unsigned long i = 0; i < listsize; i++) {
-        if (hwloc_bitmap_set(*out_nodeset, nodelist[i])) {
-            hwloc_bitmap_free(*out_nodeset);
-            return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    for (unsigned long i = 0; i < array_size; i++) {
+        out_nodeset[i] = hwloc_bitmap_alloc();
+        if (!out_nodeset[i]) {
+            for (unsigned long j = 0; j < i; j++) {
+                hwloc_bitmap_free(out_nodeset[j]);
+            }
+            goto err_free_list;
+        }
+    }
+
+    if (is_separate_nodes) {
+        for (unsigned long i = 0; i < listsize; i++) {
+            if (hwloc_bitmap_set(out_nodeset[i], nodelist[i])) {
+                goto err_free_bitmaps;
+            }
+        }
+    } else {
+        for (unsigned long i = 0; i < listsize; i++) {
+            if (hwloc_bitmap_set(out_nodeset[0], nodelist[i])) {
+                goto err_free_bitmaps;
+            }
         }
     }
 
     return UMF_RESULT_SUCCESS;
+
+err_free_bitmaps:
+    for (unsigned long i = 0; i < array_size; i++) {
+        hwloc_bitmap_free(out_nodeset[i]);
+    }
+err_free_list:
+    umf_ba_global_free(os_provider->nodeset);
+    os_provider->nodeset_len = 0;
+    return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
 }
 
 umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
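To make the two nodeset layouts concrete, here is a standalone sketch (my own illustration, not part of the patch) using only stock hwloc bitmap calls: `is_separate_nodes = 1` yields one singleton bitmap per node (manual interleaving), otherwise all nodes land in a single bitmap.

```c
#include <hwloc.h>
#include <stdio.h>

// Demonstrates the two layouts produced by initialize_nodeset().
// Error checks are omitted for brevity.
static void demo_layouts(void) {
    const unsigned nodes[] = {0, 1, 2};
    char buf[64];

    // Separate layout: bitmap[i] contains only nodes[i].
    hwloc_bitmap_t separate[3];
    for (int i = 0; i < 3; i++) {
        separate[i] = hwloc_bitmap_alloc();
        hwloc_bitmap_set(separate[i], nodes[i]);
        hwloc_bitmap_list_snprintf(buf, sizeof(buf), separate[i]);
        printf("separate[%d] = {%s}\n", i, buf); // {0}, {1}, {2}
        hwloc_bitmap_free(separate[i]);
    }

    // Combined layout: one bitmap holding the whole node list.
    hwloc_bitmap_t combined = hwloc_bitmap_alloc();
    for (int i = 0; i < 3; i++) {
        hwloc_bitmap_set(combined, nodes[i]);
    }
    hwloc_bitmap_list_snprintf(buf, sizeof(buf), combined);
    printf("combined = {%s}\n", buf); // {0-2}
    hwloc_bitmap_free(combined);
}
```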
@@ -132,51 +173,73 @@ umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
     return UMF_RESULT_SUCCESS;
 }
 
-static umf_result_t translate_numa_mode(umf_numa_mode_t mode, int nodemaskEmpty,
-                                        hwloc_membind_policy_t *numa_policy) {
+static umf_result_t validate_numa_mode(umf_numa_mode_t mode,
+                                       int nodemaskEmpty) {
     switch (mode) {
     case UMF_NUMA_MODE_DEFAULT:
+    case UMF_NUMA_MODE_LOCAL:
         if (!nodemaskEmpty) {
             // nodeset must be empty
             return UMF_RESULT_ERROR_INVALID_ARGUMENT;
         }
-        *numa_policy = HWLOC_MEMBIND_DEFAULT;
         return UMF_RESULT_SUCCESS;
     case UMF_NUMA_MODE_BIND:
-        if (nodemaskEmpty) {
-            // nodeset must not be empty
-            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
-        }
-        *numa_policy = HWLOC_MEMBIND_BIND;
-        return UMF_RESULT_SUCCESS;
     case UMF_NUMA_MODE_INTERLEAVE:
         if (nodemaskEmpty) {
             // nodeset must not be empty
             return UMF_RESULT_ERROR_INVALID_ARGUMENT;
         }
-        *numa_policy = HWLOC_MEMBIND_INTERLEAVE;
         return UMF_RESULT_SUCCESS;
     case UMF_NUMA_MODE_PREFERRED:
-        *numa_policy = HWLOC_MEMBIND_BIND;
         return UMF_RESULT_SUCCESS;
-    case UMF_NUMA_MODE_LOCAL:
-        if (!nodemaskEmpty) {
-            // nodeset must be empty
-            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    default:
+        assert(0);
+        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+}
+
+static hwloc_membind_policy_t translate_numa_mode(umf_numa_mode_t mode,
+                                                  int dedicated_node_bind) {
+    switch (mode) {
+    case UMF_NUMA_MODE_DEFAULT:
+        return HWLOC_MEMBIND_DEFAULT;
+    case UMF_NUMA_MODE_BIND:
+        return HWLOC_MEMBIND_BIND;
+    case UMF_NUMA_MODE_INTERLEAVE:
+        // In manual mode, interleaving is implemented by UMF itself,
+        // by binding each part to a specific NUMA node.
+        if (dedicated_node_bind) {
+            return HWLOC_MEMBIND_BIND;
         }
-        *numa_policy = HWLOC_MEMBIND_BIND;
-        return UMF_RESULT_SUCCESS;
+        return HWLOC_MEMBIND_INTERLEAVE;
+    case UMF_NUMA_MODE_PREFERRED:
+        return HWLOC_MEMBIND_BIND;
+    case UMF_NUMA_MODE_LOCAL:
+        return HWLOC_MEMBIND_BIND;
     }
-    return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    assert(0);
+    return -1;
 }
 
-static int getHwlocMembindFlags(umf_numa_mode_t mode) {
+// Return 1 if UMF itself binds memory directly to single NUMA nodes; return 0
+// if UMF only sets the NUMA memory policy and the kernel decides placement.
+static int dedicated_node_bind(umf_os_memory_provider_params_t *in_params) {
+    if (in_params->numa_mode == UMF_NUMA_MODE_INTERLEAVE) {
+        return in_params->part_size > 0;
+    }
+    return 0;
+}
+
+static int getHwlocMembindFlags(umf_numa_mode_t mode, int dedicated_node_bind) {
     /* UMF always operates on NUMA nodes */
     int flags = HWLOC_MEMBIND_BYNODESET;
     if (mode == UMF_NUMA_MODE_BIND) {
         /* HWLOC uses MPOL_PREFERRED[_MANY] unless HWLOC_MEMBIND_STRICT is specified */
         flags |= HWLOC_MEMBIND_STRICT;
     }
+    if (dedicated_node_bind) {
+        flags |= HWLOC_MEMBIND_STRICT;
+    }
     return flags;
 }
 
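A minimal self-contained sketch of the resulting decision (my own illustration, using only stock hwloc constants; it mirrors, but is not, the static helpers above): interleaving with `part_size > 0` becomes a strict per-part bind, while `part_size == 0` leaves interleaving to the kernel.

```c
#include <assert.h>
#include <hwloc.h>

int main(void) {
    size_t part_size = 2 * 1024 * 1024; // hypothetical 2 MiB part
    int manual = part_size > 0;         // dedicated_node_bind() outcome

    hwloc_membind_policy_t policy =
        manual ? HWLOC_MEMBIND_BIND : HWLOC_MEMBIND_INTERLEAVE;
    int flags = HWLOC_MEMBIND_BYNODESET | (manual ? HWLOC_MEMBIND_STRICT : 0);

    assert(policy == HWLOC_MEMBIND_BIND);
    assert(flags & HWLOC_MEMBIND_STRICT);
    return 0;
}
```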
@@ -193,19 +256,22 @@ static umf_result_t translate_params(umf_os_memory_provider_params_t *in_params,
 
     // NUMA config
     int emptyNodeset = in_params->numa_list_len == 0;
-    result = translate_numa_mode(in_params->numa_mode, emptyNodeset,
-                                 &provider->numa_policy);
+    result = validate_numa_mode(in_params->numa_mode, emptyNodeset);
     if (result != UMF_RESULT_SUCCESS) {
         LOG_ERR("incorrect NUMA mode (%u) or wrong params",
                 in_params->numa_mode);
         return result;
     }
-    LOG_INFO("established HWLOC NUMA policy: %u", provider->numa_policy);
 
-    provider->numa_flags = getHwlocMembindFlags(in_params->numa_mode);
-
-    return nodemask_to_hwloc_nodeset(
-        in_params->numa_list, in_params->numa_list_len, &provider->nodeset);
+    int is_dedicated_node_bind = dedicated_node_bind(in_params);
+    provider->numa_policy =
+        translate_numa_mode(in_params->numa_mode, is_dedicated_node_bind);
+    LOG_INFO("established HWLOC NUMA policy: %u", provider->numa_policy);
+    provider->numa_flags =
+        getHwlocMembindFlags(in_params->numa_mode, is_dedicated_node_bind);
+    provider->part_size = in_params->part_size;
+    return initialize_nodeset(provider, in_params->numa_list,
+                              in_params->numa_list_len, is_dedicated_node_bind);
 }
 
 static umf_result_t os_initialize(void *params, void **provider) {
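A hedged usage sketch of how a caller would request manual interleaving through these params. The setup helpers (`umfOsMemoryProviderParamsDefault`, `umfOsMemoryProviderOps`, `umfMemoryProviderCreate`) are assumptions based on the project's public headers at this point; treat the exact names and header paths as illustrative.

```c
#include <umf/memory_provider.h>
#include <umf/providers/provider_os_memory.h>

// Hypothetical setup: interleave across nodes 0 and 1 in 4 MiB parts.
static umf_result_t
make_interleaved_provider(umf_memory_provider_handle_t *out) {
    static unsigned nodes[] = {0, 1};
    umf_os_memory_provider_params_t params = umfOsMemoryProviderParamsDefault();
    params.numa_mode = UMF_NUMA_MODE_INTERLEAVE;
    params.numa_list = nodes;
    params.numa_list_len = 2;
    params.part_size = 4 * 1024 * 1024; // > 0 selects manual interleaving
    return umfMemoryProviderCreate(umfOsMemoryProviderOps(), &params, out);
}
```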
@@ -251,13 +317,13 @@ static umf_result_t os_initialize(void *params, void **provider) {
     if (!os_provider->nodeset_str_buf) {
         LOG_INFO("allocating memory for printing NUMA nodes failed");
     } else {
-        if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
-                                       NODESET_STR_BUF_LEN,
-                                       os_provider->nodeset)) {
-            LOG_INFO("OS provider initialized with NUMA nodes: %s",
-                     os_provider->nodeset_str_buf);
-        } else if (hwloc_bitmap_iszero(os_provider->nodeset)) {
-            LOG_INFO("OS provider initialized with empty NUMA nodeset");
+        LOG_INFO("OS provider initialized with NUMA nodes:");
+        for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
+            if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
+                                           NODESET_STR_BUF_LEN,
+                                           os_provider->nodeset[i])) {
+                LOG_INFO("%s", os_provider->nodeset_str_buf);
+            }
         }
     }
 
@@ -283,7 +349,10 @@ static void os_finalize(void *provider) {
         umf_ba_global_free(os_provider->nodeset_str_buf);
     }
 
-    hwloc_bitmap_free(os_provider->nodeset);
+    for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
+        hwloc_bitmap_free(os_provider->nodeset[i]);
+    }
+    umf_ba_global_free(os_provider->nodeset);
     hwloc_topology_destroy(os_provider->topo);
     umf_ba_global_free(os_provider);
 }
@@ -390,6 +459,17 @@ static int os_mmap_aligned(void *hint_addr, size_t length, size_t alignment,
     return 0;
 }
 
+static int get_membind(os_memory_provider_t *provider, size_t size) {
+    if (provider->nodeset_len == 1) {
+        return 0;
+    }
+
+    assert(provider->part_size != 0);
+    size_t s = util_fetch_and_add64(&provider->alloc_sum, size);
+
+    return (s / provider->part_size) % provider->nodeset_len;
+}
+
 static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
                              void **resultPtr) {
     int ret;
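A worked example of `get_membind()`'s arithmetic (illustration only): with `part_size` = 2 MiB and three nodes, `alloc_sum` advances by each allocation's page-aligned size, so successive 2 MiB allocations start on node indices 0, 1, 2, 0, ...

```c
#include <stdio.h>

// Replays the round-robin selection for 2 MiB allocations over 3 nodes.
int main(void) {
    const size_t part_size = 2u << 20, alloc = 2u << 20;
    const unsigned nodes = 3;
    size_t alloc_sum = 0;
    for (int i = 0; i < 4; i++) {
        unsigned idx = (unsigned)((alloc_sum / part_size) % nodes);
        printf("allocation %d starts on node index %u\n", i, idx); // 0,1,2,0
        alloc_sum += alloc; // what util_fetch_and_add64() does atomically
    }
    return 0;
}
```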
@@ -437,32 +517,34 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
     }
 
     errno = 0;
-    if (hwloc_bitmap_iszero(os_provider->nodeset)) {
-        // Hwloc_set_area_membind fails if empty nodeset is passed so if no node is specified,
-        // just pass all available nodes. For modes where no node is needed, they will be
-        // ignored anyway.
-        hwloc_const_nodeset_t complete_nodeset =
-            hwloc_topology_get_complete_nodeset(os_provider->topo);
-        ret = hwloc_set_area_membind(os_provider->topo, addr, size,
-                                     complete_nodeset, os_provider->numa_policy,
-                                     os_provider->numa_flags);
-    } else {
+    unsigned membind = get_membind(os_provider, ALIGN_UP(size, page_size));
+    size_t bind_size = os_provider->nodeset_len == 1
+                           ? size
+                           : ALIGN_UP(os_provider->part_size, page_size);
+    char *ptr_iter = addr;
+
+    do {
+        size_t s = bind_size < size ? bind_size : size;
         ret = hwloc_set_area_membind(
-            os_provider->topo, addr, size, os_provider->nodeset,
+            os_provider->topo, ptr_iter, s, os_provider->nodeset[membind++],
             os_provider->numa_policy, os_provider->numa_flags);
-    }
 
-    if (ret) {
-        os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
-        LOG_PERR("binding memory to NUMA node failed");
-        // TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows - ignore this temporarily
-        if (errno != ENOSYS &&
-            errno != 0) { // ENOSYS - Function not implemented
-            // Do not error out if memory binding is not implemented at all (like in case of WSL on Windows).
-            goto err_unmap;
+        size -= s;
+        ptr_iter += s;
+        membind %= os_provider->nodeset_len;
+        if (ret) {
+            os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
+            LOG_PERR("binding memory to NUMA node failed");
+            // TODO: (errno == 0) when hwloc_set_area_membind() fails on
+            // Windows - ignore this temporarily
+            if (errno != ENOSYS &&
+                errno != 0) { // ENOSYS - Function not implemented
+                // Do not error out if memory binding is not implemented at all
+                // (like in case of WSL on Windows).
+                goto err_unmap;
+            }
         }
-    }
-
+    } while (size > 0);
     *resultPtr = addr;
 
     return UMF_RESULT_SUCCESS;
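The binding loop carves the mapping into `bind_size` chunks and wraps the node index with `membind %= nodeset_len`. A standalone sketch of that chunking (illustration only, not the provider code): a 5 MiB region with `part_size` = 2 MiB over two nodes is bound as 2 MiB to node 0, 2 MiB to node 1, and the remaining 1 MiB back to node 0.

```c
#include <stdio.h>

int main(void) {
    size_t size = 5u << 20, bind_size = 2u << 20;
    unsigned membind = 0, nodeset_len = 2;
    do {
        // Bind at most bind_size bytes, then advance to the next node.
        size_t s = bind_size < size ? bind_size : size;
        printf("bind %zu bytes to node index %u\n", s, membind++);
        size -= s;
        membind %= nodeset_len;
    } while (size > 0);
    return 0;
}
```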