 #include "base_alloc_global.h"
 #include "provider_os_memory_internal.h"
+#include "utils_concurrency.h"
 #include "utils_log.h"

 #include <umf.h>
@@ -28,11 +29,14 @@ typedef struct os_memory_provider_t {
     unsigned protection; // combination of OS-specific protection flags

     // NUMA config
-    hwloc_bitmap_t nodeset;
+    hwloc_bitmap_t *nodeset;
+    unsigned nodeset_len;
     char *nodeset_str_buf;
     hwloc_membind_policy_t numa_policy;
     int numa_flags; // combination of hwloc flags

+    size_t part_size;
+    size_t alloc_sum; // sum of all allocations - used for manual interleaving
     hwloc_topology_t topo;
 } os_memory_provider_t;

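The struct now carries an array of bitmaps plus its length instead of a single `nodeset` bitmap: with manual interleaving, each element holds exactly one NUMA node, and `alloc_sum` tracks the running total of allocated bytes so successive `part_size`-sized chunks can rotate across nodes. A standalone sketch (hypothetical, not part of the patch) of the one-bitmap-per-node layout, assuming only hwloc:

    #include <hwloc.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        unsigned nodelist[] = {0, 1};
        unsigned len = sizeof(nodelist) / sizeof(nodelist[0]);

        hwloc_bitmap_t *nodeset = malloc(sizeof(*nodeset) * len);
        if (!nodeset) {
            return 1;
        }
        for (unsigned i = 0; i < len; i++) {
            nodeset[i] = hwloc_bitmap_alloc();         // one bitmap per entry
            hwloc_bitmap_set(nodeset[i], nodelist[i]); // each holds one node
        }

        char buf[64];
        for (unsigned i = 0; i < len; i++) {
            hwloc_bitmap_list_snprintf(buf, sizeof(buf), nodeset[i]);
            printf("nodeset[%u] = {%s}\n", i, buf);
            hwloc_bitmap_free(nodeset[i]);
        }
        free(nodeset);
        return 0;
    }

With `part_size == 0` (or a single node) the array degenerates to one bitmap, so the pre-existing single-nodeset behavior is preserved.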
@@ -81,30 +85,67 @@ static void os_store_last_native_error(int32_t native_error, int errno_value) {
     TLS_last_native_error.errno_value = errno_value;
 }

-static umf_result_t nodemask_to_hwloc_nodeset(const unsigned *nodelist,
-                                              unsigned long listsize,
-                                              hwloc_bitmap_t *out_nodeset) {
-    if (out_nodeset == NULL) {
-        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
-    }
+static umf_result_t initialize_nodeset(os_memory_provider_t *os_provider,
+                                       const unsigned *nodelist,
+                                       unsigned long listsize,
+                                       int separate_nodes) {

-    *out_nodeset = hwloc_bitmap_alloc();
-    if (!*out_nodeset) {
+    unsigned long array_size = (listsize && separate_nodes) ? listsize : 1;
+    os_provider->nodeset =
+        umf_ba_global_alloc(sizeof(*os_provider->nodeset) * array_size);
+
+    if (!os_provider->nodeset) {
         return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
     }

+    hwloc_bitmap_t *out_nodeset = os_provider->nodeset;
+    os_provider->nodeset_len = array_size;
     if (listsize == 0) {
+        // hwloc_set_area_membind() fails if an empty nodeset is passed, so
+        // if no node is specified, just pass all available nodes.
+        // For modes where no node is needed, they will be ignored anyway.
+        out_nodeset[0] = hwloc_bitmap_dup(
+            hwloc_topology_get_complete_nodeset(os_provider->topo));
+        if (!out_nodeset[0]) {
+            goto err_free_list;
+        }
         return UMF_RESULT_SUCCESS;
     }

-    for (unsigned long i = 0; i < listsize; i++) {
-        if (hwloc_bitmap_set(*out_nodeset, nodelist[i])) {
-            hwloc_bitmap_free(*out_nodeset);
-            return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    for (unsigned long i = 0; i < array_size; i++) {
+        out_nodeset[i] = hwloc_bitmap_alloc();
+        if (!out_nodeset[i]) {
+            for (unsigned long j = 0; j < i; j++) {
+                hwloc_bitmap_free(out_nodeset[j]);
+            }
+            goto err_free_list;
+        }
+    }
+
+    if (separate_nodes) {
+        for (unsigned long i = 0; i < listsize; i++) {
+            if (hwloc_bitmap_set(out_nodeset[i], nodelist[i])) {
+                goto err_free_bitmaps;
+            }
+        }
+    } else {
+        for (unsigned long i = 0; i < listsize; i++) {
+            if (hwloc_bitmap_set(out_nodeset[0], nodelist[i])) {
+                goto err_free_bitmaps;
+            }
         }
     }

     return UMF_RESULT_SUCCESS;
+
+err_free_bitmaps:
+    for (unsigned long i = 0; i < array_size; i++) {
+        hwloc_bitmap_free(out_nodeset[i]);
+    }
+err_free_list:
+    umf_ba_global_free(os_provider->nodeset); // free the array, not a bitmap
+    os_provider->nodeset_len = 0;
+    return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY;
 }

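Both error paths use the classic C goto-cleanup ladder: a failure jumps to the label that unwinds exactly what was acquired before it, and the labels fall through in reverse acquisition order. A minimal sketch of the same idiom, with hypothetical names:

    #include <stdlib.h>

    int make_pair(char **a_out, char **b_out) {
        char *a = malloc(64);
        if (!a) {
            goto err;
        }
        char *b = malloc(64);
        if (!b) {
            goto err_free_a; // unwind only what was already acquired
        }
        *a_out = a;
        *b_out = b;
        return 0;

    err_free_a:
        free(a);
    err:
        return -1;
    }
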
 umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
@@ -132,42 +173,61 @@ umf_result_t os_translate_flags(unsigned in_flags, unsigned max,
     return UMF_RESULT_SUCCESS;
 }

-static umf_result_t translate_numa_mode(umf_numa_mode_t mode, int nodemaskEmpty,
-                                        hwloc_membind_policy_t *numa_policy) {
+static umf_result_t validate_numa_mode(umf_numa_mode_t mode,
+                                       int nodemaskEmpty) {
     switch (mode) {
     case UMF_NUMA_MODE_DEFAULT:
+    case UMF_NUMA_MODE_LOCAL:
         if (!nodemaskEmpty) {
             // nodeset must be empty
             return UMF_RESULT_ERROR_INVALID_ARGUMENT;
         }
-        *numa_policy = HWLOC_MEMBIND_DEFAULT;
         return UMF_RESULT_SUCCESS;
     case UMF_NUMA_MODE_BIND:
-        if (nodemaskEmpty) {
-            // nodeset must not be empty
-            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
-        }
-        *numa_policy = HWLOC_MEMBIND_BIND;
-        return UMF_RESULT_SUCCESS;
     case UMF_NUMA_MODE_INTERLEAVE:
         if (nodemaskEmpty) {
             // nodeset must not be empty
             return UMF_RESULT_ERROR_INVALID_ARGUMENT;
         }
-        *numa_policy = HWLOC_MEMBIND_INTERLEAVE;
         return UMF_RESULT_SUCCESS;
     case UMF_NUMA_MODE_PREFERRED:
-        *numa_policy = HWLOC_MEMBIND_BIND;
         return UMF_RESULT_SUCCESS;
-    case UMF_NUMA_MODE_LOCAL:
-        if (!nodemaskEmpty) {
-            // nodeset must be empty
-            return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    default:
+        assert(0);
+        return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+}
+
+static hwloc_membind_policy_t translate_numa_mode(umf_numa_mode_t mode,
+                                                  int manual) {
+    switch (mode) {
+    case UMF_NUMA_MODE_DEFAULT:
+        return HWLOC_MEMBIND_DEFAULT;
+    case UMF_NUMA_MODE_BIND:
+        return HWLOC_MEMBIND_BIND;
+    case UMF_NUMA_MODE_INTERLEAVE:
+        // In manual mode, we implement interleaving ourselves
+        // by binding memory to specific NUMA nodes.
+        if (manual) {
+            return HWLOC_MEMBIND_BIND;
         }
-        *numa_policy = HWLOC_MEMBIND_BIND;
-        return UMF_RESULT_SUCCESS;
+        return HWLOC_MEMBIND_INTERLEAVE;
+    case UMF_NUMA_MODE_PREFERRED:
+        return HWLOC_MEMBIND_BIND;
+    case UMF_NUMA_MODE_LOCAL:
+        return HWLOC_MEMBIND_BIND;
     }
-    return UMF_RESULT_ERROR_INVALID_ARGUMENT;
+    assert(0);
+    return -1;
+}
+
+// Return 1 if UMF binds memory directly to a single NUMA node, based on an
+// internal algorithm; return 0 if UMF only sets the NUMA memory policy and
+// the kernel decides where to allocate memory.
+static int is_dedicated_node_bind(umf_os_memory_provider_params_t *in_params) {
+    if (in_params->numa_mode == UMF_NUMA_MODE_INTERLEAVE) {
+        return in_params->part_size > 0;
+    }
+    return 0;
 }

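Whether interleaving is done by the kernel or by UMF itself hinges entirely on `part_size`. A tiny standalone rehearsal of that decision (hypothetical helper, mirroring the logic above):

    #include <stddef.h>
    #include <stdio.h>

    // part_size == 0 keeps kernel interleaving; part_size > 0 switches to
    // per-part BIND calls issued by UMF itself.
    static const char *interleave_policy(size_t part_size) {
        return part_size > 0 ? "HWLOC_MEMBIND_BIND (manual, per-part binding)"
                             : "HWLOC_MEMBIND_INTERLEAVE (kernel interleaving)";
    }

    int main(void) {
        printf("part_size = 0    -> %s\n", interleave_policy(0));
        printf("part_size = 4096 -> %s\n", interleave_policy(4096));
        return 0;
    }
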
 static int getHwlocMembindFlags(umf_numa_mode_t mode) {
@@ -193,17 +253,19 @@ static umf_result_t translate_params(umf_os_memory_provider_params_t *in_params,

     // NUMA config
     int emptyNodeset = in_params->numa_list_len == 0;
-    result = translate_numa_mode(in_params->numa_mode, emptyNodeset,
-                                 &provider->numa_policy);
+    result = validate_numa_mode(in_params->numa_mode, emptyNodeset);
     if (result != UMF_RESULT_SUCCESS) {
         LOG_ERR("incorrect NUMA mode: %u", in_params->numa_mode);
         return result;
     }

+    int dedicated_node_bind = is_dedicated_node_bind(in_params);
+    provider->numa_policy =
+        translate_numa_mode(in_params->numa_mode, dedicated_node_bind);
     provider->numa_flags = getHwlocMembindFlags(in_params->numa_mode);
-
-    return nodemask_to_hwloc_nodeset(
-        in_params->numa_list, in_params->numa_list_len, &provider->nodeset);
+    provider->part_size = in_params->part_size;
+    return initialize_nodeset(provider, in_params->numa_list,
+                              in_params->numa_list_len, dedicated_node_bind);
 }

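For context, this is roughly how a caller would opt into the manual-interleave path. A hedged usage sketch, assuming the public UMF C API of this era (`umfOsMemoryProviderParamsDefault()`, `umfOsMemoryProviderOps()`, `umfMemoryProviderCreate()`); the `numa_list`, `numa_mode`, and `part_size` fields are the ones this hunk reads:

    #include <umf/memory_provider.h>
    #include <umf/providers/provider_os_memory.h>

    static umf_result_t
    make_interleaved_provider(umf_memory_provider_handle_t *out) {
        static unsigned nodes[] = {0, 1};
        umf_os_memory_provider_params_t params =
            umfOsMemoryProviderParamsDefault();
        params.numa_mode = UMF_NUMA_MODE_INTERLEAVE;
        params.numa_list = nodes;
        params.numa_list_len = 2;
        params.part_size = 4096; // > 0 selects the manual, dedicated-bind path
        return umfMemoryProviderCreate(umfOsMemoryProviderOps(), &params, out);
    }
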
 static umf_result_t os_initialize(void *params, void **provider) {
@@ -250,11 +312,13 @@ static umf_result_t os_initialize(void *params, void **provider) {
     if (!os_provider->nodeset_str_buf) {
         LOG_INFO("Allocating memory for printing NUMA nodes failed");
     } else {
-        if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
-                                       NODESET_STR_BUF_LEN,
-                                       os_provider->nodeset)) {
-            LOG_INFO("OS provider initialized with NUMA nodes: %s",
-                     os_provider->nodeset_str_buf);
+        LOG_INFO("OS provider initialized with NUMA nodes:");
+        for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
+            if (hwloc_bitmap_list_snprintf(os_provider->nodeset_str_buf,
+                                           NODESET_STR_BUF_LEN,
+                                           os_provider->nodeset[i])) {
+                LOG_INFO("%s", os_provider->nodeset_str_buf);
+            }
         }
     }

@@ -280,7 +344,10 @@ static void os_finalize(void *provider) {
         umf_ba_global_free(os_provider->nodeset_str_buf);
     }

-    hwloc_bitmap_free(os_provider->nodeset);
+    for (unsigned i = 0; i < os_provider->nodeset_len; i++) {
+        hwloc_bitmap_free(os_provider->nodeset[i]);
+    }
+    umf_ba_global_free(os_provider->nodeset);
     hwloc_topology_destroy(os_provider->topo);
     umf_ba_global_free(os_provider);
 }
@@ -387,6 +454,17 @@ static int os_mmap_aligned(void *hint_addr, size_t length, size_t alignment,
     return 0;
 }

+// Choose which nodeset the next allocation starts binding to: with a single
+// nodeset always 0; with manual interleaving, every part_size bytes of the
+// running allocation total advance one nodeset, round-robin.
+static int get_membind(os_memory_provider_t *provider, size_t size) {
+    if (provider->nodeset_len == 1) {
+        return 0;
+    }
+
+    assert(provider->part_size != 0);
+    size_t s = util_fetch_and_add64(&provider->alloc_sum, size);
+
+    return (s / provider->part_size) % provider->nodeset_len;
+}
+
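The index arithmetic deserves a worked example. A standalone rehearsal (plain C, non-atomic for brevity, whereas the patch uses `util_fetch_and_add64`): with `part_size = 1024` and two nodesets, the first 1024 bytes of cumulative allocation map to `nodeset[0]`, the next 1024 to `nodeset[1]`, and so on:

    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        size_t part_size = 1024, nodeset_len = 2, alloc_sum = 0;
        size_t sizes[] = {512, 512, 2048, 100};

        for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
            size_t s = alloc_sum;  // fetch the old total...
            alloc_sum += sizes[i]; // ...and add (atomic in the patch)
            size_t idx = (s / part_size) % nodeset_len;
            printf("alloc of %4zu bytes starts at nodeset[%zu]\n",
                   sizes[i], idx);
        }
        return 0;
    }

This prints indices 0, 0, 1, 1 for the four allocations: the two 512-byte allocations share the first 1024-byte part on node 0, and the next allocations start on node 1.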
 static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
                              void **resultPtr) {
     int ret;
@@ -434,32 +512,31 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
     }

     errno = 0;
-    if (hwloc_bitmap_iszero(os_provider->nodeset)) {
-        // Hwloc_set_area_membind fails if empty nodeset is passed so if no node is specified,
-        // just pass all available nodes. For modes where no node is needed, they will be
-        // ignored anyway.
-        hwloc_const_nodeset_t complete_nodeset =
-            hwloc_topology_get_complete_nodeset(os_provider->topo);
-        ret = hwloc_set_area_membind(os_provider->topo, addr, size,
-                                     complete_nodeset, os_provider->numa_policy,
-                                     os_provider->numa_flags);
-    } else {
+    unsigned membind = get_membind(os_provider, size);
+    size_t bind_size =
+        os_provider->nodeset_len == 1 ? size : os_provider->part_size;
+    char *bind_addr = (char *)addr; // cursor advanced after each bound chunk
+
+    do {
+        size_t s = bind_size < size ? bind_size : size;
         ret = hwloc_set_area_membind(
-            os_provider->topo, addr, size, os_provider->nodeset,
+            os_provider->topo, bind_addr, s, os_provider->nodeset[membind++],
             os_provider->numa_policy, os_provider->numa_flags);
-    }
-
-    if (ret) {
-        os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
-        LOG_PERR("binding memory to NUMA node failed");
-        // TODO: (errno == 0) when hwloc_set_area_membind() fails on Windows - ignore this temporarily
-        if (errno != ENOSYS &&
-            errno != 0) { // ENOSYS - Function not implemented
-            // Do not error out if memory binding is not implemented at all (like in case of WSL on Windows).
-            goto err_unmap;
+        size -= s;
+        bind_addr += s; // advance so each part lands on its own node
+        membind %= os_provider->nodeset_len;
+        if (ret) {
+            os_store_last_native_error(UMF_OS_RESULT_ERROR_BIND_FAILED, errno);
+            LOG_PERR("binding memory to NUMA node failed");
+            // TODO: (errno == 0) when hwloc_set_area_membind() fails on
+            // Windows, ignore this temporarily
+            if (errno != ENOSYS &&
+                errno != 0) { // ENOSYS - Function not implemented
+                // Do not error out if memory binding is not implemented at all
+                // (like in case of WSL on Windows).
+                goto err_unmap;
+            }
         }
-    }
-
+    } while (size > 0);
     *resultPtr = addr;

     return UMF_RESULT_SUCCESS;
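The per-chunk call in the loop is plain `hwloc_set_area_membind()`. A self-contained sketch (Linux with hwloc 2.x; not UMF code) of binding one mmap'ed range to a single node the same way:

    #include <hwloc.h>
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void) {
        hwloc_topology_t topo;
        hwloc_topology_init(&topo);
        hwloc_topology_load(topo);

        size_t size = 1 << 20;
        void *addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED) {
            hwloc_topology_destroy(topo);
            return 1;
        }

        hwloc_bitmap_t nodeset = hwloc_bitmap_alloc();
        hwloc_bitmap_set(nodeset, 0); // NUMA node 0

        int ret = hwloc_set_area_membind(topo, addr, size, nodeset,
                                         HWLOC_MEMBIND_BIND,
                                         HWLOC_MEMBIND_BYNODESET);
        printf("membind %s\n",
               ret ? "failed (may be unsupported, e.g. WSL)" : "succeeded");

        hwloc_bitmap_free(nodeset);
        munmap(addr, size);
        hwloc_topology_destroy(topo);
        return ret ? 1 : 0;
    }

As in the patch, a failing bind is not necessarily fatal: on platforms without membind support (the WSL case noted above) the call fails with `ENOSYS`, and the allocation itself is still usable.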