
Commit adc1938

David Rientjes authored and Ingo Molnar committed
x86: Interleave emulated nodes over physical nodes
Add interleaved NUMA emulation support.

This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel.

There are a couple of ways to do this:

 - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or

 - allocate each successive emulated node on a different physical node until all memory is exhausted.

The disadvantage of the first option is that, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on one physical node compared to another.

The disadvantage of the second option is that, also depending on the asymmetry in node capacities, more emulated nodes may be allocated on a single physical node than on another. This patch implements the second option: we accept that one physical node may end up with slightly more emulated nodes than another in exchange for avoiding node size asymmetry.

[ Note that the "node capacity" of a physical node is not only a function of its addressable range, but is also reduced by the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ]

We ensure there is at least a minimal amount of available memory allocated to each node. We also ensure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL.

This patch also cleans up the emulation code by no longer passing the statically allocated struct bootnode array among the various functions. This array lives in .init.data rather than on the stack, since it may be very large, and is therefore accessed at file scope.

The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true.

Signed-off-by: David Rientjes <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andreas Herrmann <[email protected]>
Cc: Yinghai Lu <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Ankita Garg <[email protected]>
Cc: Len Brown <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
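To make the chosen strategy concrete, here is a minimal, hypothetical userspace C sketch of the round-robin interleave (the second option above). It is not the patch's code: the node counts and capacities are invented, and it ignores the reserved-memory (e820 hole) accounting, FAKE_NODE_MIN_SIZE rounding, and ZONE_DMA32 handling that the real split_nodes_interleave() in the diff below performs.

#include <stdio.h>

#define NR_PHYS	3	/* assumed physical node count */
#define NR_EMU	8	/* assumed emulated node count */

int main(void)
{
	/* assumed, deliberately asymmetric per-physical-node capacities (MB) */
	unsigned long phys_free[NR_PHYS] = { 4096, 2048, 1024 };
	unsigned long total = 4096 + 2048 + 1024;
	unsigned long size = total / NR_EMU;	/* target size per emulated node */
	int emu = 0, i;

	while (emu < NR_EMU) {
		int progress = 0;

		/*
		 * One pass: give one emulated node to each physical node
		 * that still has memory left.
		 */
		for (i = 0; i < NR_PHYS && emu < NR_EMU; i++) {
			unsigned long chunk;

			if (!phys_free[i])
				continue;
			chunk = phys_free[i] < size ? phys_free[i] : size;
			phys_free[i] -= chunk;
			printf("emulated node %d: %lu MB on physical node %d\n",
			       emu++, chunk, i);
			progress = 1;
		}
		if (!progress)
			break;	/* all physical memory exhausted */
	}
	return 0;
}

Even with asymmetric physical capacities, each pass hands one emulated node to every physical node that still has memory, so emulated node sizes stay close to the uniform target while the number of emulated nodes per physical node may differ slightly; that is exactly the trade-off the changelog describes.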
1 parent 8716273 · commit adc1938

File tree

2 files changed: +184 -28 lines

arch/x86/mm/numa_64.c

Lines changed: 184 additions & 27 deletions
@@ -306,20 +306,81 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+				  int acpi, int k8)
+{
+	int nr_nodes = 0;
+	int ret = 0;
+	int i;
+
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+	if (k8)
+		nr_nodes = k8_get_nodes(physnodes);
+#endif
+	/*
+	 * Basic sanity checking on the physical node map: there may be errors
+	 * if the SRAT or K8 incorrectly reported the topology or the mem=
+	 * kernel parameter is used.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		if (physnodes[i].start > end) {
+			physnodes[i].end = physnodes[i].start;
+			continue;
+		}
+		if (physnodes[i].end < start) {
+			physnodes[i].start = physnodes[i].end;
+			continue;
+		}
+		if (physnodes[i].start < start)
+			physnodes[i].start = start;
+		if (physnodes[i].end > end)
+			physnodes[i].end = end;
+	}
+
+	/*
+	 * Remove all nodes that have no memory or were truncated because of the
+	 * limited address range.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		physnodes[ret].start = physnodes[i].start;
+		physnodes[ret].end = physnodes[i].end;
+		ret++;
+	}
+
+	/*
+	 * If no physical topology was detected, a single node is faked to cover
+	 * the entire address space.
+	 */
+	if (!ret) {
+		physnodes[ret].start = start;
+		physnodes[ret].end = end;
+		ret = 1;
+	}
+	return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size. If the end
  * boundary is greater than max_addr, then max_addr is used instead.
  * The return value is 0 if there is additional memory left for
  * allocation past addr and -1 otherwise. addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-				   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
-
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
@@ -334,13 +395,112 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 	return ret;
 }
 
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr. The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+					 int nr_phys_nodes, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int ret = 0;
+	int i;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node. "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_phys_nodes; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 end = physnodes[i].start + size;
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+			if (ret < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - physnodes[i].start -
+				e820_hole_size(physnodes[i].start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > physnodes[i].end) {
+					end = physnodes[i].end;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Avoid allocating more nodes than requested, which can
+			 * happen as a result of rounding down each node's size
+			 * to FAKE_NODE_MIN_SIZE.
+			 */
+			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+				end = physnodes[i].end;
+
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
+}
+
 /*
  * Splits num_nodes nodes up equally starting at node_start. The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
 				      int num_nodes)
 {
 	unsigned int big;
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 				break;
 			}
 		}
-		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
 	}
 	return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
  * always assigned to a final node and can be asymmetric. Returns the number of
  * nodes split.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+				      u64 size)
 {
 	int i = node_start;
 	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+	while (!setup_node_range(i++, addr, size, max_addr))
 		;
 	return i - node_start;
 }
@@ -413,23 +573,24 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+				 unsigned long last_pfn, int acpi, int k8)
 {
 	u64 size, addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+	int num_phys_nodes;
 
-	memset(&nodes, 0, sizeof(nodes));
+	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
 	 * If the numa=fake command-line is just a single number N, split the
 	 * system RAM into N fake nodes.
 	 */
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
 
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+		num_nodes = split_nodes_interleave(addr, max_addr,
+							num_phys_nodes, n);
 		if (num_nodes < 0)
 			return num_nodes;
 		goto out;
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 			if (size)
 				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, nodes,
-						&addr, size, max_addr) < 0)
+					if (setup_node_range(num_nodes, &addr,
+							size, max_addr) < 0)
 						goto done;
 			if (!*cmdline)
 				break;
@@ -473,7 +634,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	if (addr < max_addr) {
 		if (coeff_flag && coeff < 0) {
 			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+			num_nodes += split_nodes_by_size(&addr, max_addr,
 							 num_nodes, num);
 			goto out;
 		}
@@ -482,16 +643,16 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 			/* Split remaining nodes into coeff chunks */
 			if (coeff <= 0)
 				break;
-			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+			num_nodes += split_nodes_equally(&addr, max_addr,
 							 num_nodes, coeff);
 			break;
 		case ',':
 			/* Do not allocate remaining system RAM */
 			break;
 		default:
 			/* Give one final node */
-			setup_node_range(num_nodes, nodes, &addr,
-					 max_addr - addr, max_addr);
+			setup_node_range(num_nodes, &addr, max_addr - addr,
+					 max_addr);
 			num_nodes++;
 		}
 	}
@@ -505,14 +666,10 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	}
 
 	/*
-	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-	 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 * We need to vacate all active ranges that may have been registered for
+	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa = -1;
-#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, last_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);

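For reference, a hedged usage sketch (not part of the patch itself): the new interleaved path is reached through the existing numa=fake= boot parameter when its argument is a plain number, since the strchr() check in numa_emulation() above sends forms containing '*' or ',' to the older split functions. Assuming a kernel built with CONFIG_NUMA_EMU=y on a multi-node machine, a command-line fragment such as the following would exercise it (the value 8 is an arbitrary example):

numa=fake=8

The resulting fake topology can then be inspected from userspace, e.g. with numactl --hardware.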
arch/x86/mm/srat_64.c

Lines changed: 0 additions & 1 deletion

@@ -468,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
 			node_set(i, nodes_parsed);
-	WARN_ON(!nodes_cover_memory(fake_nodes));
 }
 
 static int null_slit_node_compare(int a, int b)
