Skip to content

Commit 1d90ca8

Browse files
Kemi Wang authored and
torvalds committed
mm: update NUMA counter threshold size
There is significant overhead from cache bouncing caused by zone counters (NUMA associated counters) being updated in parallel during multi-threaded page allocation (suggested by Dave Hansen).

This patch updates the NUMA counter threshold to a fixed size of U16_MAX - 2 (65533), because a small threshold greatly increases the frequency at which the global counter is updated from the local per-cpu counters (suggested by Ying Huang).

The rationale is that these statistics counters don't affect the kernel's decisions, unlike other VM counters, so it's not a problem to use a large threshold.

With this patchset, we see a 31.3% drop in CPU cycles (537-->369) per single page allocation and reclaim on Jesper's page_bench03 benchmark.

Benchmark provided by Jesper Dangaard Brouer (loop count increased to 10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

Threshold   CPU cycles     Throughput(88 threads)
32          799            241760478
64          640            301628829
125         537            358906028          <==> system by default (base)
256         468            412397590
512         428            450550704
4096        399            482520943
20000       394            489009617
30000       395            488017817
65533       369(-31.3%)    521661345(+45.3%)  <==> with this patchset
N/A         342(-36.3%)    562900157(+56.8%)  <==> disable zone_statistics

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Kemi Wang <[email protected]>
Reported-by: Jesper Dangaard Brouer <[email protected]>
Suggested-by: Dave Hansen <[email protected]>
Suggested-by: Ying Huang <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Cc: Aaron Lu <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Christopher Lameter <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Tim Chen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 3a321d2 commit 1d90ca8

File tree

2 files changed

+11
-20
lines changed

2 files changed

+11
-20
lines changed

include/linux/mmzone.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -282,8 +282,7 @@ struct per_cpu_pageset {
282282
struct per_cpu_pages pcp;
283283
#ifdef CONFIG_NUMA
284284
s8 expire;
285-
s8 numa_stat_threshold;
286-
s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
285+
u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
287286
#endif
288287
#ifdef CONFIG_SMP
289288
s8 stat_threshold;

mm/vmstat.c

Lines changed: 10 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,8 @@
3030

3131
#include "internal.h"
3232

33+
#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
34+
3335
#ifdef CONFIG_VM_EVENT_COUNTERS
3436
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
3537
EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void)
194196

195197
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
196198
= threshold;
197-
#ifdef CONFIG_NUMA
198-
per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
199-
= threshold;
200-
#endif
199+
201200
/* Base nodestat threshold on the largest populated zone. */
202201
pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
203202
per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
@@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
231230
continue;
232231

233232
threshold = (*calculate_pressure)(zone);
234-
for_each_online_cpu(cpu) {
233+
for_each_online_cpu(cpu)
235234
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
236235
= threshold;
237-
#ifdef CONFIG_NUMA
238-
per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
239-
= threshold;
240-
#endif
241-
}
242236
}
243237
}
244238

@@ -874,16 +868,14 @@ void __inc_numa_state(struct zone *zone,
874868
enum numa_stat_item item)
875869
{
876870
struct per_cpu_pageset __percpu *pcp = zone->pageset;
877-
s8 __percpu *p = pcp->vm_numa_stat_diff + item;
878-
s8 v, t;
871+
u16 __percpu *p = pcp->vm_numa_stat_diff + item;
872+
u16 v;
879873

880874
v = __this_cpu_inc_return(*p);
881-
t = __this_cpu_read(pcp->numa_stat_threshold);
882-
if (unlikely(v > t)) {
883-
s8 overstep = t >> 1;
884875

885-
zone_numa_state_add(v + overstep, zone, item);
886-
__this_cpu_write(*p, -overstep);
876+
if (unlikely(v > NUMA_STATS_THRESHOLD)) {
877+
zone_numa_state_add(v, zone, item);
878+
__this_cpu_write(*p, 0);
887879
}
888880
}
889881

@@ -1798,7 +1790,7 @@ static bool need_update(int cpu)
17981790

17991791
BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
18001792
#ifdef CONFIG_NUMA
1801-
BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1);
1793+
BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
18021794
#endif
18031795
/*
18041796
* The fast way of checking if there are any vmstat diffs.

0 commit comments

Comments
 (0)