
Commit f1a7941

shakeelb authored and akpm00 committed
mm: convert mm's rss stats into percpu_counter
Currently mm_struct maintains rss_stats which are updated on the page fault and unmapping codepaths. On the page fault codepath the updates are cached per thread, with a batch of TASK_RSS_EVENTS_THRESH, which is 64. The reason for the caching is performance for multithreaded applications: otherwise the rss_stats updates may become a hotspot for such applications.

However, this optimization comes at the cost of an error margin in the rss stats. The rss_stats for applications with a large number of threads can be very skewed. At worst the error margin is (nr_threads * 64), and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32.

Recently we started seeing unbounded errors in rss_stats for specific applications which use TCP rx0cp (receive zerocopy). It seems the vm_insert_pages() codepath does not sync rss_stats at all.

This patch converts the rss_stats into percpu_counter, which reduces the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). In addition, this conversion enables us to get accurate stats in situations where accuracy is more important than the CPU cost. This patch does not make that tradeoff yet: later we can simply use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read()) for the readers. At the moment the readers are the procfs interface, the oom_killer and memory reclaim, which I think are not performance critical and should be fine with a slow read. That change can be made in a separate patch.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Shakeel Butt <[email protected]>
Cc: Marek Szyprowski <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 9cd6ffa commit f1a7941
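A rough way to sanity-check the error bounds quoted in the changelog is the sketch below. It is an illustrative userspace program, not part of the patch; the 64 comes from TASK_RSS_EVENTS_THRESH, and the per-CPU batch is assumed to be the percpu_counter default of max(32, 2 * num_online_cpus()) computed in lib/percpu_counter.c.

#include <stdio.h>

/* old scheme: each thread caches up to TASK_RSS_EVENTS_THRESH (64) events */
static long old_drift(long nr_threads)
{
        return nr_threads * 64;
}

/* new scheme: each CPU may hold up to one unflushed percpu_counter batch */
static long new_drift(long nr_cpus)
{
        long batch = 2 * nr_cpus > 32 ? 2 * nr_cpus : 32;  /* assumed default */

        return nr_cpus * batch;  /* ~2 * nr_cpus^2 on large machines */
}

int main(void)
{
        printf("old, 1000 threads: %ld pages\n", old_drift(1000)); /* 64000 */
        printf("new, 128 CPUs:     %ld pages\n", new_drift(128));  /* 32768 */
        return 0;
}

Unlike the per-thread error, the new bound depends only on machine size, not on how many threads the workload spawns.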

File tree: 8 files changed (+40, -107 lines)

include/linux/mm.h

Lines changed: 8 additions & 18 deletions
@@ -2052,40 +2052,30 @@ static inline bool get_user_page_fast_only(unsigned long addr,
  */
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
-        long val = atomic_long_read(&mm->rss_stat.count[member]);
-
-#ifdef SPLIT_RSS_COUNTING
-        /*
-         * counter is updated in asynchronous manner and may go to minus.
-         * But it's never be expected number for users.
-         */
-        if (val < 0)
-                val = 0;
-#endif
-        return (unsigned long)val;
+        return percpu_counter_read_positive(&mm->rss_stat[member]);
 }
 
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
+void mm_trace_rss_stat(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-        long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+        percpu_counter_add(&mm->rss_stat[member], value);
 
-        mm_trace_rss_stat(mm, member, count);
+        mm_trace_rss_stat(mm, member);
 }
 
 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-        long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+        percpu_counter_inc(&mm->rss_stat[member]);
 
-        mm_trace_rss_stat(mm, member, count);
+        mm_trace_rss_stat(mm, member);
 }
 
 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-        long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+        percpu_counter_dec(&mm->rss_stat[member]);
 
-        mm_trace_rss_stat(mm, member, count);
+        mm_trace_rss_stat(mm, member);
 }
 
 /* Optimized variant when page is already known not to be PageAnon */
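Note the asymmetry this hunk introduces: percpu_counter_read_positive() is the cheap, approximate read (it returns the shared count without folding other CPUs' per-cpu deltas, so it can lag by up to one batch per CPU), while exact readers must pay O(nr_cpus) for percpu_counter_sum(). A hypothetical exact counterpart to get_mm_counter(), not part of this patch, would look like:

/* hypothetical helper: exact but O(nr_cpus), for readers needing accuracy */
static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
{
        return percpu_counter_sum_positive(&mm->rss_stat[member]);
}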

include/linux/mm_types.h

Lines changed: 2 additions & 5 deletions
@@ -18,6 +18,7 @@
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
+#include <linux/percpu_counter.h>
 
 #include <asm/mmu.h>
 
@@ -626,11 +627,7 @@ struct mm_struct {
 
                unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
-               /*
-                * Special counters, in some configurations protected by the
-                * page_table_lock, in other configurations by being atomic.
-                */
-               struct mm_rss_stat rss_stat;
+               struct percpu_counter rss_stat[NR_MM_COUNTERS];
 
                struct linux_binfmt *binfmt;
 

include/linux/mm_types_task.h

Lines changed: 0 additions & 13 deletions
@@ -36,19 +36,6 @@ enum {
        NR_MM_COUNTERS
 };
 
-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
-/* per-thread cached information, */
-struct task_rss_stat {
-        int events;     /* for synchronization threshold */
-        int count[NR_MM_COUNTERS];
-};
-#endif /* USE_SPLIT_PTE_PTLOCKS */
-
-struct mm_rss_stat {
-        atomic_long_t count[NR_MM_COUNTERS];
-};
-
 struct page_frag {
        struct page *page;
 #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)

include/linux/percpu_counter.h

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@
 #include <linux/threads.h>
 #include <linux/percpu.h>
 #include <linux/types.h>
-#include <linux/gfp.h>
 
 /* percpu_counter batch for local add or sub */
 #define PERCPU_COUNTER_LOCAL_BATCH      INT_MAX

include/linux/sched.h

Lines changed: 0 additions & 3 deletions
@@ -870,9 +870,6 @@ struct task_struct {
        struct mm_struct                *mm;
        struct mm_struct                *active_mm;
 
-#ifdef SPLIT_RSS_COUNTING
-        struct task_rss_stat            rss_stat;
-#endif
        int                             exit_state;
        int                             exit_code;
        int                             exit_signal;

include/trace/events/kmem.h

Lines changed: 4 additions & 4 deletions
@@ -346,10 +346,9 @@ TRACE_MM_PAGES
 TRACE_EVENT(rss_stat,
 
        TP_PROTO(struct mm_struct *mm,
-                int member,
-                long count),
+                int member),
 
-        TP_ARGS(mm, member, count),
+        TP_ARGS(mm, member),
 
        TP_STRUCT__entry(
                __field(unsigned int, mm_id)
@@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat,
                __entry->mm_id = mm_ptr_to_hash(mm);
                __entry->curr = !!(current->mm == mm);
                __entry->member = member;
-                __entry->size = (count << PAGE_SHIFT);
+                __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
+                                                    << PAGE_SHIFT);
        ),
 
        TP_printk("mm_id=%u curr=%d type=%s size=%ldB",

kernel/fork.c

Lines changed: 15 additions & 1 deletion
@@ -753,7 +753,7 @@ static void check_mm(struct mm_struct *mm)
                 "Please make sure 'struct resident_page_types[]' is updated as well");
 
        for (i = 0; i < NR_MM_COUNTERS; i++) {
-                long x = atomic_long_read(&mm->rss_stat.count[i]);
+                long x = percpu_counter_sum(&mm->rss_stat[i]);
 
                if (unlikely(x))
                        pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -779,6 +779,8 @@ static void check_mm(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
+        int i;
+
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);
        WARN_ON_ONCE(mm == current->active_mm);
@@ -788,6 +790,9 @@ void __mmdrop(struct mm_struct *mm)
        check_mm(mm);
        put_user_ns(mm->user_ns);
        mm_pasid_drop(mm);
+
+        for (i = 0; i < NR_MM_COUNTERS; i++)
+                percpu_counter_destroy(&mm->rss_stat[i]);
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1107,6 +1112,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
 {
+        int i;
+
        mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
        mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
        atomic_set(&mm->mm_users, 1);
@@ -1148,10 +1155,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        if (init_new_context(p, mm))
                goto fail_nocontext;
 
+        for (i = 0; i < NR_MM_COUNTERS; i++)
+                if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+                        goto fail_pcpu;
+
        mm->user_ns = get_user_ns(user_ns);
        lru_gen_init_mm(mm);
        return mm;
 
+fail_pcpu:
+        while (i > 0)
+                percpu_counter_destroy(&mm->rss_stat[--i]);
 fail_nocontext:
        mm_free_pgd(mm);
 fail_nopgd:
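The fail_pcpu label above uses the standard partial-initialization unwind: on failure, i indexes the counter whose init failed, so walking i back down destroys exactly the counters that were successfully initialized. The same shape in isolation (an illustrative sketch, not kernel code verbatim):

static int init_counters(struct percpu_counter *c, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (percpu_counter_init(&c[i], 0, GFP_KERNEL))
                        goto fail;
        return 0;

fail:
        while (i > 0)                   /* c[i] itself was never initialized */
                percpu_counter_destroy(&c[--i]);
        return -ENOMEM;
}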

mm/memory.c

Lines changed: 11 additions & 62 deletions
@@ -162,58 +162,11 @@ static int __init init_zero_pfn(void)
 }
 early_initcall(init_zero_pfn);
 
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+void mm_trace_rss_stat(struct mm_struct *mm, int member)
 {
-        trace_rss_stat(mm, member, count);
+        trace_rss_stat(mm, member);
 }
 
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
-        int i;
-
-        for (i = 0; i < NR_MM_COUNTERS; i++) {
-                if (current->rss_stat.count[i]) {
-                        add_mm_counter(mm, i, current->rss_stat.count[i]);
-                        current->rss_stat.count[i] = 0;
-                }
-        }
-        current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
-        struct task_struct *task = current;
-
-        if (likely(task->mm == mm))
-                task->rss_stat.count[member] += val;
-        else
-                add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH  (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
-        if (unlikely(task != current))
-                return;
-        if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
-                sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-
-static void check_sync_rss_stat(struct task_struct *task)
-{
-}
-
-#endif /* SPLIT_RSS_COUNTING */
-
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -1857,7 +1810,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                return -EBUSY;
        /* Ok, finally just insert the thing.. */
        get_page(page);
-        inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+        inc_mm_counter(vma->vm_mm, mm_counter_file(page));
        page_add_file_rmap(page, vma, false);
        set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
        return 0;
@@ -3153,12 +3106,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
        if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                if (old_page) {
                        if (!PageAnon(old_page)) {
-                                dec_mm_counter_fast(mm,
-                                                mm_counter_file(old_page));
-                                inc_mm_counter_fast(mm, MM_ANONPAGES);
+                                dec_mm_counter(mm, mm_counter_file(old_page));
+                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
-                        inc_mm_counter_fast(mm, MM_ANONPAGES);
+                        inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
@@ -3965,8 +3917,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (should_try_to_free_swap(folio, vma, vmf->flags))
                folio_free_swap(folio);
 
-        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);
 
        /*
@@ -4146,7 +4098,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }
 
-        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, vmf->address);
        lru_cache_add_inactive_or_unevictable(page, vma);
 setpte:
@@ -4336,11 +4288,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
                entry = pte_mkuffd_wp(pte_wrprotect(entry));
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
-                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+                inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
                page_add_new_anon_rmap(page, vma, addr);
                lru_cache_add_inactive_or_unevictable(page, vma);
        } else {
-                inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+                inc_mm_counter(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page, vma, false);
        }
        set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
@@ -5192,9 +5144,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        count_vm_event(PGFAULT);
        count_memcg_event_mm(vma->vm_mm, PGFAULT);
 
-        /* do counter updates before entering really critical section. */
-        check_sync_rss_stat(current);
-
        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                       flags & FAULT_FLAG_INSTRUCTION,
                                       flags & FAULT_FLAG_REMOTE))
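The block deleted at the top of this file is the whole SPLIT_RSS_COUNTING machinery, and it also exhibits the unbounded-drift bug the changelog mentions: inc_mm_counter_fast() only buffered into current->rss_stat, and the flush (check_sync_rss_stat()) ran solely from handle_mm_fault(). Callers such as insert_page_into_pte_locked() reached via vm_insert_pages() update counters outside the fault path, so their cached deltas were never flushed. A miniature userspace model of that failure mode (illustrative only, not the removed code):

#include <stdio.h>

#define TASK_RSS_EVENTS_THRESH 64

struct mm   { long rss; };                      /* the shared counter */
struct task { struct mm *mm; int events; long cached; };

/* models inc_mm_counter_fast(): buffer in the per-thread cache */
static void inc_fast(struct task *t)
{
        t->cached++;
}

/* models check_sync_rss_stat(): only the page-fault path called this */
static void fault_sync(struct task *t)
{
        if (t->events++ > TASK_RSS_EVENTS_THRESH) {
                t->mm->rss += t->cached;
                t->cached = 0;
                t->events = 0;
        }
}

int main(void)
{
        struct mm mm = { 0 };
        struct task t = { &mm, 0, 0 };
        int i;

        /* vm_insert_pages()-style updates: no page fault, so no sync */
        for (i = 0; i < 100000; i++)
                inc_fast(&t);

        fault_sync(&t);         /* a single later fault does not flush either */
        printf("visible rss: %ld, hidden in thread cache: %ld\n",
               mm.rss, t.cached);       /* prints 0 and 100000 */
        return 0;
}

With the percpu_counter conversion, every update lands in a per-CPU counter that readers can always fold in, so no delta can stay hidden indefinitely.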
