Skip to content

Commit d4af56c

Browse files
howlett and akpm00
authored and committed
mm: start tracking VMAs with maple tree
Start tracking the VMAs with the new maple tree structure in parallel with the rb_tree. Add debug and trace events for maple tree operations and duplicate the rb_tree that is created on forks into the maple tree.

The maple tree is added to the mm_struct (including the mm_init struct), supported in the required mm/mmap functions, tracked in kernel/fork for process forking, and used to find the unmapped_area, with the result checked against what the rbtree finds.

This also moves the mmap_lock() in exit_mmap() since the oom reaper call does walk the VMAs. Otherwise lockdep will be unhappy if oom happens.

When splitting a vma fails due to allocations of the maple tree nodes, the error path in __split_vma() calls new->vm_ops->close(new). The page accounting for hugetlb is actually in the close() operation, so it accounts for the removal of 1/2 of the VMA which was not adjusted. This results in a negative exit value. To avoid the negative charge, set vm_start = vm_end and vm_pgoff = 0.

There is also a potential accounting issue in special mappings from insert_vm_struct() failing to allocate, so reverse the charge there in the failure scenario.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Liam R. Howlett <[email protected]>
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Tested-by: Yu Zhao <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: David Howells <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: SeongJae Park <[email protected]>
Cc: Sven Schnelle <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Will Deacon <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent e15e06a commit d4af56c

File tree

9 files changed

+435
-36
lines changed

9 files changed

+435
-36
lines changed

arch/x86/kernel/tboot.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ void __init tboot_probe(void)
9696
static pgd_t *tboot_pg_dir;
9797
static struct mm_struct tboot_mm = {
9898
.mm_rb = RB_ROOT,
99+
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
99100
.pgd = swapper_pg_dir,
100101
.mm_users = ATOMIC_INIT(2),
101102
.mm_count = ATOMIC_INIT(1),

drivers/firmware/efi/efi.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;
5858

5959
struct mm_struct efi_mm = {
6060
.mm_rb = RB_ROOT,
61+
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock),
6162
.mm_users = ATOMIC_INIT(2),
6263
.mm_count = ATOMIC_INIT(1),
6364
.write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq),

include/linux/mm.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2567,6 +2567,8 @@ extern bool arch_has_descending_max_zone_pfns(void);
25672567
/* nommu.c */
25682568
extern atomic_long_t mmap_pages_allocated;
25692569
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
2570+
/* mmap.c */
2571+
void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
25702572

25712573
/* interval_tree.c */
25722574
void vma_interval_tree_insert(struct vm_area_struct *node,
@@ -2630,6 +2632,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
26302632
bool *need_rmap_locks);
26312633
extern void exit_mmap(struct mm_struct *);
26322634

2635+
void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
2636+
void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas);
2637+
26332638
static inline int check_data_rlimit(unsigned long rlim,
26342639
unsigned long new,
26352640
unsigned long start,

include/linux/mm_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <linux/list.h>
1010
#include <linux/spinlock.h>
1111
#include <linux/rbtree.h>
12+
#include <linux/maple_tree.h>
1213
#include <linux/rwsem.h>
1314
#include <linux/completion.h>
1415
#include <linux/cpumask.h>
@@ -486,6 +487,7 @@ struct kioctx_table;
486487
struct mm_struct {
487488
struct {
488489
struct vm_area_struct *mmap; /* list of VMAs */
490+
struct maple_tree mm_mt;
489491
struct rb_root mm_rb;
490492
u64 vmacache_seqnum; /* per-thread vmacache */
491493
#ifdef CONFIG_MMU
@@ -697,6 +699,7 @@ struct mm_struct {
697699
unsigned long cpu_bitmap[];
698700
};
699701

702+
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
700703
extern struct mm_struct init_mm;
701704

702705
/* Pointer magic because the dynamic array size confuses some compilers. */

include/trace/events/mmap.h

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,79 @@ TRACE_EVENT(vm_unmapped_area,
4242
__entry->low_limit, __entry->high_limit, __entry->align_mask,
4343
__entry->align_offset)
4444
);
45+
46+
TRACE_EVENT(vma_mas_szero,
47+
TP_PROTO(struct maple_tree *mt, unsigned long start,
48+
unsigned long end),
49+
50+
TP_ARGS(mt, start, end),
51+
52+
TP_STRUCT__entry(
53+
__field(struct maple_tree *, mt)
54+
__field(unsigned long, start)
55+
__field(unsigned long, end)
56+
),
57+
58+
TP_fast_assign(
59+
__entry->mt = mt;
60+
__entry->start = start;
61+
__entry->end = end;
62+
),
63+
64+
TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,",
65+
__entry->mt,
66+
(unsigned long) __entry->start,
67+
(unsigned long) __entry->end
68+
)
69+
);
70+
71+
TRACE_EVENT(vma_store,
72+
TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma),
73+
74+
TP_ARGS(mt, vma),
75+
76+
TP_STRUCT__entry(
77+
__field(struct maple_tree *, mt)
78+
__field(struct vm_area_struct *, vma)
79+
__field(unsigned long, vm_start)
80+
__field(unsigned long, vm_end)
81+
),
82+
83+
TP_fast_assign(
84+
__entry->mt = mt;
85+
__entry->vma = vma;
86+
__entry->vm_start = vma->vm_start;
87+
__entry->vm_end = vma->vm_end - 1;
88+
),
89+
90+
TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,",
91+
__entry->mt, __entry->vma,
92+
(unsigned long) __entry->vm_start,
93+
(unsigned long) __entry->vm_end
94+
)
95+
);
96+
97+
98+
TRACE_EVENT(exit_mmap,
99+
TP_PROTO(struct mm_struct *mm),
100+
101+
TP_ARGS(mm),
102+
103+
TP_STRUCT__entry(
104+
__field(struct mm_struct *, mm)
105+
__field(struct maple_tree *, mt)
106+
),
107+
108+
TP_fast_assign(
109+
__entry->mm = mm;
110+
__entry->mt = &mm->mm_mt;
111+
),
112+
113+
TP_printk("mt_mod %p, DESTROY\n",
114+
__entry->mt
115+
)
116+
);
117+
45118
#endif
46119

47120
/* This part must be outside protection */

kernel/fork.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
585585
int retval;
586586
unsigned long charge;
587587
LIST_HEAD(uf);
588+
MA_STATE(mas, &mm->mm_mt, 0, 0);
588589

589590
uprobe_start_dup_mmap();
590591
if (mmap_write_lock_killable(oldmm)) {
@@ -614,6 +615,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
614615
goto out;
615616
khugepaged_fork(mm, oldmm);
616617

618+
retval = mas_expected_entries(&mas, oldmm->map_count);
619+
if (retval)
620+
goto out;
621+
617622
prev = NULL;
618623
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
619624
struct file *file;
@@ -629,7 +634,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
629634
*/
630635
if (fatal_signal_pending(current)) {
631636
retval = -EINTR;
632-
goto out;
637+
goto loop_out;
633638
}
634639
if (mpnt->vm_flags & VM_ACCOUNT) {
635640
unsigned long len = vma_pages(mpnt);
@@ -694,6 +699,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
694699
rb_link = &tmp->vm_rb.rb_right;
695700
rb_parent = &tmp->vm_rb;
696701

702+
/* Link the vma into the MT */
703+
mas.index = tmp->vm_start;
704+
mas.last = tmp->vm_end - 1;
705+
mas_store(&mas, tmp);
706+
697707
mm->map_count++;
698708
if (!(tmp->vm_flags & VM_WIPEONFORK))
699709
retval = copy_page_range(tmp, mpnt);
@@ -702,10 +712,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
702712
tmp->vm_ops->open(tmp);
703713

704714
if (retval)
705-
goto out;
715+
goto loop_out;
706716
}
707717
/* a new mm has just been created */
708718
retval = arch_dup_mmap(oldmm, mm);
719+
loop_out:
720+
mas_destroy(&mas);
709721
out:
710722
mmap_write_unlock(mm);
711723
flush_tlb_mm(oldmm);
@@ -721,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
721733
fail_nomem:
722734
retval = -ENOMEM;
723735
vm_unacct_memory(charge);
724-
goto out;
736+
goto loop_out;
725737
}
726738

727739
static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -1111,6 +1123,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
11111123
{
11121124
mm->mmap = NULL;
11131125
mm->mm_rb = RB_ROOT;
1126+
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
1127+
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
11141128
mm->vmacache_seqnum = 0;
11151129
atomic_set(&mm->mm_users, 1);
11161130
atomic_set(&mm->mm_count, 1);

mm/init-mm.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: GPL-2.0
22
#include <linux/mm_types.h>
33
#include <linux/rbtree.h>
4+
#include <linux/maple_tree.h>
45
#include <linux/rwsem.h>
56
#include <linux/spinlock.h>
67
#include <linux/list.h>
@@ -29,6 +30,7 @@
2930
*/
3031
struct mm_struct init_mm = {
3132
.mm_rb = RB_ROOT,
33+
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
3234
.pgd = swapper_pg_dir,
3335
.mm_users = ATOMIC_INIT(2),
3436
.mm_count = ATOMIC_INIT(1),

0 commit comments

Comments
 (0)