
Commit 38a7601

walken-google authored and torvalds committed
mm: avoid taking rmap locks in move_ptes()
During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one.

When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order.

Signed-off-by: Michel Lespinasse <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Daniel Santos <[email protected]>
Cc: Hugh Dickins <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 523d4e2 commit 38a7601
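
In code terms, the ordering argument above boils down to the test the patch adds in copy_vma() (see the mm/mmap.c hunk below). The following is just an illustrative restatement of that one line, not extra code from the commit:

	/*
	 * Skipping the rmap locks is only safe when the destination vma is
	 * known to come after the source in rmap traversal order, i.e. when
	 * its vm_pgoff is strictly greater; if vma_merge() gave us a
	 * destination with an equal or lower vm_pgoff, fall back to taking
	 * the locks in move_ptes().
	 */
	*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);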

File tree

4 files changed: +49 -23 lines changed

fs/exec.c

Lines changed: 1 addition & 1 deletion

@@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	 * process cleanup to remove whatever mess we made.
 	 */
 	if (length != move_page_tables(vma, old_start,
-				       vma, new_start, length))
+				       vma, new_start, length, false))
 		return -ENOMEM;
 
 	lru_add_drain();

include/linux/mm.h

Lines changed: 4 additions & 2 deletions

@@ -1060,7 +1060,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len);
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks);
 extern unsigned long do_mremap(unsigned long addr,
 		unsigned long old_len, unsigned long new_len,
 		unsigned long flags, unsigned long new_addr);
@@ -1410,7 +1411,8 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-	unsigned long addr, unsigned long len, pgoff_t pgoff);
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);

mm/mmap.c

Lines changed: 5 additions & 2 deletions

@@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-	unsigned long addr, unsigned long len, pgoff_t pgoff)
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks)
 {
 	struct vm_area_struct *vma = *vmap;
 	unsigned long vma_start = vma->vm_start;
@@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			 * linear if there are no pages mapped yet.
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
-			*vmap = new_vma;
+			*vmap = vma = new_vma;
 		}
+		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
@@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+			*need_rmap_locks = false;
 		}
 	}
 	return new_vma;

mm/mremap.c

Lines changed: 39 additions & 18 deletions

@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr)
+		unsigned long new_addr, bool need_rmap_locks)
 {
 	struct address_space *mapping = NULL;
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma *anon_vma = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock truncate_pagecache
-		 * out, since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+	/*
+	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * locks to ensure that rmap will always observe either the old or the
+	 * new ptes. This is the easiest way to avoid races with
+	 * truncate_pagecache(), page migration, etc...
+	 *
+	 * When need_rmap_locks is false, we use other ways to avoid
+	 * such races:
+	 *
+	 * - During exec() shift_arg_pages(), we use a specially tagged vma
+	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *
+	 * - During mremap(), new_vma is often known to be placed after vma
+	 *   in rmap traversal order. This ensures rmap will always observe
+	 *   either the old pte, or the new pte, or both (the page table locks
+	 *   serialize access to individual ptes, but only rmap traversal
+	 *   order guarantees that we won't miss both the old and new ptes).
+	 */
+	if (need_rmap_locks) {
+		if (vma->vm_file) {
+			mapping = vma->vm_file->f_mapping;
+			mutex_lock(&mapping->i_mmap_mutex);
+		}
+		if (vma->anon_vma) {
+			anon_vma = vma->anon_vma;
+			anon_vma_lock(anon_vma);
+		}
 	}
-	if (anon_vma)
-		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len)
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks)
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		if (extent > LATENCY_LIMIT)
 			extent = LATENCY_LIMIT;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-			  new_vma, new_pmd, new_addr);
+			  new_vma, new_pmd, new_addr, need_rmap_locks);
 		need_flush = true;
 	}
 	if (likely(need_flush))
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	unsigned long hiwater_vm;
 	int split = 0;
 	int err;
+	bool need_rmap_locks;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		return err;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+			   &need_rmap_locks);
 	if (!new_vma)
 		return -ENOMEM;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+				     need_rmap_locks);
 	if (moved_len < old_len) {
 		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+				 true);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
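
Taken together, the caller-side contract established by the mmap.c and mremap.c hunks can be summarized as follows (a condensed, illustrative sketch of the code added above, not an additional change):

	bool need_rmap_locks;

	/* copy_vma() reports whether the destination might precede the source
	 * in rmap traversal order. */
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks);

	/* The forward move may skip the rmap locks when the ordering holds. */
	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks);

	/* The rollback after a partial failure moves ptes in the opposite
	 * direction, so it always takes the rmap locks. */
	if (moved_len < old_len)
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true);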

0 commit comments
