
Commit bf181b9

walken-google authored and torvalds committed
mm anon rmap: replace same_anon_vma linked list with an interval tree.
When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list. This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevant vmas before finding the one a given anon page might fall into.

By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again.

While the change is large, all of its pieces are fairly simple.

Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead. The exception here is ksm, where the page's index is not known. It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there.

When updating vmas that already have an anon_vma assigned, we must take care to re-index the corresponding avcs on their interval tree. This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avcs from their interval tree before the update and re-insert them after the update. The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated.

Signed-off-by: Michel Lespinasse <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Daniel Santos <[email protected]>
Cc: Hugh Dickins <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 108d664 commit bf181b9
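For a sense of the lookup pattern this change introduces, the following is a minimal sketch assembled from the hunks below; it is not code from the patch itself, and vma_address() plus the anon_vma locking helpers are existing rmap code that the patch leaves unchanged:

	/*
	 * Walk only the vmas whose pgoff interval can contain 'page',
	 * instead of every vma chained on the anon_vma, as the old
	 * same_anon_vma list walk did.
	 */
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct anon_vma_chain *avc;

	anon_vma_lock(anon_vma);
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		/* ... act on the mapping of 'page' at 'address' in 'vma' ... */
	}
	anon_vma_unlock(anon_vma);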

File tree

8 files changed, +114 -41 lines


include/linux/mm.h

Lines changed: 14 additions & 0 deletions
@@ -20,6 +20,7 @@
 
 struct mempolicy;
 struct anon_vma;
+struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
@@ -1377,6 +1378,19 @@ static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
 	list_add_tail(&vma->shared.nonlinear, list);
 }
 
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+				   struct rb_root *root);
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+				   struct rb_root *root);
+struct anon_vma_chain *anon_vma_interval_tree_iter_first(
+	struct rb_root *root, unsigned long start, unsigned long last);
+struct anon_vma_chain *anon_vma_interval_tree_iter_next(
+	struct anon_vma_chain *node, unsigned long start, unsigned long last);
+
+#define anon_vma_interval_tree_foreach(avc, root, start, last)		 \
+	for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
+	     avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
+
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
 extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,

include/linux/rmap.h

Lines changed: 6 additions & 5 deletions
@@ -37,14 +37,14 @@ struct anon_vma {
 	atomic_t refcount;
 
 	/*
-	 * NOTE: the LSB of the head.next is set by
+	 * NOTE: the LSB of the rb_root.rb_node is set by
 	 * mm_take_all_locks() _after_ taking the above lock. So the
-	 * head must only be read/written after taking the above lock
+	 * rb_root must only be read/written after taking the above lock
 	 * to be sure to see a valid next pointer. The LSB bit itself
 	 * is serialized by a system wide lock only visible to
 	 * mm_take_all_locks() (mm_all_locks_mutex).
 	 */
-	struct list_head head;	/* Chain of private "related" vmas */
+	struct rb_root rb_root;	/* Interval tree of private "related" vmas */
 };
 
 /*
@@ -57,14 +57,15 @@ struct anon_vma {
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
- * The "same_anon_vma" list contains the anon_vma_chains
+ * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_sem & page_table_lock */
-	struct list_head same_anon_vma;	/* locked by anon_vma->mutex */
+	struct rb_node rb;		/* locked by anon_vma->mutex */
+	unsigned long rb_subtree_last;
};

#ifdef CONFIG_MMU

mm/huge_memory.c

Lines changed: 3 additions & 2 deletions
@@ -1375,13 +1375,14 @@ static void __split_huge_page(struct page *page,
 			      struct anon_vma *anon_vma)
 {
 	int mapcount, mapcount2;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct anon_vma_chain *avc;
 
 	BUG_ON(!PageHead(page));
 	BUG_ON(PageTail(page));
 
 	mapcount = 0;
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));
@@ -1407,7 +1408,7 @@ static void __split_huge_page(struct page *page,
 	__split_huge_page_refcount(page);
 
 	mapcount2 = 0;
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));

mm/interval_tree.c

Lines changed: 14 additions & 0 deletions
@@ -8,6 +8,7 @@
 
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/rmap.h>
 #include <linux/interval_tree_generic.h>
 
 static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
@@ -57,3 +58,16 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
 	rb_insert_augmented(&node->shared.linear.rb, root,
 			    &vma_interval_tree_augment);
 }
+
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+	return vma_start_pgoff(avc->vma);
+}
+
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+	return vma_last_pgoff(avc->vma);
+}
+
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+		     avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree)
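For context, the avc interval used above is simply its vma's pgoff range. The vma_start_pgoff()/vma_last_pgoff() helpers that avc_start_pgoff()/avc_last_pgoff() delegate to live earlier in mm/interval_tree.c and compute roughly the following (a sketch for illustration, not part of this patch):

	static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
	{
		return v->vm_pgoff;
	}

	static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
	{
		/* last page offset covered by [vm_start, vm_end), inclusive */
		return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
	}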

mm/ksm.c

Lines changed: 6 additions & 3 deletions
@@ -1618,7 +1618,8 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 		struct vm_area_struct *vma;
 
 		anon_vma_lock(anon_vma);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
@@ -1671,7 +1672,8 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 		struct vm_area_struct *vma;
 
 		anon_vma_lock(anon_vma);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
@@ -1723,7 +1725,8 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
 		struct vm_area_struct *vma;
 
 		anon_vma_lock(anon_vma);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)

mm/memory-failure.c

Lines changed: 4 additions & 1 deletion
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct anon_vma *av;
+	pgoff_t pgoff;
 
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
+	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
 
 		if (!task_early_kill(tsk))
 			continue;
-		list_for_each_entry(vmac, &av->head, same_anon_vma) {
+		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+					       pgoff, pgoff) {
 			vma = vmac->vma;
 			if (!page_mapped_in_vma(page, vma))
 				continue;

mm/mmap.c

Lines changed: 55 additions & 18 deletions
@@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc;
+
+	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc;
+
+	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
			   unsigned long end, struct vm_area_struct **pprev,
			   struct rb_node ***rb_link, struct rb_node **rb_parent)
@@ -565,20 +597,17 @@ again: remove_next = 1 + (end > next->vm_end);
 
 	vma_adjust_trans_huge(vma, start, end, adjust_next);
 
-	/*
-	 * When changing only vma->vm_end, we don't really need anon_vma
-	 * lock. This is a fairly rare case by itself, but the anon_vma
-	 * lock may be shared between many sibling processes. Skipping
-	 * the lock for brk adjustments makes a difference sometimes.
-	 */
-	if (vma->anon_vma && (importer || start != vma->vm_start)) {
-		anon_vma = vma->anon_vma;
+	anon_vma = vma->anon_vma;
+	if (!anon_vma && adjust_next)
+		anon_vma = next->anon_vma;
+	if (anon_vma) {
 		VM_BUG_ON(adjust_next && next->anon_vma &&
			  anon_vma != next->anon_vma);
-	} else if (adjust_next && next->anon_vma)
-		anon_vma = next->anon_vma;
-	if (anon_vma)
 		anon_vma_lock(anon_vma);
+		anon_vma_interval_tree_pre_update_vma(vma);
+		if (adjust_next)
+			anon_vma_interval_tree_pre_update_vma(next);
+	}
 
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
@@ -619,8 +648,12 @@ again: remove_next = 1 + (end > next->vm_end);
 		__insert_vm_struct(mm, insert);
 	}
 
-	if (anon_vma)
+	if (anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		if (adjust_next)
+			anon_vma_interval_tree_post_update_vma(next);
 		anon_vma_unlock(anon_vma);
+	}
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
 			error = acct_stack_growth(vma, size, grow);
 			if (!error) {
+				anon_vma_interval_tree_pre_update_vma(vma);
 				vma->vm_end = address;
+				anon_vma_interval_tree_post_update_vma(vma);
 				perf_event_mmap(vma);
 			}
 		}
@@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma,
 		if (grow <= vma->vm_pgoff) {
 			error = acct_stack_growth(vma, size, grow);
 			if (!error) {
+				anon_vma_interval_tree_pre_update_vma(vma);
 				vma->vm_start = address;
 				vma->vm_pgoff -= grow;
+				anon_vma_interval_tree_post_update_vma(vma);
 				perf_event_mmap(vma);
 			}
 		}
@@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
-	if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
 		/*
 		 * The LSB of head.next can't change from under us
 		 * because we hold the mm_all_locks_mutex.
@@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 		 * anon_vma->root->mutex.
 		 */
 		if (__test_and_set_bit(0, (unsigned long *)
-				       &anon_vma->root->head.next))
+				       &anon_vma->root->rb_root.rb_node))
 			BUG();
 	}
 }
@@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid to take the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
@@ -2619,21 +2656,21 @@ int mm_take_all_locks(struct mm_struct *mm)
 
 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-	if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
 		/*
 		 * The LSB of head.next can't change to 0 from under
 		 * us because we hold the mm_all_locks_mutex.
 		 *
 		 * We must however clear the bitflag before unlocking
-		 * the vma so the users using the anon_vma->head will
+		 * the vma so the users using the anon_vma->rb_root will
		 * never see our bitflag.
		 *
		 * No need of atomic instructions here, head.next
		 * can't change from under us until we release the
		 * anon_vma->root->mutex.
		 */
 		if (!__test_and_clear_bit(0, (unsigned long *)
-					  &anon_vma->root->head.next))
+					  &anon_vma->root->rb_root.rb_node))
 			BUG();
 		anon_vma_unlock(anon_vma);
 	}
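Putting the mm/mmap.c pieces together, the re-index protocol described in the changelog looks roughly like the following. This is a sketch, not code from the patch; new_start and new_pgoff are placeholder values, and in the hunks above the anon_vma lock is already held by the callers (vma_adjust() and the stack-growth paths) rather than taken inline like this:

	anon_vma_lock(vma->anon_vma);
	anon_vma_interval_tree_pre_update_vma(vma);	/* take the vma's avcs out of the tree */
	vma->vm_start = new_start;			/* update vm_start / vm_end / vm_pgoff */
	vma->vm_pgoff = new_pgoff;
	anon_vma_interval_tree_post_update_vma(vma);	/* re-insert them with the new interval */
	anon_vma_unlock(vma->anon_vma);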
