Commit d8d55f5

Muchun Song authored and torvalds committed
mm: sparsemem: use page table lock to protect kernel pmd operations
The init_mm.page_table_lock is used to protect kernel page tables, so we can use it, instead of the mmap write lock, to serialize the splitting of vmemmap PMD mappings. This increases the concurrency of vmemmap_remap_free() and hence the concurrency between allocations of HugeTLB pages. But that is not the only benefit: there are many users of the mmap read lock on init_mm, and since the mmap write lock was held across vmemmap_remap_free(), dropping that usage means it no longer blocks those readers. Nothing gets worse, so the move is always a win.

The kernel page table walker now walks pmd entries without holding the page_table_lock, so a pmd entry it reads may be inconsistent: the entry can change from a huge pmd to a PTE page table underneath the walker. There is only one user of the kernel page table walker, namely ptdump, and it already copes with this by caching the value of the pmd entry in a local variable. However, we also need to set ->action to ACTION_CONTINUE so that the walker does not walk every pte entry again after a concurrent thread has split the huge pmd.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Muchun Song <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Bodeddula Balasubramaniam <[email protected]>
Cc: Chen Huang <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Fam Zheng <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Oscar Salvador <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Xiongchun Duan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent a6b4085 commit d8d55f5

2 files changed: +43, -20 lines

mm/ptdump.c

Lines changed: 12 additions & 4 deletions

@@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 0, pgd_val(val));
 
-	if (pgd_leaf(val))
+	if (pgd_leaf(val)) {
 		st->note_page(st, addr, 0, pgd_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
@@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 1, p4d_val(val));
 
-	if (p4d_leaf(val))
+	if (p4d_leaf(val)) {
 		st->note_page(st, addr, 1, p4d_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
@@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 2, pud_val(val));
 
-	if (pud_leaf(val))
+	if (pud_leaf(val)) {
 		st->note_page(st, addr, 2, pud_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
@@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
 
 	if (st->effective_prot)
 		st->effective_prot(st, 3, pmd_val(val));
-	if (pmd_leaf(val))
+	if (pmd_leaf(val)) {
 		st->note_page(st, addr, 3, pmd_val(val));
+		walk->action = ACTION_CONTINUE;
+	}
 
 	return 0;
 }
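
For context on the walker side: ptdump's callbacks take a single READ_ONCE() snapshot of the entry and make every decision against that snapshot; ACTION_CONTINUE then stops the walk from descending into a PTE table that a concurrent split may have installed after the snapshot was taken. A minimal sketch of the idiom (the callback name is illustrative, not part of the patch):

static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	/* One consistent snapshot; *pmd may change under us. */
	pmd_t val = READ_ONCE(*pmd);

	if (pmd_leaf(val)) {
		/* Report the huge mapping using 'val', never '*pmd'. */
		walk->action = ACTION_CONTINUE;	/* do not walk the pte level */
	}

	return 0;
}

Without ACTION_CONTINUE, a pmd that is split between the snapshot and the walker's descent would be reported twice: once as a huge entry and again as its individual pte entries.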

mm/sparse-vmemmap.c

Lines changed: 31 additions & 16 deletions

@@ -53,8 +53,7 @@ struct vmemmap_remap_walk {
 	struct list_head *vmemmap_pages;
 };
 
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
-				  struct vmemmap_remap_walk *walk)
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 {
 	pmd_t __pmd;
 	int i;
@@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 
-	/* Make pte visible before pmd. See comment in pmd_install(). */
-	smp_wmb();
-	pmd_populate_kernel(&init_mm, pmd, pgtable);
-
-	flush_tlb_kernel_range(start, start + PMD_SIZE);
+	spin_lock(&init_mm.page_table_lock);
+	if (likely(pmd_leaf(*pmd))) {
+		/* Make pte visible before pmd. See comment in pmd_install(). */
+		smp_wmb();
+		pmd_populate_kernel(&init_mm, pmd, pgtable);
+		flush_tlb_kernel_range(start, start + PMD_SIZE);
+	} else {
+		pte_free_kernel(&init_mm, pgtable);
+	}
+	spin_unlock(&init_mm.page_table_lock);
 
 	return 0;
 }
 
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+	int leaf;
+
+	spin_lock(&init_mm.page_table_lock);
+	leaf = pmd_leaf(*pmd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	if (!leaf)
+		return 0;
+
+	return __split_vmemmap_huge_pmd(pmd, start);
+}
+
 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 			      unsigned long end,
 			      struct vmemmap_remap_walk *walk)
@@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
 
 	pmd = pmd_offset(pud, addr);
 	do {
-		if (pmd_leaf(*pmd)) {
-			int ret;
+		int ret;
+
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		if (ret)
+			return ret;
 
-			ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
-			if (ret)
-				return ret;
-		}
 		next = pmd_addr_end(addr, end);
 		vmemmap_pte_range(pmd, addr, next, walk);
 	} while (pmd++, addr = next, addr != end);
@@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end,
 	 */
 	BUG_ON(start - reuse != PAGE_SIZE);
 
-	mmap_write_lock(&init_mm);
+	mmap_read_lock(&init_mm);
 	ret = vmemmap_remap_range(reuse, end, &walk);
-	mmap_write_downgrade(&init_mm);
-
 	if (ret && walk.nr_walked) {
 		end = reuse + walk.nr_walked * PAGE_SIZE;
 		/*
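
The split path above is a check/re-check pattern: split_vmemmap_huge_pmd() peeks at pmd_leaf() under the lock and bails out cheaply if the PMD is already split, __split_vmemmap_huge_pmd() builds the replacement PTE table outside the lock, and the second pmd_leaf() check under init_mm.page_table_lock decides the race between two concurrent splitters. A condensed sketch of the decisive section (not a literal excerpt; the table population loop is elided):

	/* Build the replacement table outside the lock. */
	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
	/* ... fill its PTRS_PER_PTE entries from the huge mapping ... */

	spin_lock(&init_mm.page_table_lock);
	if (pmd_leaf(*pmd)) {
		/* We won: make the ptes visible, then install the table. */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		/* Lost the race: another thread split this PMD first. */
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

Either way both threads return with the PMD split, so callers need no retry logic, and vmemmap_remap_free() can run under mmap_read_lock() because the pmd-level update, the only part that can race with other splitters, is serialized by page_table_lock.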
