Skip to content

Commit b191f9b

Browse files
Mel Gorman authored and Linus Torvalds committed
mm: numa: preserve PTE write permissions across a NUMA hinting fault
Protecting a PTE to trap a NUMA hinting fault clears the writable bit and further faults are needed after trapping a NUMA hinting fault to set the writable bit again. This patch preserves the writable bit when trapping NUMA hinting faults. The impact is obvious from the number of minor faults trapped during the basis balancing benchmark and the system CPU usage; autonumabench 4.0.0-rc4 4.0.0-rc4 baseline preserve Time System-NUMA01 107.13 ( 0.00%) 103.13 ( 3.73%) Time System-NUMA01_THEADLOCAL 131.87 ( 0.00%) 83.30 ( 36.83%) Time System-NUMA02 8.95 ( 0.00%) 10.72 (-19.78%) Time System-NUMA02_SMT 4.57 ( 0.00%) 3.99 ( 12.69%) Time Elapsed-NUMA01 515.78 ( 0.00%) 517.26 ( -0.29%) Time Elapsed-NUMA01_THEADLOCAL 384.10 ( 0.00%) 384.31 ( -0.05%) Time Elapsed-NUMA02 48.86 ( 0.00%) 48.78 ( 0.16%) Time Elapsed-NUMA02_SMT 47.98 ( 0.00%) 48.12 ( -0.29%) 4.0.0-rc4 4.0.0-rc4 baseline preserve User 44383.95 43971.89 System 252.61 201.24 Elapsed 998.68 1000.94 Minor Faults 2597249 1981230 Major Faults 365 364 There is a similar drop in system CPU usage using Dave Chinner's xfsrepair workload 4.0.0-rc4 4.0.0-rc4 baseline preserve Amean real-xfsrepair 454.14 ( 0.00%) 442.36 ( 2.60%) Amean syst-xfsrepair 277.20 ( 0.00%) 204.68 ( 26.16%) The patch looks hacky but the alternatives looked worse. The tidiest was to rewalk the page tables after a hinting fault but it was more complex than this approach and the performance was worse. It's not generally safe to just mark the page writable during the fault if it's a write fault as it may have been read-only for COW so that approach was discarded. Signed-off-by: Mel Gorman <[email protected]> Reported-by: Dave Chinner <[email protected]> Tested-by: Dave Chinner <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Aneesh Kumar <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent bea66fb commit b191f9b

File tree

3 files changed

+14
-6
lines changed

3 files changed

+14
-6
lines changed

mm/huge_memory.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
12601260
int target_nid, last_cpupid = -1;
12611261
bool page_locked;
12621262
bool migrated = false;
1263+
bool was_writable;
12631264
int flags = 0;
12641265

12651266
/* A PROT_NONE fault should not end up here */
@@ -1354,7 +1355,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
13541355
goto out;
13551356
clear_pmdnuma:
13561357
BUG_ON(!PageLocked(page));
1358+
was_writable = pmd_write(pmd);
13571359
pmd = pmd_modify(pmd, vma->vm_page_prot);
1360+
if (was_writable)
1361+
pmd = pmd_mkwrite(pmd);
13581362
set_pmd_at(mm, haddr, pmdp, pmd);
13591363
update_mmu_cache_pmd(vma, addr, pmdp);
13601364
unlock_page(page);
@@ -1478,6 +1482,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
14781482

14791483
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
14801484
pmd_t entry;
1485+
bool preserve_write = prot_numa && pmd_write(*pmd);
14811486
ret = 1;
14821487

14831488
/*
@@ -1493,9 +1498,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
14931498
if (!prot_numa || !pmd_protnone(*pmd)) {
14941499
entry = pmdp_get_and_clear_notify(mm, addr, pmd);
14951500
entry = pmd_modify(entry, newprot);
1501+
if (preserve_write)
1502+
entry = pmd_mkwrite(entry);
14961503
ret = HPAGE_PMD_NR;
14971504
set_pmd_at(mm, addr, pmd, entry);
1498-
BUG_ON(pmd_write(entry));
1505+
BUG_ON(!preserve_write && pmd_write(entry));
14991506
}
15001507
spin_unlock(ptl);
15011508
}

mm/memory.c

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3035,6 +3035,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
30353035
int last_cpupid;
30363036
int target_nid;
30373037
bool migrated = false;
3038+
bool was_writable = pte_write(pte);
30383039
int flags = 0;
30393040

30403041
/* A PROT_NONE fault should not end up here */
@@ -3059,6 +3060,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
30593060
/* Make it present again */
30603061
pte = pte_modify(pte, vma->vm_page_prot);
30613062
pte = pte_mkyoung(pte);
3063+
if (was_writable)
3064+
pte = pte_mkwrite(pte);
30623065
set_pte_at(mm, addr, ptep, pte);
30633066
update_mmu_cache(vma, addr, ptep);
30643067

@@ -3075,11 +3078,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
30753078
* to it but pte_write gets cleared during protection updates and
30763079
* pte_dirty has unpredictable behaviour between PTE scan updates,
30773080
* background writeback, dirty balancing and application behaviour.
3078-
*
3079-
* TODO: Note that the ideal here would be to avoid a situation where a
3080-
* NUMA fault is taken immediately followed by a write fault in
3081-
* some cases which would have lower overhead overall but would be
3082-
* invasive as the fault paths would need to be unified.
30833081
*/
30843082
if (!(vma->vm_flags & VM_WRITE))
30853083
flags |= TNF_NO_GROUP;

mm/mprotect.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
7575
oldpte = *pte;
7676
if (pte_present(oldpte)) {
7777
pte_t ptent;
78+
bool preserve_write = prot_numa && pte_write(oldpte);
7879

7980
/*
8081
* Avoid trapping faults against the zero or KSM
@@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
9495

9596
ptent = ptep_modify_prot_start(mm, addr, pte);
9697
ptent = pte_modify(ptent, newprot);
98+
if (preserve_write)
99+
ptent = pte_mkwrite(ptent);
97100

98101
/* Avoid taking write faults for known dirty pages */
99102
if (dirty_accountable && pte_dirty(ptent) &&

0 commit comments

Comments
 (0)