Skip to content

Commit 1d4832b

Browse files
yuzhao (Google)
authored and akpm00 committed
mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify()
When the MM_WALK capability is enabled, memory that is mostly accessed by
a VM appears younger than it really is, therefore this memory will be less
likely to be evicted.  As a result, the presence of a running VM can
significantly increase swap-outs for non-VM memory, regressing the
performance for the rest of the system.

Fix this regression by always calling {ptep,pmdp}_clear_young_notify()
whenever we clear the young bits on PMDs/PTEs.

[[email protected]: fix link-time error]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: bd74fda ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <[email protected]>
Signed-off-by: James Houghton <[email protected]>
Reported-by: David Stevens <[email protected]>
Cc: Axel Rasmussen <[email protected]>
Cc: David Matlack <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Oliver Upton <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: Wei Xu <[email protected]>
Cc: <[email protected]>
Cc: kernel test robot <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent ddd6d8e commit 1d4832b

File tree

3 files changed

+55
-47
lines changed

3 files changed

+55
-47
lines changed

include/linux/mmzone.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ struct lru_gen_memcg {
555555

556556
void lru_gen_init_pgdat(struct pglist_data *pgdat);
557557
void lru_gen_init_lruvec(struct lruvec *lruvec);
558-
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
558+
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
559559

560560
void lru_gen_init_memcg(struct mem_cgroup *memcg);
561561
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -574,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
574574
{
575575
}
576576

577-
static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
577+
static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
578578
{
579+
return false;
579580
}
580581

581582
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)

mm/rmap.c

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -885,13 +885,10 @@ static bool folio_referenced_one(struct folio *folio,
885885
return false;
886886
}
887887

888-
if (pvmw.pte) {
889-
if (lru_gen_enabled() &&
890-
pte_young(ptep_get(pvmw.pte))) {
891-
lru_gen_look_around(&pvmw);
888+
if (lru_gen_enabled() && pvmw.pte) {
889+
if (lru_gen_look_around(&pvmw))
892890
referenced++;
893-
}
894-
891+
} else if (pvmw.pte) {
895892
if (ptep_clear_flush_young_notify(vma, address,
896893
pvmw.pte))
897894
referenced++;

mm/vmscan.c

Lines changed: 49 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include <linux/khugepaged.h>
5757
#include <linux/rculist_nulls.h>
5858
#include <linux/random.h>
59+
#include <linux/mmu_notifier.h>
5960

6061
#include <asm/tlbflush.h>
6162
#include <asm/div64.h>
@@ -3294,7 +3295,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
32943295
return false;
32953296
}
32963297

3297-
static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
3298+
static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
3299+
struct pglist_data *pgdat)
32983300
{
32993301
unsigned long pfn = pte_pfn(pte);
33003302

@@ -3306,13 +3308,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
33063308
if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
33073309
return -1;
33083310

3311+
if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
3312+
return -1;
3313+
33093314
if (WARN_ON_ONCE(!pfn_valid(pfn)))
33103315
return -1;
33113316

3317+
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3318+
return -1;
3319+
33123320
return pfn;
33133321
}
33143322

3315-
static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
3323+
static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
3324+
struct pglist_data *pgdat)
33163325
{
33173326
unsigned long pfn = pmd_pfn(pmd);
33183327

@@ -3324,9 +3333,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
33243333
if (WARN_ON_ONCE(pmd_devmap(pmd)))
33253334
return -1;
33263335

3336+
if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
3337+
return -1;
3338+
33273339
if (WARN_ON_ONCE(!pfn_valid(pfn)))
33283340
return -1;
33293341

3342+
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3343+
return -1;
3344+
33303345
return pfn;
33313346
}
33323347

@@ -3335,10 +3350,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
33353350
{
33363351
struct folio *folio;
33373352

3338-
/* try to avoid unnecessary memory loads */
3339-
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3340-
return NULL;
3341-
33423353
folio = pfn_folio(pfn);
33433354
if (folio_nid(folio) != pgdat->node_id)
33443355
return NULL;
@@ -3394,20 +3405,16 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
33943405
total++;
33953406
walk->mm_stats[MM_LEAF_TOTAL]++;
33963407

3397-
pfn = get_pte_pfn(ptent, args->vma, addr);
3408+
pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
33983409
if (pfn == -1)
33993410
continue;
34003411

3401-
if (!pte_young(ptent)) {
3402-
continue;
3403-
}
3404-
34053412
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
34063413
if (!folio)
34073414
continue;
34083415

3409-
if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
3410-
VM_WARN_ON_ONCE(true);
3416+
if (!ptep_clear_young_notify(args->vma, addr, pte + i))
3417+
continue;
34113418

34123419
young++;
34133420
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3473,21 +3480,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
34733480
/* don't round down the first address */
34743481
addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
34753482

3476-
pfn = get_pmd_pfn(pmd[i], vma, addr);
3477-
if (pfn == -1)
3483+
if (!pmd_present(pmd[i]))
34783484
goto next;
34793485

34803486
if (!pmd_trans_huge(pmd[i])) {
3481-
if (!walk->force_scan && should_clear_pmd_young())
3487+
if (!walk->force_scan && should_clear_pmd_young() &&
3488+
!mm_has_notifiers(args->mm))
34823489
pmdp_test_and_clear_young(vma, addr, pmd + i);
34833490
goto next;
34843491
}
34853492

3493+
pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
3494+
if (pfn == -1)
3495+
goto next;
3496+
34863497
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
34873498
if (!folio)
34883499
goto next;
34893500

3490-
if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
3501+
if (!pmdp_clear_young_notify(vma, addr, pmd + i))
34913502
goto next;
34923503

34933504
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3545,24 +3556,18 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
35453556
}
35463557

35473558
if (pmd_trans_huge(val)) {
3548-
unsigned long pfn = pmd_pfn(val);
35493559
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3560+
unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
35503561

35513562
walk->mm_stats[MM_LEAF_TOTAL]++;
35523563

3553-
if (!pmd_young(val)) {
3554-
continue;
3555-
}
3556-
3557-
/* try to avoid unnecessary memory loads */
3558-
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3559-
continue;
3560-
3561-
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3564+
if (pfn != -1)
3565+
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
35623566
continue;
35633567
}
35643568

3565-
if (!walk->force_scan && should_clear_pmd_young()) {
3569+
if (!walk->force_scan && should_clear_pmd_young() &&
3570+
!mm_has_notifiers(args->mm)) {
35663571
if (!pmd_young(val))
35673572
continue;
35683573

@@ -4036,13 +4041,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
40364041
* the PTE table to the Bloom filter. This forms a feedback loop between the
40374042
* eviction and the aging.
40384043
*/
4039-
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
4044+
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40404045
{
40414046
int i;
40424047
unsigned long start;
40434048
unsigned long end;
40444049
struct lru_gen_mm_walk *walk;
4045-
int young = 0;
4050+
int young = 1;
40464051
pte_t *pte = pvmw->pte;
40474052
unsigned long addr = pvmw->address;
40484053
struct vm_area_struct *vma = pvmw->vma;
@@ -4058,19 +4063,25 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40584063
lockdep_assert_held(pvmw->ptl);
40594064
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
40604065

4066+
if (!ptep_clear_young_notify(vma, addr, pte))
4067+
return false;
4068+
40614069
if (spin_is_contended(pvmw->ptl))
4062-
return;
4070+
return true;
40634071

40644072
/* exclude special VMAs containing anon pages from COW */
40654073
if (vma->vm_flags & VM_SPECIAL)
4066-
return;
4074+
return true;
40674075

40684076
/* avoid taking the LRU lock under the PTL when possible */
40694077
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
40704078

40714079
start = max(addr & PMD_MASK, vma->vm_start);
40724080
end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
40734081

4082+
if (end - start == PAGE_SIZE)
4083+
return true;
4084+
40744085
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
40754086
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
40764087
end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4084,7 +4095,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40844095

40854096
/* folio_update_gen() requires stable folio_memcg() */
40864097
if (!mem_cgroup_trylock_pages(memcg))
4087-
return;
4098+
return true;
40884099

40894100
arch_enter_lazy_mmu_mode();
40904101

@@ -4094,19 +4105,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40944105
unsigned long pfn;
40954106
pte_t ptent = ptep_get(pte + i);
40964107

4097-
pfn = get_pte_pfn(ptent, vma, addr);
4108+
pfn = get_pte_pfn(ptent, vma, addr, pgdat);
40984109
if (pfn == -1)
40994110
continue;
41004111

4101-
if (!pte_young(ptent))
4102-
continue;
4103-
41044112
folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
41054113
if (!folio)
41064114
continue;
41074115

4108-
if (!ptep_test_and_clear_young(vma, addr, pte + i))
4109-
VM_WARN_ON_ONCE(true);
4116+
if (!ptep_clear_young_notify(vma, addr, pte + i))
4117+
continue;
41104118

41114119
young++;
41124120

@@ -4136,6 +4144,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
41364144
/* feedback from rmap walkers to page table walkers */
41374145
if (mm_state && suitable_to_scan(i, young))
41384146
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
4147+
4148+
return true;
41394149
}
41404150

41414151
/******************************************************************************

0 commit comments

Comments (0)