--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3294,7 +3295,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
 	return false;
 }
 
-static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
+				 struct pglist_data *pgdat)
 {
 	unsigned long pfn = pte_pfn(pte);
 
@@ -3306,13 +3308,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
 	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
 		return -1;
 
+	if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
+		return -1;
+
 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
 		return -1;
 
+	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+		return -1;
+
 	return pfn;
 }
 
-static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
+				 struct pglist_data *pgdat)
 {
 	unsigned long pfn = pmd_pfn(pmd);
 
@@ -3324,9 +3333,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
 	if (WARN_ON_ONCE(pmd_devmap(pmd)))
 		return -1;
 
+	if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
+		return -1;
+
 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
 		return -1;
 
+	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+		return -1;
+
 	return pfn;
 }
 
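Note: taken together, the two hunks above move all of the cheap filtering into get_pte_pfn() and get_pmd_pfn(): an entry that is not young is skipped outright unless the mm has MMU notifiers registered (a secondary MMU such as KVM may still have accessed the page), and a pfn outside the node being scanned is rejected before get_pfn_folio() ever loads a struct page. A minimal userspace sketch of that early-exit ordering; every type and helper below is a hypothetical stand-in, not the kernel API:

#include <stdbool.h>

/* stand-in for the node span carried by struct pglist_data */
struct node_span { unsigned long start_pfn, end_pfn; };

/* sketch of the new filter order: cheapest, lock-free checks first */
static unsigned long filter_pfn(unsigned long pfn, bool young,
				bool has_notifiers, const struct node_span *node)
{
	/* not accessed by the CPU and no secondary MMU to consult: skip */
	if (!young && !has_notifiers)
		return -1;

	/* outside the node this walk targets: avoid touching struct page */
	if (pfn < node->start_pfn || pfn >= node->end_pfn)
		return -1;

	return pfn;
}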
@@ -3335,10 +3350,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
 {
 	struct folio *folio;
 
-	/* try to avoid unnecessary memory loads */
-	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
-		return NULL;
-
 	folio = pfn_folio(pfn);
 	if (folio_nid(folio) != pgdat->node_id)
 		return NULL;
@@ -3394,20 +3405,16 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 		total++;
 		walk->mm_stats[MM_LEAF_TOTAL]++;
 
-		pfn = get_pte_pfn(ptent, args->vma, addr);
+		pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
 		if (pfn == -1)
 			continue;
 
-		if (!pte_young(ptent)) {
-			continue;
-		}
-
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
 		if (!folio)
 			continue;
 
-		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		if (!ptep_clear_young_notify(args->vma, addr, pte + i))
+			continue;
 
 		young++;
 		walk->mm_stats[MM_LEAF_YOUNG]++;
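Note: the separate pte_young() test disappears because get_pte_pfn() now performs it, and ptep_test_and_clear_young() is replaced by ptep_clear_young_notify(), which additionally consults mmu_notifier_clear_young(); a page now counts as young if either the CPU or a secondary MMU referenced it. A sketch of that combined contract (stub predicates with hypothetical names):

#include <stdbool.h>

/* hypothetical stand-ins: the real helpers act on a PTE and an address range */
static bool cpu_test_and_clear_young(void) { return true; }
static bool notifier_clear_young(void)     { return false; }

/* models ptep_clear_young_notify(): note the non-short-circuiting |, so the
 * notifier side is always queried and cleared even when the CPU bit was set */
static bool clear_young_notify(void)
{
	return cpu_test_and_clear_young() | notifier_clear_young();
}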
@@ -3473,21 +3480,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
 		/* don't round down the first address */
 		addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
 
-		pfn = get_pmd_pfn(pmd[i], vma, addr);
-		if (pfn == -1)
+		if (!pmd_present(pmd[i]))
 			goto next;
 
 		if (!pmd_trans_huge(pmd[i])) {
-			if (!walk->force_scan && should_clear_pmd_young())
+			if (!walk->force_scan && should_clear_pmd_young() &&
+			    !mm_has_notifiers(args->mm))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
 		}
 
+		pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
+		if (pfn == -1)
+			goto next;
+
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
 		if (!folio)
 			goto next;
 
-		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+		if (!pmdp_clear_young_notify(vma, addr, pmd + i))
 			goto next;
 
 		walk->mm_stats[MM_LEAF_YOUNG]++;
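Note: the pfn lookup moves below the !pmd_trans_huge() fast path because get_pmd_pfn() now tests pmd_young(), which is only meaningful once the entry is known to map a huge page; a non-huge entry merely has its accessed bit cleared, and only when no notifier is registered, since a bare pmdp_test_and_clear_young() would bypass the secondary MMU. A condensed sketch of one loop iteration, with hypothetical stubs:

#include <stdbool.h>

/* hypothetical stand-ins for the kernel predicates used above */
static bool entry_present(void)   { return true; }
static bool entry_is_huge(void)   { return true; }
static bool has_notifiers(void)   { return false; }
static void clear_cpu_young(void) { }

/* condensed control flow of one iteration in walk_pmd_range_locked() */
static void scan_one_pmd(void)
{
	if (!entry_present())
		return;				/* next */

	if (!entry_is_huge()) {
		if (!has_notifiers())
			clear_cpu_young();	/* safe: no secondary MMU to ask */
		return;				/* next */
	}

	/* only now is the pfn derived and the young/notify filter applied */
}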
@@ -3545,24 +3556,18 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
 		}
 
 		if (pmd_trans_huge(val)) {
-			unsigned long pfn = pmd_pfn(val);
 			struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+			unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
 
 			walk->mm_stats[MM_LEAF_TOTAL]++;
 
-			if (!pmd_young(val)) {
-				continue;
-			}
-
-			/* try to avoid unnecessary memory loads */
-			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
-				continue;
-
-			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+			if (pfn != -1)
+				walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
 			continue;
 		}
 
-		if (!walk->force_scan && should_clear_pmd_young()) {
+		if (!walk->force_scan && should_clear_pmd_young() &&
+		    !mm_has_notifiers(args->mm)) {
 			if (!pmd_young(val))
 				continue;
 
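Note: the open-coded pmd_young() and node-range tests vanish here because get_pmd_pfn() performs both; walk_pmd_range() only needs the pfn != -1 sentinel, where -1 stored in an unsigned long is ~0UL and cannot collide with a real pfn. A tiny self-contained illustration of that sentinel convention (helper name hypothetical):

#include <stdio.h>

/* -1 converted to unsigned long is ~0UL, distinct from any valid pfn */
static unsigned long lookup_pfn(int valid)
{
	return valid ? 0x1234UL : -1;
}

int main(void)
{
	unsigned long pfn = lookup_pfn(0);

	if (pfn == -1)	/* the int -1 converts to ~0UL before comparing */
		puts("skipped");
	return 0;
}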
@@ -4036,13 +4041,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 	int i;
 	unsigned long start;
 	unsigned long end;
 	struct lru_gen_mm_walk *walk;
-	int young = 0;
+	int young = 1;
 	pte_t *pte = pvmw->pte;
 	unsigned long addr = pvmw->address;
 	struct vm_area_struct *vma = pvmw->vma;
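Note: young now starts at 1 because the function only proceeds past its new entry check (next hunk) when the PTE that triggered the rmap walk was itself young, so that reference is counted up front; the bool return lets the rmap caller learn whether the folio was referenced without repeating the test. A hedged sketch of the new caller-side contract, all names hypothetical:

#include <stdbool.h>

/* models the new contract: the return value tells the rmap walker
 * whether the triggering PTE (and thus the folio) was referenced */
static bool look_around(bool pte_young_and_cleared)
{
	int young = 1;			/* the triggering PTE, counted up front */

	if (!pte_young_and_cleared)
		return false;		/* folio not referenced via this PTE */

	/* ... batch-scan the neighboring PTEs, incrementing young ... */
	(void)young;
	return true;
}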
@@ -4058,19 +4063,25 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	lockdep_assert_held(pvmw->ptl);
 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
+	if (!ptep_clear_young_notify(vma, addr, pte))
+		return false;
+
 	if (spin_is_contended(pvmw->ptl))
-		return;
+		return true;
 
 	/* exclude special VMAs containing anon pages from COW */
 	if (vma->vm_flags & VM_SPECIAL)
-		return;
+		return true;
 
 	/* avoid taking the LRU lock under the PTL when possible */
 	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
 
 	start = max(addr & PMD_MASK, vma->vm_start);
 	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
 
+	if (end - start == PAGE_SIZE)
+		return true;
+
 	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
 		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
 			end = start + MIN_LRU_BATCH * PAGE_SIZE;
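Note: the new end - start == PAGE_SIZE early return catches the case where clamping the PMD-sized look-around window to the VMA leaves only the one page already handled above, so batch scanning would be pointless. A standalone check of that arithmetic, assuming 4 KiB pages and 2 MiB PMDs as on x86-64 (addresses hypothetical):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SIZE	(512 * PAGE_SIZE)
#define PMD_MASK	(~(PMD_SIZE - 1))

int main(void)
{
	/* a hypothetical single-page VMA inside one PMD-sized window */
	unsigned long vm_start = 0x201000UL, vm_end = 0x202000UL;
	unsigned long addr = 0x201000UL;

	unsigned long start = addr & PMD_MASK;
	unsigned long end = (addr | ~PMD_MASK) + 1;

	/* clamp the window to the VMA, as the kernel's max()/min() do */
	if (start < vm_start)
		start = vm_start;
	if (end > vm_end)
		end = vm_end;

	/* prints "4096": one page left, so look-around would return early */
	printf("%lu\n", end - start);
	return 0;
}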
@@ -4084,7 +4095,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 	/* folio_update_gen() requires stable folio_memcg() */
 	if (!mem_cgroup_trylock_pages(memcg))
-		return;
+		return true;
 
 	arch_enter_lazy_mmu_mode();
 
@@ -4094,19 +4105,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 		unsigned long pfn;
 		pte_t ptent = ptep_get(pte + i);
 
-		pfn = get_pte_pfn(ptent, vma, addr);
+		pfn = get_pte_pfn(ptent, vma, addr, pgdat);
 		if (pfn == -1)
 			continue;
 
-		if (!pte_young(ptent))
-			continue;
-
 		folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
 		if (!folio)
 			continue;
 
-		if (!ptep_test_and_clear_young(vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		if (!ptep_clear_young_notify(vma, addr, pte + i))
+			continue;
 
 		young++;
 
@@ -4136,6 +4144,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	/* feedback from rmap walkers to page table walkers */
 	if (mm_state && suitable_to_scan(i, young))
 		update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+	return true;
 }
 
 /******************************************************************************