
Commit fec89c1

kiryl authored and torvalds committed
thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers
The freeze_page() and unfreeze_page() helpers have evolved into rather complex beasts. It would be nice to cut the complexity of this code.

This patch rewrites freeze_page() using the standard try_to_unmap() and unfreeze_page() using remove_migration_ptes(). The result is much simpler.

But the new variant is somewhat slower for PTE-mapped THPs. The current helpers iterate over the VMAs the compound page is mapped to, and then over the ptes within each VMA. The new helpers iterate over the small pages, then over the VMAs each small page is mapped to, and only then find the relevant pte. We have a short cut for PMD-mapped THP: migration entries are installed directly on PMD split.

I don't think the slowdown is critical, considering how much simpler the result is and that split_huge_page() is quite rare nowadays: it only happens due to memory pressure or migration.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent e388466 commit fec89c1

3 files changed, 50 insertions(+), 180 deletions(-)
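For orientation before the per-file hunks: after this patch the two helpers reduce to thin wrappers around the generic rmap walkers. The sketch below is condensed from the mm/huge_memory.c hunk that follows (the early exit on page_count() in the real code is omitted here):

	static void freeze_page(struct page *page)
	{
		enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
			TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
		int i, ret;

		/* TTU_SPLIT_HUGE_PMD is only needed for the first subpage */
		ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
		for (i = 1; !ret && i < HPAGE_PMD_NR; i++)
			ret = try_to_unmap(page + i, ttu_flags);
		VM_BUG_ON(ret);
	}

	static void unfreeze_page(struct page *page)
	{
		int i;

		for (i = 0; i < HPAGE_PMD_NR; i++)
			remove_migration_ptes(page + i, page + i, true);
	}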

include/linux/huge_mm.h

Lines changed: 6 additions & 4 deletions
@@ -96,18 +96,20 @@ static inline int split_huge_page(struct page *page)
 void deferred_split_huge_page(struct page *page);
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long address);
+		unsigned long address, bool freeze);
 
 #define split_huge_pmd(__vma, __pmd, __address)				\
 	do {								\
 		pmd_t *____pmd = (__pmd);				\
 		if (pmd_trans_huge(*____pmd)				\
					|| pmd_devmap(*____pmd))	\
-			__split_huge_pmd(__vma, __pmd, __address);	\
+			__split_huge_pmd(__vma, __pmd, __address,	\
+						false);			\
	}  while (0)
 
 
-void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address);
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+		bool freeze, struct page *page);
 
 #if HPAGE_PMD_ORDER >= MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
@@ -178,7 +180,7 @@ static inline void deferred_split_huge_page(struct page *page) {}
	do { } while (0)
 
 static inline void split_huge_pmd_address(struct vm_area_struct *vma,
-		unsigned long address) {}
+		unsigned long address, bool freeze, struct page *page) {}
 
 static inline int hugepage_madvise(struct vm_area_struct *vma,
				   unsigned long *vm_flags, int advice)
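The practical effect of the widened signatures is visible in the callers updated later in this patch: plain splits keep the old behaviour by passing freeze=false and no page, while the rmap path asks for migration entries and supplies the page to check the pmd against. Roughly:

	/* vma_adjust_trans_huge(): split only, no migration entries */
	split_huge_pmd_address(vma, start, false, NULL);

	/* try_to_unmap_one(): freeze to migration entries when migrating */
	split_huge_pmd_address(vma, address, flags & TTU_MIGRATION, page);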

mm/huge_memory.c

Lines changed: 36 additions & 174 deletions
@@ -2977,7 +2977,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long address)
+		unsigned long address, bool freeze)
 {
	spinlock_t *ptl;
	struct mm_struct *mm = vma->vm_mm;
@@ -2994,7 +2994,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
			page = NULL;
	} else if (!pmd_devmap(*pmd))
		goto out;
-	__split_huge_pmd_locked(vma, pmd, haddr, false);
+	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
 out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
@@ -3006,7 +3006,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
	}
 }
 
-void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address)
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+		bool freeze, struct page *page)
 {
	pgd_t *pgd;
	pud_t *pud;
@@ -3023,11 +3024,20 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address)
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
		return;
+
+	/*
+	 * If caller asks to setup a migration entries, we need a page to check
+	 * pmd against. Otherwise we can end up replacing wrong page.
+	 */
+	VM_BUG_ON(freeze && !page);
+	if (page && page != pmd_page(*pmd))
+		return;
+
	/*
	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
	 * materialize from under us.
	 */
-	split_huge_pmd(vma, pmd, address);
+	__split_huge_pmd(vma, pmd, address, freeze);
 }
 
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3043,7 +3053,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
	if (start & ~HPAGE_PMD_MASK &&
	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-		split_huge_pmd_address(vma, start);
+		split_huge_pmd_address(vma, start, false, NULL);
 
	/*
	 * If the new end address isn't hpage aligned and it could
@@ -3053,7 +3063,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
	if (end & ~HPAGE_PMD_MASK &&
	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-		split_huge_pmd_address(vma, end);
+		split_huge_pmd_address(vma, end, false, NULL);
 
	/*
	 * If we're also updating the vma->vm_next->vm_start, if the new
@@ -3067,184 +3077,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
		if (nstart & ~HPAGE_PMD_MASK &&
		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-			split_huge_pmd_address(next, nstart);
+			split_huge_pmd_address(next, nstart, false, NULL);
	}
 }
 
-static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
-		unsigned long address)
+static void freeze_page(struct page *page)
 {
-	unsigned long haddr = address & HPAGE_PMD_MASK;
-	spinlock_t *ptl;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int i, nr = HPAGE_PMD_NR;
-
-	/* Skip pages which doesn't belong to the VMA */
-	if (address < vma->vm_start) {
-		int off = (vma->vm_start - address) >> PAGE_SHIFT;
-		page += off;
-		nr -= off;
-		address = vma->vm_start;
-	}
-
-	pgd = pgd_offset(vma->vm_mm, address);
-	if (!pgd_present(*pgd))
-		return;
-	pud = pud_offset(pgd, address);
-	if (!pud_present(*pud))
-		return;
-	pmd = pmd_offset(pud, address);
-	ptl = pmd_lock(vma->vm_mm, pmd);
-	if (!pmd_present(*pmd)) {
-		spin_unlock(ptl);
-		return;
-	}
-	if (pmd_trans_huge(*pmd)) {
-		if (page == pmd_page(*pmd))
-			__split_huge_pmd_locked(vma, pmd, haddr, true);
-		spin_unlock(ptl);
-		return;
-	}
-	spin_unlock(ptl);
-
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
-		pte_t entry, swp_pte;
-		swp_entry_t swp_entry;
-
-		/*
-		 * We've just crossed page table boundary: need to map next one.
-		 * It can happen if THP was mremaped to non PMD-aligned address.
-		 */
-		if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
-			pte_unmap_unlock(pte - 1, ptl);
-			pmd = mm_find_pmd(vma->vm_mm, address);
-			if (!pmd)
-				return;
-			pte = pte_offset_map_lock(vma->vm_mm, pmd,
-					address, &ptl);
-		}
-
-		if (!pte_present(*pte))
-			continue;
-		if (page_to_pfn(page) != pte_pfn(*pte))
-			continue;
-		flush_cache_page(vma, address, page_to_pfn(page));
-		entry = ptep_clear_flush(vma, address, pte);
-		if (pte_dirty(entry))
-			SetPageDirty(page);
-		swp_entry = make_migration_entry(page, pte_write(entry));
-		swp_pte = swp_entry_to_pte(swp_entry);
-		if (pte_soft_dirty(entry))
-			swp_pte = pte_swp_mksoft_dirty(swp_pte);
-		set_pte_at(vma->vm_mm, address, pte, swp_pte);
-		page_remove_rmap(page, false);
-		put_page(page);
-	}
-	pte_unmap_unlock(pte - 1, ptl);
-}
-
-static void freeze_page(struct anon_vma *anon_vma, struct page *page)
-{
-	struct anon_vma_chain *avc;
-	pgoff_t pgoff = page_to_pgoff(page);
+	enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
+		TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+	int i, ret;
 
	VM_BUG_ON_PAGE(!PageHead(page), page);
 
-	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
-			pgoff + HPAGE_PMD_NR - 1) {
-		unsigned long address = __vma_address(page, avc->vma);
-
-		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-				address, address + HPAGE_PMD_SIZE);
-		freeze_page_vma(avc->vma, page, address);
-		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-				address, address + HPAGE_PMD_SIZE);
-	}
-}
-
-static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
-		unsigned long address)
-{
-	spinlock_t *ptl;
-	pmd_t *pmd;
-	pte_t *pte, entry;
-	swp_entry_t swp_entry;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
-	int i, nr = HPAGE_PMD_NR;
-
-	/* Skip pages which doesn't belong to the VMA */
-	if (address < vma->vm_start) {
-		int off = (vma->vm_start - address) >> PAGE_SHIFT;
-		page += off;
-		nr -= off;
-		address = vma->vm_start;
-	}
-
-	pmd = mm_find_pmd(vma->vm_mm, address);
-	if (!pmd)
-		return;
-
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
-		/*
-		 * We've just crossed page table boundary: need to map next one.
-		 * It can happen if THP was mremaped to non-PMD aligned address.
-		 */
-		if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
-			pte_unmap_unlock(pte - 1, ptl);
-			pmd = mm_find_pmd(vma->vm_mm, address);
-			if (!pmd)
-				return;
-			pte = pte_offset_map_lock(vma->vm_mm, pmd,
-					address, &ptl);
-		}
-
-		if (!is_swap_pte(*pte))
-			continue;
-
-		swp_entry = pte_to_swp_entry(*pte);
-		if (!is_migration_entry(swp_entry))
-			continue;
-		if (migration_entry_to_page(swp_entry) != page)
-			continue;
-
-		get_page(page);
-		page_add_anon_rmap(page, vma, address, false);
-
-		entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
-		if (PageDirty(page))
-			entry = pte_mkdirty(entry);
-		if (is_write_migration_entry(swp_entry))
-			entry = maybe_mkwrite(entry, vma);
-
-		flush_dcache_page(page);
-		set_pte_at(vma->vm_mm, address, pte, entry);
+	/* We only need TTU_SPLIT_HUGE_PMD once */
+	ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
+	for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
+		/* Cut short if the page is unmapped */
+		if (page_count(page) == 1)
+			return;
 
-		/* No need to invalidate - it was non-present before */
-		update_mmu_cache(vma, address, pte);
+		ret = try_to_unmap(page + i, ttu_flags);
	}
-	pte_unmap_unlock(pte - 1, ptl);
+	VM_BUG_ON(ret);
 }
 
-static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+static void unfreeze_page(struct page *page)
 {
-	struct anon_vma_chain *avc;
-	pgoff_t pgoff = page_to_pgoff(page);
-
-	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
-			pgoff, pgoff + HPAGE_PMD_NR - 1) {
-		unsigned long address = __vma_address(page, avc->vma);
+	int i;
 
-		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-				address, address + HPAGE_PMD_SIZE);
-		unfreeze_page_vma(avc->vma, page, address);
-		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-				address, address + HPAGE_PMD_SIZE);
-	}
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		remove_migration_ptes(page + i, page + i, true);
 }
 
 static void __split_huge_page_tail(struct page *head, int tail,
@@ -3322,7 +3184,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
	ClearPageCompound(head);
	spin_unlock_irq(&zone->lru_lock);
 
-	unfreeze_page(page_anon_vma(head), head);
+	unfreeze_page(head);
 
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		struct page *subpage = head + i;
@@ -3418,7 +3280,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
	}
 
	mlocked = PageMlocked(page);
-	freeze_page(anon_vma, head);
+	freeze_page(head);
	VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
	/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -3447,7 +3309,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
		BUG();
	} else {
		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-		unfreeze_page(anon_vma, head);
+		unfreeze_page(head);
		ret = -EBUSY;
	}
 

mm/rmap.c

Lines changed: 8 additions & 2 deletions
@@ -1431,8 +1431,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
		goto out;
 
-	if (flags & TTU_SPLIT_HUGE_PMD)
-		split_huge_pmd_address(vma, address);
+	if (flags & TTU_SPLIT_HUGE_PMD) {
+		split_huge_pmd_address(vma, address,
+				flags & TTU_MIGRATION, page);
+		/* check if we have anything to do after split */
+		if (page_mapcount(page) == 0)
+			goto out;
+	}
+
	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		goto out;
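Taken together, the three files route the split path through the generic rmap machinery roughly as follows (a condensed call chain pieced together from the hunks above, not literal code):

	split_huge_page_to_list(page)
	  freeze_page(head)
	    try_to_unmap(head, TTU_MIGRATION | ... | TTU_SPLIT_HUGE_PMD)
	      try_to_unmap_one()
	        split_huge_pmd_address(vma, address, flags & TTU_MIGRATION, page)
	          __split_huge_pmd(vma, pmd, address, freeze)  /* installs migration entries on PMD split */
	  __split_huge_page(page, list)
	    unfreeze_page(head)
	      remove_migration_ptes(page + i, page + i, true)  /* for each subpage */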
