Commit dce7d10

ioworker0 authored and akpm00 committed
mm/madvise: optimize lazyfreeing with mTHP in madvise_free
This patch optimizes lazyfreeing with PTE-mapped mTHP [1] (inspired by David Hildenbrand [2]). We aim to avoid unnecessary folio splitting if the large folio is fully mapped within the target range.

If a large folio is locked or shared, or if we fail to split it, we just leave it in place and advance to the next PTE in the range. Note that this changes the behavior: previously, any failure of this sort caused the entire operation to give up. As large folios become more common, sticking to the old way could waste opportunities.

On an Intel i5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of the same size results in the following runtimes for madvise(MADV_FREE), in seconds (shorter is better):

Folio Size |   Old    |   New    | Change
------------------------------------------
      4KiB | 0.590251 | 0.590259 |     0%
     16KiB | 2.990447 | 0.185655 |   -94%
     32KiB | 2.547831 | 0.104870 |   -95%
     64KiB | 2.457796 | 0.052812 |   -97%
    128KiB | 2.281034 | 0.032777 |   -99%
    256KiB | 2.230387 | 0.017496 |   -99%
    512KiB | 2.189106 | 0.010781 |   -99%
   1024KiB | 2.183949 | 0.007753 |   -99%
   2048KiB | 0.002799 | 0.002804 |     0%

[1] https://lkml.kernel.org/r/[email protected]
[2] https://lore.kernel.org/linux-mm/[email protected]

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Lance Yang <[email protected]>
Reviewed-by: Ryan Roberts <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Jeff Xie <[email protected]>
Cc: Kefeng Wang <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Muchun Song <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: Yin Fengwei <[email protected]>
Cc: Zach O'Keefe <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 96ebdb0 · commit dce7d10
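For context, the benchmark above can be approximated from userspace with a short program along the following lines. This is a minimal sketch, not the harness used for the table: it assumes a kernel with mTHP support and that the folio size under test has been enabled beforehand (e.g. via /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled), and it times a single madvise(MADV_FREE) call on a 1GiB anonymous mapping.

/*
 * Minimal sketch: time madvise(MADV_FREE) on a 1GiB anonymous VMA.
 * Assumes mTHP is enabled for the folio size under test so that the
 * range ends up backed by PTE-mapped large folios.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>

#define SIZE (1UL << 30)	/* 1 GiB */

int main(void)
{
	struct timespec t0, t1;
	char *buf;

	buf = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Hint that (m)THP-backed folios are wanted for this range. */
	madvise(buf, SIZE, MADV_HUGEPAGE);
	/* Fault in and dirty the whole range so folios are allocated. */
	memset(buf, 1, SIZE);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	if (madvise(buf, SIZE, MADV_FREE)) {	/* the call being measured */
		perror("madvise(MADV_FREE)");
		return 1;
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("MADV_FREE on 1GiB took %.6f s\n",
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);

	munmap(buf, SIZE);
	return 0;
}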


mm/madvise.c

Lines changed: 44 additions & 41 deletions
@@ -643,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                 unsigned long end, struct mm_walk *walk)
 
 {
+        const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
         struct mmu_gather *tlb = walk->private;
         struct mm_struct *mm = tlb->mm;
         struct vm_area_struct *vma = walk->vma;
@@ -697,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                         continue;
 
                 /*
-                 * If pmd isn't transhuge but the folio is large and
-                 * is owned by only this process, split it and
-                 * deactivate all pages.
+                 * If we encounter a large folio, only split it if it is not
+                 * fully mapped within the range we are operating on. Otherwise
+                 * leave it as is so that it can be marked as lazyfree. If we
+                 * fail to split a folio, leave it in place and advance to the
+                 * next pte in the range.
                  */
                 if (folio_test_large(folio)) {
-                        int err;
+                        bool any_young, any_dirty;
 
-                        if (folio_likely_mapped_shared(folio))
-                                break;
-                        if (!folio_trylock(folio))
-                                break;
-                        folio_get(folio);
-                        arch_leave_lazy_mmu_mode();
-                        pte_unmap_unlock(start_pte, ptl);
-                        start_pte = NULL;
-                        err = split_folio(folio);
-                        folio_unlock(folio);
-                        folio_put(folio);
-                        if (err)
-                                break;
-                        start_pte = pte =
-                                pte_offset_map_lock(mm, pmd, addr, &ptl);
-                        if (!start_pte)
-                                break;
-                        arch_enter_lazy_mmu_mode();
-                        pte--;
-                        addr -= PAGE_SIZE;
-                        continue;
+                        nr = madvise_folio_pte_batch(addr, end, folio, pte,
+                                                     ptent, &any_young, &any_dirty);
+
+                        if (nr < folio_nr_pages(folio)) {
+                                int err;
+
+                                if (folio_likely_mapped_shared(folio))
+                                        continue;
+                                if (!folio_trylock(folio))
+                                        continue;
+                                folio_get(folio);
+                                arch_leave_lazy_mmu_mode();
+                                pte_unmap_unlock(start_pte, ptl);
+                                start_pte = NULL;
+                                err = split_folio(folio);
+                                folio_unlock(folio);
+                                folio_put(folio);
+                                pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+                                start_pte = pte;
+                                if (!start_pte)
+                                        break;
+                                arch_enter_lazy_mmu_mode();
+                                if (!err)
+                                        nr = 0;
+                                continue;
+                        }
+
+                        if (any_young)
+                                ptent = pte_mkyoung(ptent);
+                        if (any_dirty)
+                                ptent = pte_mkdirty(ptent);
                 }
 
                 if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
                         if (!folio_trylock(folio))
                                 continue;
                         /*
-                         * If folio is shared with others, we mustn't clear
-                         * the folio's dirty flag.
+                         * If we have a large folio at this point, we know it is
+                         * fully mapped so if its mapcount is the same as its
+                         * number of pages, it must be exclusive.
                          */
-                        if (folio_mapcount(folio) != 1) {
+                        if (folio_mapcount(folio) != folio_nr_pages(folio)) {
                                 folio_unlock(folio);
                                 continue;
                         }
@@ -750,19 +764,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                 }
 
                 if (pte_young(ptent) || pte_dirty(ptent)) {
-                        /*
-                         * Some of architecture(ex, PPC) don't update TLB
-                         * with set_pte_at and tlb_remove_tlb_entry so for
-                         * the portability, remap the pte with old|clean
-                         * after pte clearing.
-                         */
-                        ptent = ptep_get_and_clear_full(mm, addr, pte,
-                                                        tlb->fullmm);
-
-                        ptent = pte_mkold(ptent);
-                        ptent = pte_mkclean(ptent);
-                        set_pte_at(mm, addr, pte, ptent);
-                        tlb_remove_tlb_entry(tlb, pte, addr);
+                        clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
+                        tlb_remove_tlb_entries(tlb, pte, nr, addr);
                 }
                 folio_mark_lazyfree(folio);
         }
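As a usage note, not part of this commit: pages successfully marked lazyfree are reported in the LazyFree field of /proc/<pid>/smaps, which gives a quick way to confirm from userspace that an MADV_FREE call took effect. A minimal sketch, assuming the usual smaps line format with kB units:

/*
 * Sketch: sum the LazyFree fields across all mappings in
 * /proc/self/smaps. Illustrative only.
 */
#include <stdio.h>

static long lazyfree_kb(void)
{
	char line[256];
	long total = 0, kb;
	FILE *f = fopen("/proc/self/smaps", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		/* lines look like: "LazyFree:        1048576 kB" */
		if (sscanf(line, "LazyFree: %ld kB", &kb) == 1)
			total += kb;
	}
	fclose(f);
	return total;
}

int main(void)
{
	printf("LazyFree: %ld kB\n", lazyfree_kb());
	return 0;
}

Called right after the MADV_FREE in the benchmark sketch above, this should report roughly the size of the freed range until the pages are reclaimed or re-dirtied.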
