Skip to content

Commit eb1521d

Browse files
surenbaghdasaryan authored and akpm00 committed
userfaultfd: handle zeropage moves by UFFDIO_MOVE
Current implementation of UFFDIO_MOVE fails to move zeropages and returns EBUSY when it encounters one. We can handle them by mapping a zeropage at the destination and clearing the mapping at the source. This is done both for ordinary and for huge zeropages. Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Suren Baghdasaryan <[email protected]> Reported-by: kernel test robot <[email protected]> Reported-by: Dan Carpenter <[email protected]> Closes: https://lore.kernel.org/r/[email protected]/ Cc: Alexander Viro <[email protected]> Cc: Andrea Arcangeli <[email protected]> Cc: Axel Rasmussen <[email protected]> Cc: Brian Geffon <[email protected]> Cc: Christian Brauner <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Hugh Dickins <[email protected]> Cc: Jann Horn <[email protected]> Cc: Kalesh Singh <[email protected]> Cc: Liam R. Howlett <[email protected]> Cc: Lokesh Gidra <[email protected]> Cc: Matthew Wilcox <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Mike Rapoport (IBM) <[email protected]> Cc: Nicolas Geoffray <[email protected]> Cc: Peter Xu <[email protected]> Cc: Ryan Roberts <[email protected]> Cc: Shuah Khan <[email protected]> Cc: ZhangPeng <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent e777ae4 commit eb1521d

File tree

2 files changed

+98
-51
lines changed

2 files changed

+98
-51
lines changed

mm/huge_memory.c

Lines changed: 61 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2200,33 +2200,41 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 	}
 
 	src_page = pmd_page(src_pmdval);
-	if (unlikely(!PageAnonExclusive(src_page))) {
-		spin_unlock(src_ptl);
-		return -EBUSY;
-	}
 
-	src_folio = page_folio(src_page);
-	folio_get(src_folio);
+	if (!is_huge_zero_pmd(src_pmdval)) {
+		if (unlikely(!PageAnonExclusive(src_page))) {
+			spin_unlock(src_ptl);
+			return -EBUSY;
+		}
+
+		src_folio = page_folio(src_page);
+		folio_get(src_folio);
+	} else
+		src_folio = NULL;
+
 	spin_unlock(src_ptl);
 
 	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
 				src_addr + HPAGE_PMD_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
 
-	folio_lock(src_folio);
+	if (src_folio) {
+		folio_lock(src_folio);
 
-	/*
-	 * split_huge_page walks the anon_vma chain without the page
-	 * lock. Serialize against it with the anon_vma lock, the page
-	 * lock is not enough.
-	 */
-	src_anon_vma = folio_get_anon_vma(src_folio);
-	if (!src_anon_vma) {
-		err = -EAGAIN;
-		goto unlock_folio;
-	}
-	anon_vma_lock_write(src_anon_vma);
+		/*
+		 * split_huge_page walks the anon_vma chain without the page
+		 * lock. Serialize against it with the anon_vma lock, the page
+		 * lock is not enough.
+		 */
+		src_anon_vma = folio_get_anon_vma(src_folio);
+		if (!src_anon_vma) {
+			err = -EAGAIN;
+			goto unlock_folio;
+		}
+		anon_vma_lock_write(src_anon_vma);
+	} else
+		src_anon_vma = NULL;
 
 	dst_ptl = pmd_lockptr(mm, dst_pmd);
 	double_pt_lock(src_ptl, dst_ptl);
@@ -2235,45 +2243,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 		err = -EAGAIN;
 		goto unlock_ptls;
 	}
-	if (folio_maybe_dma_pinned(src_folio) ||
-	    !PageAnonExclusive(&src_folio->page)) {
-		err = -EBUSY;
-		goto unlock_ptls;
-	}
+	if (src_folio) {
+		if (folio_maybe_dma_pinned(src_folio) ||
+		    !PageAnonExclusive(&src_folio->page)) {
+			err = -EBUSY;
+			goto unlock_ptls;
+		}
 
-	if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
-	    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
-		err = -EBUSY;
-		goto unlock_ptls;
-	}
+		if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
+		    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
+			err = -EBUSY;
+			goto unlock_ptls;
+		}
 
-	folio_move_anon_rmap(src_folio, dst_vma);
-	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+		folio_move_anon_rmap(src_folio, dst_vma);
+		WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
 
-	src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
-	/* Folio got pinned from under us. Put it back and fail the move. */
-	if (folio_maybe_dma_pinned(src_folio)) {
-		set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
-		err = -EBUSY;
-		goto unlock_ptls;
-	}
+		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+		/* Folio got pinned from under us. Put it back and fail the move. */
+		if (folio_maybe_dma_pinned(src_folio)) {
+			set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
+			err = -EBUSY;
+			goto unlock_ptls;
+		}
 
-	_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
-	/* Follow mremap() behavior and treat the entry dirty after the move */
-	_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+		_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+		/* Follow mremap() behavior and treat the entry dirty after the move */
+		_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+	} else {
+		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+		_dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+	}
 	set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
 
 	src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
 	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
 unlock_ptls:
 	double_pt_unlock(src_ptl, dst_ptl);
-	anon_vma_unlock_write(src_anon_vma);
-	put_anon_vma(src_anon_vma);
+	if (src_anon_vma) {
+		anon_vma_unlock_write(src_anon_vma);
+		put_anon_vma(src_anon_vma);
+	}
 unlock_folio:
 	/* unblock rmap walks */
-	folio_unlock(src_folio);
+	if (src_folio)
+		folio_unlock(src_folio);
 	mmu_notifier_invalidate_range_end(&range);
-	folio_put(src_folio);
+	if (src_folio)
+		folio_put(src_folio);
 	return err;
 }
 #endif /* CONFIG_USERFAULTFD */

mm/userfaultfd.c

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -959,6 +959,33 @@ static int move_swap_pte(struct mm_struct *mm,
 	return 0;
 }
 
+static int move_zeropage_pte(struct mm_struct *mm,
+			     struct vm_area_struct *dst_vma,
+			     struct vm_area_struct *src_vma,
+			     unsigned long dst_addr, unsigned long src_addr,
+			     pte_t *dst_pte, pte_t *src_pte,
+			     pte_t orig_dst_pte, pte_t orig_src_pte,
+			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
+{
+	pte_t zero_pte;
+
+	double_pt_lock(dst_ptl, src_ptl);
+	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+		double_pt_unlock(dst_ptl, src_ptl);
+		return -EAGAIN;
+	}
+
+	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+					 dst_vma->vm_page_prot));
+	ptep_clear_flush(src_vma, src_addr, src_pte);
+	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
+	double_pt_unlock(dst_ptl, src_ptl);
+
+	return 0;
+}
+
+
 /*
  * The mmap_lock for reading is held by the caller. Just move the page
  * from src_pmd to dst_pmd if possible, and return true if succeeded
@@ -1041,6 +1068,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
 	}
 
 	if (pte_present(orig_src_pte)) {
+		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
+			err = move_zeropage_pte(mm, dst_vma, src_vma,
+					       dst_addr, src_addr, dst_pte, src_pte,
+					       orig_dst_pte, orig_src_pte,
+					       dst_ptl, src_ptl);
+			goto out;
+		}
+
 		/*
 		 * Pin and lock both source folio and anon_vma. Since we are in
 		 * RCU read section, we can't block, so on contention have to
@@ -1404,19 +1439,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 				err = -ENOENT;
 				break;
 			}
-			/* Avoid moving zeropages for now */
-			if (is_huge_zero_pmd(*src_pmd)) {
-				spin_unlock(ptl);
-				err = -EBUSY;
-				break;
-			}
 
 			/* Check if we can move the pmd without splitting it. */
 			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
 			    !pmd_none(dst_pmdval)) {
 				struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
 
-				if (!folio || !PageAnonExclusive(&folio->page)) {
+				if (!folio || (!is_huge_zero_page(&folio->page) &&
+					       !PageAnonExclusive(&folio->page))) {
 					spin_unlock(ptl);
 					err = -EBUSY;
 					break;

0 commit comments

Comments
 (0)