Skip to content

Commit ac492b9

Browse files
David Stevens authored and akpm00 committed
mm/khugepaged: skip shmem with userfaultfd
Make sure that collapse_file respects any userfaultfds registered with MODE_MISSING. If userspace has any such userfaultfds registered, then for any page which it knows to be missing, it may expect a UFFD_EVENT_PAGEFAULT. This means collapse_file needs to be careful when collapsing a shmem range would result in replacing an empty page with a THP, to avoid breaking userfaultfd. Synchronization when checking for userfaultfds in collapse_file is tricky because the mmap locks can't be used to prevent races with the registration of new userfaultfds. Instead, we provide synchronization by ensuring that userspace cannot observe the fact that pages are missing before we check for userfaultfds. Although this allows registration of a userfaultfd to race with collapse_file, it ensures that userspace cannot observe any page transitioning from missing to present after such a race occurs. This makes such a race indistinguishable from the collapse occurring immediately before the userfaultfd registration. The first step to provide this synchronization is to stop filling gaps during the loop iterating over the target range, since the page cache lock can be dropped during that loop. The second step is to fill the gaps with XA_RETRY_ENTRY after the page cache lock is acquired the final time, to avoid races with accesses to the page cache that only take the RCU read lock. The fact that we don't fill holes during the initial iteration means that collapse_file now has to handle faults occurring during the collapse. This is done by re-validating the number of missing pages after acquiring the page cache lock for the final time. This fix is targeted at khugepaged, but the change also applies to MADV_COLLAPSE. MADV_COLLAPSE on a range with a userfaultfd will now return EBUSY if there are any missing pages (instead of succeeding on shmem and returning EINVAL on anonymous memory).
There is also now a window during MADV_COLLAPSE where a fault on a missing page will cause the syscall to fail with EAGAIN. The fact that intermediate page cache state can no longer be observed before the rollback of a failed collapse is also technically a userspace-visible change (via at least SEEK_DATA and SEEK_END), but it is exceedingly unlikely that anything relies on being able to observe that transient state. Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: David Stevens <[email protected]> Acked-by: Peter Xu <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Hugh Dickins <[email protected]> Cc: Jiaqi Yan <[email protected]> Cc: "Kirill A. Shutemov" <[email protected]> Cc: Matthew Wilcox (Oracle) <[email protected]> Cc: Yang Shi <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent cae106d commit ac492b9

File tree

2 files changed

+81
-21
lines changed

2 files changed

+81
-21
lines changed

include/trace/events/huge_memory.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,8 @@
3838
EM( SCAN_TRUNCATED, "truncated") \
3939
EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
4040
EM( SCAN_STORE_FAILED, "store_failed") \
41-
EMe(SCAN_COPY_MC, "copy_poisoned_page")
41+
EM( SCAN_COPY_MC, "copy_poisoned_page") \
42+
EMe(SCAN_PAGE_FILLED, "page_filled")
4243

4344
#undef EM
4445
#undef EMe

mm/khugepaged.c

Lines changed: 79 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ enum scan_result {
5757
SCAN_PAGE_HAS_PRIVATE,
5858
SCAN_STORE_FAILED,
5959
SCAN_COPY_MC,
60+
SCAN_PAGE_FILLED,
6061
};
6162

6263
#define CREATE_TRACE_POINTS
@@ -1860,8 +1861,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
18601861
* - allocate and lock a new huge page;
18611862
* - scan page cache replacing old pages with the new one
18621863
* + swap/gup in pages if necessary;
1863-
* + fill in gaps;
18641864
* + keep old pages around in case rollback is required;
1865+
* - finalize updates to the page cache;
18651866
* - if replacing succeeds:
18661867
* + copy data over;
18671868
* + free old pages;
@@ -1939,7 +1940,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
19391940
result = SCAN_TRUNCATED;
19401941
goto xa_locked;
19411942
}
1942-
xas_set(&xas, index);
1943+
xas_set(&xas, index + 1);
19431944
}
19441945
if (!shmem_charge(mapping->host, 1)) {
19451946
result = SCAN_FAIL;
@@ -2176,22 +2177,66 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
21762177
index++;
21772178
}
21782179

2179-
/*
2180-
* Copying old pages to huge one has succeeded, now we
2181-
* need to free the old pages.
2182-
*/
2183-
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
2184-
list_del(&page->lru);
2185-
page->mapping = NULL;
2186-
page_ref_unfreeze(page, 1);
2187-
ClearPageActive(page);
2188-
ClearPageUnevictable(page);
2189-
unlock_page(page);
2190-
put_page(page);
2180+
if (nr_none) {
2181+
struct vm_area_struct *vma;
2182+
int nr_none_check = 0;
2183+
2184+
i_mmap_lock_read(mapping);
2185+
xas_lock_irq(&xas);
2186+
2187+
xas_set(&xas, start);
2188+
for (index = start; index < end; index++) {
2189+
if (!xas_next(&xas)) {
2190+
xas_store(&xas, XA_RETRY_ENTRY);
2191+
if (xas_error(&xas)) {
2192+
result = SCAN_STORE_FAILED;
2193+
goto immap_locked;
2194+
}
2195+
nr_none_check++;
2196+
}
2197+
}
2198+
2199+
if (nr_none != nr_none_check) {
2200+
result = SCAN_PAGE_FILLED;
2201+
goto immap_locked;
2202+
}
2203+
2204+
/*
2205+
* If userspace observed a missing page in a VMA with a MODE_MISSING
2206+
* userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
2207+
* page. If so, we need to roll back to avoid suppressing such an
2208+
* event. wp/minor userfaultfds don't give userspace any
2209+
* guarantees that the kernel doesn't fill a missing page with a zero
2210+
* page, so they don't matter here.
2211+
*
2212+
* Any userfaultfds registered after this point will not be able to
2213+
* observe any missing pages due to the previously inserted retry
2214+
* entries.
2215+
*/
2216+
vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
2217+
if (userfaultfd_missing(vma)) {
2218+
result = SCAN_EXCEED_NONE_PTE;
2219+
goto immap_locked;
2220+
}
2221+
}
2222+
2223+
immap_locked:
2224+
i_mmap_unlock_read(mapping);
2225+
if (result != SCAN_SUCCEED) {
2226+
xas_set(&xas, start);
2227+
for (index = start; index < end; index++) {
2228+
if (xas_next(&xas) == XA_RETRY_ENTRY)
2229+
xas_store(&xas, NULL);
2230+
}
2231+
2232+
xas_unlock_irq(&xas);
2233+
goto rollback;
2234+
}
2235+
} else {
2236+
xas_lock_irq(&xas);
21912237
}
21922238

21932239
nr = thp_nr_pages(hpage);
2194-
xas_lock_irq(&xas);
21952240
if (is_shmem)
21962241
__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
21972242
else
@@ -2221,6 +2266,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
22212266
result = retract_page_tables(mapping, start, mm, addr, hpage,
22222267
cc);
22232268
unlock_page(hpage);
2269+
2270+
/*
2271+
* The collapse has succeeded, so free the old pages.
2272+
*/
2273+
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
2274+
list_del(&page->lru);
2275+
page->mapping = NULL;
2276+
page_ref_unfreeze(page, 1);
2277+
ClearPageActive(page);
2278+
ClearPageUnevictable(page);
2279+
unlock_page(page);
2280+
put_page(page);
2281+
}
2282+
22242283
goto out;
22252284

22262285
rollback:
@@ -2232,15 +2291,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
22322291
}
22332292

22342293
xas_set(&xas, start);
2235-
xas_for_each(&xas, page, end - 1) {
2294+
end = index;
2295+
for (index = start; index < end; index++) {
2296+
xas_next(&xas);
22362297
page = list_first_entry_or_null(&pagelist,
22372298
struct page, lru);
22382299
if (!page || xas.xa_index < page->index) {
2239-
if (!nr_none)
2240-
break;
22412300
nr_none--;
2242-
/* Put holes back where they were */
2243-
xas_store(&xas, NULL);
22442301
continue;
22452302
}
22462303

@@ -2764,12 +2821,14 @@ static int madvise_collapse_errno(enum scan_result r)
27642821
case SCAN_ALLOC_HUGE_PAGE_FAIL:
27652822
return -ENOMEM;
27662823
case SCAN_CGROUP_CHARGE_FAIL:
2824+
case SCAN_EXCEED_NONE_PTE:
27672825
return -EBUSY;
27682826
/* Resource temporary unavailable - trying again might succeed */
27692827
case SCAN_PAGE_COUNT:
27702828
case SCAN_PAGE_LOCK:
27712829
case SCAN_PAGE_LRU:
27722830
case SCAN_DEL_PAGE_LRU:
2831+
case SCAN_PAGE_FILLED:
27732832
return -EAGAIN;
27742833
/*
27752834
* Other: Trying again likely not to succeed / error intrinsic to

0 commit comments

Comments
 (0)