
Commit a2e17cc

David Stevens authored and akpm00 committed
mm/khugepaged: maintain page cache uptodate flag
Make sure that collapse_file doesn't interfere with checking the
uptodate flag in the page cache by only inserting hpage into the page
cache after it has been updated and marked uptodate. This is achieved
by simply not replacing present pages with hpage when iterating over
the target range.

The present pages are already locked, so replacing them with the locked
hpage before the collapse is finalized is unnecessary. However, it is
necessary to stop freezing the present pages after validating them,
since leaving long-term frozen pages in the page cache can lead to
deadlocks. Simply checking the reference count is sufficient to ensure
that there are no long-term references hanging around that the collapse
would break. Similar to hpage, there is no reason that the present
pages actually need to be frozen in addition to being locked.

This fixes a race where folio_seek_hole_data would mistake hpage for a
fallocated but unwritten page. This race is visible to userspace via
data temporarily disappearing from SEEK_DATA/SEEK_HOLE. This also fixes
a similar race where pages could temporarily disappear from mincore.

Link: https://lkml.kernel.org/r/[email protected]
Fixes: f3f0e1d ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: David Stevens <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Jiaqi Yan <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Yang Shi <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
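As context for the races described above, a hypothetical userspace probe
(not part of the patch) could watch for the symptom: after real data has
been written to a tmpfs file, SEEK_DATA at offset 0 should always return 0
and mincore() should report the first page resident. The path, sizes, and
loop count below are illustrative only, and actually losing the race
requires khugepaged to collapse this range, which the sketch does not
arrange:

	/*
	 * Sketch of a probe for transient SEEK_DATA/mincore holes.
	 * Assumes /dev/shm is tmpfs and 4 KiB base pages.
	 */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		const size_t len = 2 * 1024 * 1024;	/* one PMD-sized range */
		char path[] = "/dev/shm/collapse-probe-XXXXXX";
		int fd = mkstemp(path);

		if (fd < 0) { perror("mkstemp"); return 1; }
		unlink(path);

		/* Write real data so the whole range is present, not a hole. */
		char *buf = malloc(len);
		memset(buf, 0x5a, len);
		if (pwrite(fd, buf, len, 0) != (ssize_t)len) { perror("pwrite"); return 1; }

		unsigned char vec[512];			/* 2 MiB / 4 KiB pages */
		for (int i = 0; i < 100000; i++) {
			/* Data was written at offset 0, so SEEK_DATA must not skip it. */
			if (lseek(fd, 0, SEEK_DATA) != 0)
				printf("iter %d: SEEK_DATA saw a transient hole\n", i);

			/* mincore() consults the page cache for file-backed mappings. */
			void *map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
			if (map == MAP_FAILED) { perror("mmap"); return 1; }
			if (mincore(map, len, vec) == 0 && !(vec[0] & 1))
				printf("iter %d: mincore lost page 0\n", i);
			munmap(map, len);
		}
		free(buf);
		return 0;
	}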
1 parent ac492b9 commit a2e17cc

File tree

1 file changed: +33 -52 lines changed

mm/khugepaged.c

Lines changed: 33 additions & 52 deletions
@@ -1859,17 +1859,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
  *
  * Basic scheme is simple, details are more complex:
  *  - allocate and lock a new huge page;
- *  - scan page cache replacing old pages with the new one
+ *  - scan page cache, locking old pages
  *    + swap/gup in pages if necessary;
- *    + keep old pages around in case rollback is required;
+ *  - copy data to new page
+ *  - handle shmem holes
+ *    + re-validate that holes weren't filled by someone else
+ *    + check for userfaultfd
  *  - finalize updates to the page cache;
  *  - if replacing succeeds:
- *    + copy data over;
- *    + free old pages;
  *    + unlock huge page;
+ *    + free old pages;
  *  - if replacing failed;
- *    + put all pages back and unfreeze them;
- *    + restore gaps in the page cache;
+ *    + unlock old pages
  *    + unlock and free huge page;
  */
 static int collapse_file(struct mm_struct *mm, unsigned long addr,
@@ -1917,12 +1918,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                }
        } while (1);

-       /*
-        * At this point the hpage is locked and not up-to-date.
-        * It's safe to insert it into the page cache, because nobody would
-        * be able to map it or use it in another way until we unlock it.
-        */
-
        xas_set(&xas, start);
        for (index = start; index < end; index++) {
                page = xas_next(&xas);
@@ -2090,29 +2085,31 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                VM_BUG_ON_PAGE(page != xas_load(&xas), page);

                /*
-                * The page is expected to have page_count() == 3:
+                * We control three references to the page:
                 *  - we hold a pin on it;
                 *  - one reference from page cache;
                 *  - one from isolate_lru_page;
+                * If those are the only references, then any new usage of the
+                * page will have to fetch it from the page cache. That requires
+                * locking the page to handle truncate, so any new usage will be
+                * blocked until we unlock page after collapse/during rollback.
                 */
-               if (!page_ref_freeze(page, 3)) {
+               if (page_count(page) != 3) {
                        result = SCAN_PAGE_COUNT;
                        xas_unlock_irq(&xas);
                        putback_lru_page(page);
                        goto out_unlock;
                }

                /*
-                * Add the page to the list to be able to undo the collapse if
-                * something go wrong.
+                * Accumulate the pages that are being collapsed.
                 */
                list_add_tail(&page->lru, &pagelist);

-               /* Finally, replace with the new page. */
-               xas_store(&xas, hpage);
-               /* We can't get an ENOMEM here (because the allocation happened before)
-                * but let's check for errors (XArray implementation can be
-                * changed in the future)
+               /*
+                * We can't get an ENOMEM here (because the allocation happened
+                * before) but let's check for errors (XArray implementation
+                * can be changed in the future)
                 */
                WARN_ON_ONCE(xas_error(&xas));
                continue;
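For intuition, the switch from page_ref_freeze() to a plain page_count()
check in the hunk above can be modeled with C11 atomics. This is a rough
analogy, not kernel code: page_ref_freeze() is a compare-and-swap of the
count to 0, which actively blocks new get_page()-style users, while the
new code merely verifies the count and relies on the page lock to exclude
new users:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int refcount = 3;

	/* Old approach: freeze the count to 0 so new takers must wait/fail. */
	static bool ref_freeze(int expected)
	{
		int old = expected;
		return atomic_compare_exchange_strong(&refcount, &old, 0);
	}

	/* New approach: only check for unexpected extra references. */
	static bool ref_check(int expected)
	{
		return atomic_load(&refcount) == expected;
	}

	int main(void)
	{
		printf("check: %d\n", ref_check(3));	/* 1: count is exactly 3 */
		printf("freeze: %d\n", ref_freeze(3));	/* 1: count now 0, frozen */
		printf("check after freeze: %d\n", ref_check(3)); /* 0 */
		return 0;
	}

The commit message's point is that the frozen state (count 0) is what
other page-cache users could block on long-term; a passing check leaves
the count untouched, so only the page lock gates new users.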
@@ -2157,8 +2154,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                goto rollback;

        /*
-        * Replacing old pages with new one has succeeded, now we
-        * attempt to copy the contents.
+        * The old pages are locked, so they won't change anymore.
         */
        index = start;
        list_for_each_entry(page, &pagelist, lru) {
@@ -2247,11 +2243,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                /* nr_none is always 0 for non-shmem. */
                __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
        }
-       /* Join all the small entries into a single multi-index entry. */
-       xas_set_order(&xas, start, HPAGE_PMD_ORDER);
-       xas_store(&xas, hpage);
-       xas_unlock_irq(&xas);

+       /*
+        * Mark hpage as uptodate before inserting it into the page cache so
+        * that it isn't mistaken for a fallocated but unwritten page.
+        */
        folio = page_folio(hpage);
        folio_mark_uptodate(folio);
        folio_ref_add(folio, HPAGE_PMD_NR - 1);
@@ -2260,6 +2256,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
        folio_mark_dirty(folio);
        folio_add_lru(folio);

+       /* Join all the small entries into a single multi-index entry. */
+       xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+       xas_store(&xas, hpage);
+       xas_unlock_irq(&xas);
+
        /*
         * Remove pte page tables, so we can re-fault the page as huge.
         */
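The two hunks above are the heart of the fix: hpage is fully copied and
marked uptodate before the xas_store() that makes it visible in the page
cache. Conceptually this is the classic initialize-then-publish pattern.
A minimal C11 sketch of that pattern (an analogy only; the kernel code
relies on the page cache's own locking rather than these exact atomics):

	#include <stdatomic.h>
	#include <stddef.h>

	struct page_like {
		_Atomic int uptodate;
		char data[64];
	};

	/* Stands in for the page cache slot that xas_store() fills. */
	static _Atomic(struct page_like *) slot;

	void publish(struct page_like *p)
	{
		/* Initialize fully first, like folio_mark_uptodate()... */
		atomic_store_explicit(&p->uptodate, 1, memory_order_relaxed);
		/* ...then publish; release orders the flag before the pointer. */
		atomic_store_explicit(&slot, p, memory_order_release);
	}

	int reader_sees_data(void)
	{
		struct page_like *p =
			atomic_load_explicit(&slot, memory_order_acquire);
		/*
		 * Like folio_seek_hole_data(): a published entry without
		 * uptodate set would look like a hole. With publish() above,
		 * a reader can never observe that state.
		 */
		return p && atomic_load_explicit(&p->uptodate, memory_order_relaxed);
	}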
@@ -2273,47 +2274,29 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
        list_for_each_entry_safe(page, tmp, &pagelist, lru) {
                list_del(&page->lru);
                page->mapping = NULL;
-               page_ref_unfreeze(page, 1);
                ClearPageActive(page);
                ClearPageUnevictable(page);
                unlock_page(page);
-               put_page(page);
+               folio_put_refs(page_folio(page), 3);
        }

        goto out;

 rollback:
        /* Something went wrong: roll back page cache changes */
-       xas_lock_irq(&xas);
        if (nr_none) {
+               xas_lock_irq(&xas);
                mapping->nrpages -= nr_none;
                shmem_uncharge(mapping->host, nr_none);
+               xas_unlock_irq(&xas);
        }

-       xas_set(&xas, start);
-       end = index;
-       for (index = start; index < end; index++) {
-               xas_next(&xas);
-               page = list_first_entry_or_null(&pagelist,
-                               struct page, lru);
-               if (!page || xas.xa_index < page->index) {
-                       nr_none--;
-                       continue;
-               }
-
-               VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
-
-               /* Unfreeze the page. */
+       list_for_each_entry_safe(page, tmp, &pagelist, lru) {
                list_del(&page->lru);
-               page_ref_unfreeze(page, 2);
-               xas_store(&xas, page);
-               xas_pause(&xas);
-               xas_unlock_irq(&xas);
                unlock_page(page);
                putback_lru_page(page);
-               xas_lock_irq(&xas);
+               put_page(page);
        }
-       VM_BUG_ON(nr_none);
        /*
         * Undo the updates of filemap_nr_thps_inc for non-SHMEM
         * file only. This undo is not needed unless failure is
@@ -2328,8 +2311,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                smp_mb();
        }

-       xas_unlock_irq(&xas);
-
        hpage->mapping = NULL;

        unlock_page(hpage);