Commit f9ce0be

kiryl authored and willdeacon committed
mm: Cleanup faultaround and finish_fault() codepaths
alloc_set_pte() has two users with different requirements: in the faultaround code, it is called from an atomic context and the PTE page table has to be preallocated. finish_fault() can sleep and allocate the page table as needed.

The PTL locking rules are also strange, hard to follow and overkill for finish_fault().

Let's untangle the mess. alloc_set_pte() is gone now. All locking is explicit.

The price is some code duplication to handle huge pages in the faultaround path, but it should be fine, given the overall improvement in readability.

Link: https://lore.kernel.org/r/20201229132819.najtavneutnf7ajp@box
Signed-off-by: Kirill A. Shutemov <[email protected]>
[will: s/from from/from/ in comment; spotted by willy]
Signed-off-by: Will Deacon <[email protected]>
1 parent 19c329f commit f9ce0be

5 files changed: +213 -192 lines changed
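For orientation before the per-file diffs, a minimal sketch (not part of the patch) of the pattern the commit message describes: the caller takes the PTE lock explicitly and do_set_pte() only installs the entry. Here `vma` and `page` are assumed to be in scope; huge-page handling and error paths are omitted.

	/*
	 * Sketch only: explicit PTE locking around do_set_pte(), as done by the
	 * new filemap_map_pages() below.  'vma' and 'page' assumed in scope.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (pte_none(*vmf->pte)) {
		do_set_pte(vmf, page);				/* install the PTE */
		/* no invalidate needed: a not-present entry is never cached */
		update_mmu_cache(vma, vmf->address, vmf->pte);
	}
	pte_unmap_unlock(vmf->pte, vmf->ptl);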

fs/xfs/xfs_file.c

Lines changed: 4 additions & 2 deletions
@@ -1319,17 +1319,19 @@ xfs_filemap_pfn_mkwrite(
 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
-static void
+static vm_fault_t
 xfs_filemap_map_pages(
 	struct vm_fault *vmf,
 	pgoff_t start_pgoff,
 	pgoff_t end_pgoff)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
+	vm_fault_t ret;
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	return ret;
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {

include/linux/mm.h

Lines changed: 7 additions & 5 deletions
@@ -542,8 +542,8 @@ struct vm_fault {
 					 * is not NULL, otherwise pmd.
 					 */
 	pgtable_t prealloc_pte;		/* Pre-allocated pte page table.
-					 * vm_ops->map_pages() calls
-					 * alloc_set_pte() from atomic context.
+					 * vm_ops->map_pages() sets up a page
+					 * table from atomic context.
 					 * do_fault_around() pre-allocates
 					 * page table to avoid allocation from
 					 * atomic context.
@@ -578,7 +578,7 @@ struct vm_operations_struct {
 	vm_fault_t (*fault)(struct vm_fault *vmf);
 	vm_fault_t (*huge_fault)(struct vm_fault *vmf,
 			enum page_entry_size pe_size);
-	void (*map_pages)(struct vm_fault *vmf,
+	vm_fault_t (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 	unsigned long (*pagesize)(struct vm_area_struct * area);
 
@@ -988,7 +988,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+void do_set_pte(struct vm_fault *vmf, struct page *page);
+
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
@@ -2622,7 +2624,7 @@ extern void truncate_inode_pages_final(struct address_space *);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern vm_fault_t filemap_fault(struct vm_fault *vmf);
-extern void filemap_map_pages(struct vm_fault *vmf,
+extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
 
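With the prototype change above, filemap_map_pages() can be plugged into a vm_operations_struct directly; filesystems that need locking around it (like xfs above) wrap it and forward the return value. A hypothetical ops table for the direct case (the name example_file_vm_ops is made up):

/* Hypothetical example: generic helpers wired up directly; no wrapper is
 * needed because filemap_map_pages() now matches the map_pages prototype. */
static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};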

include/linux/pgtable.h

Lines changed: 11 additions & 0 deletions
@@ -1314,6 +1314,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 #endif
 }
 
+/*
+ * the ordering of these checks is important for pmds with _page_devmap set.
+ * if we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
 #ifndef CONFIG_NUMA_BALANCING
 /*
  * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
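A rough sketch of the call-site pattern this helper enables (mirroring the new filemap_map_pmd() further down; the early-return value is illustrative only):

	/*
	 * Handle PMD-level cases first, then ask the helper whether it is safe
	 * to walk down to the PTE level.  Testing pmd_trans_unstable() before
	 * pmd_devmap() would trip pmd_clear_bad() and spam dmesg on devmap PMDs.
	 */
	if (pmd_devmap_trans_unstable(vmf->pmd))
		return true;	/* map nothing here; let the fault be retried */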

mm/filemap.c

Lines changed: 134 additions & 43 deletions
@@ -42,6 +42,7 @@
 #include <linux/psi.h>
 #include <linux/ramfs.h>
 #include <linux/page_idle.h>
+#include <asm/pgalloc.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -2911,74 +2912,164 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
-void filemap_map_pages(struct vm_fault *vmf,
-		pgoff_t start_pgoff, pgoff_t end_pgoff)
+static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
 {
-	struct file *file = vmf->vma->vm_file;
+	struct mm_struct *mm = vmf->vma->vm_mm;
+
+	/* Huge page is mapped? No need to proceed. */
+	if (pmd_trans_huge(*vmf->pmd)) {
+		unlock_page(page);
+		put_page(page);
+		return true;
+	}
+
+	if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
+		vm_fault_t ret = do_set_pmd(vmf, page);
+		if (!ret) {
+			/* The page is mapped successfully, reference consumed. */
+			unlock_page(page);
+			return true;
+		}
+	}
+
+	if (pmd_none(*vmf->pmd)) {
+		vmf->ptl = pmd_lock(mm, vmf->pmd);
+		if (likely(pmd_none(*vmf->pmd))) {
+			mm_inc_nr_ptes(mm);
+			pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
+			vmf->prealloc_pte = NULL;
+		}
+		spin_unlock(vmf->ptl);
+	}
+
+	/* See comment in handle_pte_fault() */
+	if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		unlock_page(page);
+		put_page(page);
+		return true;
+	}
+
+	return false;
+}
+
+static struct page *next_uptodate_page(struct page *page,
+				       struct address_space *mapping,
+				       struct xa_state *xas, pgoff_t end_pgoff)
+{
+	unsigned long max_idx;
+
+	do {
+		if (!page)
+			return NULL;
+		if (xas_retry(xas, page))
+			continue;
+		if (xa_is_value(page))
+			continue;
+		if (PageLocked(page))
+			continue;
+		if (!page_cache_get_speculative(page))
+			continue;
+		/* Has the page moved or been split? */
+		if (unlikely(page != xas_reload(xas)))
+			goto skip;
+		if (!PageUptodate(page) || PageReadahead(page))
+			goto skip;
+		if (PageHWPoison(page))
+			goto skip;
+		if (!trylock_page(page))
+			goto skip;
+		if (page->mapping != mapping)
+			goto unlock;
+		if (!PageUptodate(page))
+			goto unlock;
+		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+		if (xas->xa_index >= max_idx)
+			goto unlock;
+		return page;
+unlock:
+		unlock_page(page);
+skip:
+		put_page(page);
+	} while ((page = xas_next_entry(xas, end_pgoff)) != NULL);
+
+	return NULL;
+}
+
+static inline struct page *first_map_page(struct address_space *mapping,
+					  struct xa_state *xas,
+					  pgoff_t end_pgoff)
+{
+	return next_uptodate_page(xas_find(xas, end_pgoff),
+				  mapping, xas, end_pgoff);
+}
+
+static inline struct page *next_map_page(struct address_space *mapping,
+					 struct xa_state *xas,
+					 pgoff_t end_pgoff)
+{
+	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
+				  mapping, xas, end_pgoff);
+}
+
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
+			     pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
-	unsigned long max_idx;
+	unsigned long address = vmf->address;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
 	struct page *head, *page;
 	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+	vm_fault_t ret = 0;
 
 	rcu_read_lock();
-	xas_for_each(&xas, head, end_pgoff) {
-		if (xas_retry(&xas, head))
-			continue;
-		if (xa_is_value(head))
-			goto next;
+	head = first_map_page(mapping, &xas, end_pgoff);
+	if (!head)
+		goto out;
 
-		/*
-		 * Check for a locked page first, as a speculative
-		 * reference may adversely influence page migration.
-		 */
-		if (PageLocked(head))
-			goto next;
-		if (!page_cache_get_speculative(head))
-			goto next;
+	if (filemap_map_pmd(vmf, head)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
 
-		/* Has the page moved or been split? */
-		if (unlikely(head != xas_reload(&xas)))
-			goto skip;
+	vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
+	do {
 		page = find_subpage(head, xas.xa_index);
-
-		if (!PageUptodate(head) ||
-				PageReadahead(page) ||
-				PageHWPoison(page))
-			goto skip;
-		if (!trylock_page(head))
-			goto skip;
-
-		if (head->mapping != mapping || !PageUptodate(head))
-			goto unlock;
-
-		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
-		if (xas.xa_index >= max_idx)
+		if (PageHWPoison(page))
 			goto unlock;
 
 		if (mmap_miss > 0)
 			mmap_miss--;
 
 		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
-		if (vmf->pte)
-			vmf->pte += xas.xa_index - last_pgoff;
+		vmf->pte += xas.xa_index - last_pgoff;
 		last_pgoff = xas.xa_index;
-		if (alloc_set_pte(vmf, page))
+
+		if (!pte_none(*vmf->pte))
 			goto unlock;
+
+		do_set_pte(vmf, page);
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, vmf->address, vmf->pte);
 		unlock_page(head);
-		goto next;
+
+		/* The fault is handled */
+		if (vmf->address == address)
+			ret = VM_FAULT_NOPAGE;
+		continue;
unlock:
 		unlock_page(head);
-skip:
 		put_page(head);
-next:
-		/* Huge page is mapped? No need to proceed. */
-		if (pmd_trans_huge(*vmf->pmd))
-			break;
-	}
+	} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
 	rcu_read_unlock();
+	vmf->address = address;
 	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+	return ret;
 }
 EXPORT_SYMBOL(filemap_map_pages);
 
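The fifth file in this commit, mm/memory.c, is not shown in this excerpt; it holds the finish_fault()/do_fault_around() side of the contract. As a hedged sketch only (details of the real code may differ), the faultaround caller is expected to preallocate vmf->prealloc_pte before invoking ->map_pages(), along these lines:

	/*
	 * Rough sketch of the caller's side of the contract (not shown in this
	 * excerpt): ->map_pages() runs atomically, so the PTE page table must
	 * be preallocated here; filemap_map_pmd() above only populates the PMD
	 * from vmf->prealloc_pte.  'start_pgoff'/'end_pgoff' assumed in scope.
	 */
	if (pmd_none(*vmf->pmd)) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
	}
	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);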
