
Commit 2bb6d28

djbw authored and torvalds committed
mm: introduce get_user_pages_longterm
Patch series "introduce get_user_pages_longterm()", v2.

Here is a new get_user_pages API for cases where a driver intends to keep an elevated page count indefinitely. This is distinct from usages like iov_iter_get_pages, where the elevated page counts are transient. The iov_iter_get_pages cases immediately turn around and submit the pages to a device driver, which will put_page when the I/O operation completes (under kernel control).

In the longterm case, userspace is responsible for dropping the page reference at some undefined point in the future. This is untenable for the filesystem-dax case, where the filesystem is in control of the lifetime of the block / page and needs reasonable limits on how long it can wait for pages in a mapping to become idle.

Fixing filesystems to actually wait for dax pages to be idle before blocks from a truncate/hole-punch operation are repurposed is saved for a later patch series. Also, allowing longterm registration of dax mappings is a future patch series that introduces a "map with lease" semantic, where the kernel can revoke a lease and force userspace to drop its page references.

I have also tagged these for -stable to purposely break cases that might assume that longterm memory registrations for filesystem-dax mappings were supported by the kernel. The behavior regression this policy change implies is one of the reasons we maintain the "dax enabled. Warning: EXPERIMENTAL, use at your own risk" notification when mounting a filesystem in dax mode.

It is worth noting that the device-dax interface does not suffer the same constraints, since it does not support file space management operations like hole-punch.

This patch (of 4):

Until there is a solution to the dma-to-dax vs. truncate problem, it is not safe to allow long-standing memory registrations against filesystem-dax vmas. Device-dax vmas do not have this problem and are explicitly allowed. This is temporary until a "memory registration with layout-lease" mechanism can be implemented for the affected sub-systems (RDMA and V4L2).

[[email protected]: use kcalloc()]
Link: http://lkml.kernel.org/r/151068939435.7446.13560129395419350737.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: 3565fce ("mm, x86: get_user_pages() for dax mappings")
Signed-off-by: Dan Williams <[email protected]>
Suggested-by: Christoph Hellwig <[email protected]>
Cc: Doug Ledford <[email protected]>
Cc: Hal Rosenstock <[email protected]>
Cc: Inki Dae <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: Jeff Moyer <[email protected]>
Cc: Joonyoung Shim <[email protected]>
Cc: Kyungmin Park <[email protected]>
Cc: Mauro Carvalho Chehab <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Ross Zwisler <[email protected]>
Cc: Sean Hefty <[email protected]>
Cc: Seung-Woo Kim <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
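As an illustration of the calling convention described above, here is a minimal, hypothetical driver-side sketch. The function name longterm_pin_example() and the FOLL_WRITE choice are assumptions for the example, not part of the patch; only get_user_pages_longterm(), its -EOPNOTSUPP return for filesystem-dax vmas, and the caller's obligation to put_page() later come from this series.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Hypothetical driver path that keeps user pages pinned indefinitely
 * (e.g. a memory registration), sketched against the API added here. */
static long longterm_pin_example(unsigned long uaddr, unsigned long nr_pages,
				 struct page ***pages_out)
{
	struct page **pages;
	long pinned;

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Unlike an iov_iter_get_pages() user, this caller will not drop
	 * the references when an I/O completes, so it must use the
	 * longterm variant; filesystem-dax vmas come back as -EOPNOTSUPP.
	 */
	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages_longterm(uaddr, nr_pages, FOLL_WRITE,
					 pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}

	*pages_out = pages;
	return pinned;	/* the driver put_page()s each page at teardown */
}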
1 parent 9702cff commit 2bb6d28

3 files changed: +91 −0 lines changed


include/linux/fs.h

Lines changed: 14 additions & 0 deletions
@@ -3194,6 +3194,20 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
 	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
 
+static inline bool vma_is_fsdax(struct vm_area_struct *vma)
+{
+	struct inode *inode;
+
+	if (!vma->vm_file)
+		return false;
+	if (!vma_is_dax(vma))
+		return false;
+	inode = file_inode(vma->vm_file);
+	if (inode->i_mode == S_IFCHR)
+		return false; /* device-dax */
+	return true;
+}
+
 static inline int iocb_flags(struct file *file)
 {
 	int res = 0;

include/linux/mm.h

Lines changed: 13 additions & 0 deletions
@@ -1380,6 +1380,19 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 		    unsigned int gup_flags, struct page **pages, int *locked);
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 		    struct page **pages, unsigned int gup_flags);
+#ifdef CONFIG_FS_DAX
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+		    unsigned int gup_flags, struct page **pages,
+		    struct vm_area_struct **vmas);
+#else
+static inline long get_user_pages_longterm(unsigned long start,
+		unsigned long nr_pages, unsigned int gup_flags,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+}
+#endif /* CONFIG_FS_DAX */
+
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 

mm/gup.c

Lines changed: 64 additions & 0 deletions
@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
 }
 EXPORT_SYMBOL(get_user_pages);
 
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+		unsigned int gup_flags, struct page **pages,
+		struct vm_area_struct **vmas_arg)
+{
+	struct vm_area_struct **vmas = vmas_arg;
+	struct vm_area_struct *vma_prev = NULL;
+	long rc, i;
+
+	if (!pages)
+		return -EINVAL;
+
+	if (!vmas) {
+		vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+			       GFP_KERNEL);
+		if (!vmas)
+			return -ENOMEM;
+	}
+
+	rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+	for (i = 0; i < rc; i++) {
+		struct vm_area_struct *vma = vmas[i];
+
+		if (vma == vma_prev)
+			continue;
+
+		vma_prev = vma;
+
+		if (vma_is_fsdax(vma))
+			break;
+	}
+
+	/*
+	 * Either get_user_pages() failed, or the vma validation
+	 * succeeded, in either case we don't need to put_page() before
+	 * returning.
+	 */
+	if (i >= rc)
+		goto out;
+
+	for (i = 0; i < rc; i++)
+		put_page(pages[i]);
+	rc = -EOPNOTSUPP;
+out:
+	if (vmas != vmas_arg)
+		kfree(vmas);
+	return rc;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
 /**
  * populate_vma_page_range() - populate a range of pages in the vma.
  * @vma: target vma
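And a matching hypothetical teardown sketch, assuming the same includes as the pinning example above: the whole point of the "longterm" variant is that the caller, not a completing I/O, eventually drops the references it took.

/* Hypothetical counterpart to longterm_pin_example() above. */
static void longterm_unpin_example(struct page **pages, long pinned)
{
	long i;

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);	/* drop the longterm references */
	kfree(pages);
}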
