
Commit 3565fce

djbw authored and torvalds committed
mm, x86: get_user_pages() for dax mappings
A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver has
established a devm_memremap_pages() mapping, i.e. when the pfn_t returned
from ->direct_access() has PFN_DEV and PFN_MAP set.  Later, when
encountering _PAGE_DEVMAP during a page table walk, we look up and pin a
struct dev_pagemap instance to keep the result of pfn_to_page() valid
until put_page().

Signed-off-by: Dan Williams <[email protected]>
Tested-by: Logan Gunthorpe <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 5c7fb56 commit 3565fce

File tree: 8 files changed (+212, -39 lines)

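
Before the per-file diffs, a minimal sketch of the pattern this commit establishes (illustrative only; pin_devmap_pte() is a hypothetical helper name, not code from the patch): a page-table walker that sees a devmap pte pins the dev_pagemap before dereferencing pfn_to_page(), takes a page reference, and then drops its temporary pgmap pin, because get_page() on a ZONE_DEVICE page now takes a pgmap reference of its own.

/*
 * Sketch (not from the patch): how a walker pins a dax page.
 * pin_devmap_pte() is a hypothetical name for illustration.
 */
static int pin_devmap_pte(pte_t pte, struct page **pagep)
{
	struct dev_pagemap *pgmap;

	if (!pte_devmap(pte))
		return -EINVAL;

	/* pin the pagemap so pfn_to_page() stays valid */
	pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
	if (!pgmap)
		return -EFAULT;	/* the mapping is being torn down */

	*pagep = pte_page(pte);
	get_page(*pagep);	/* also takes a pgmap reference */
	put_dev_pagemap(pgmap);	/* drop the walker's temporary pin */
	return 0;
}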

arch/x86/include/asm/pgtable.h

Lines changed: 7 additions & 0 deletions
@@ -479,6 +479,13 @@ static inline int pte_present(pte_t a)
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pte_devmap(pte_t a)
+{
+	return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
+}
+#endif
+
 #define pte_accessible pte_accessible
 static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
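
The new pte_devmap() test pairs with pte_mkdevmap() on the fault side. A hedged sketch of how a dax fault handler would construct such a pte, assuming the pfn_t_pte()/pfn_t_devmap()/pte_mkdevmap() helpers introduced by the companion patches in this series (not part of this diff):

/* Sketch, assuming companion-series helpers; not from this patch. */
static pte_t mk_dax_pte(struct vm_area_struct *vma, pfn_t pfn)
{
	pte_t entry = pfn_t_pte(pfn, vma->vm_page_prot);

	if (pfn_t_devmap(pfn))	/* ->direct_access() set PFN_DEV|PFN_MAP */
		entry = pte_mkdevmap(entry);
	return entry;
}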

arch/x86/mm/gup.c

Lines changed: 54 additions & 3 deletions
@@ -9,6 +9,7 @@
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/memremap.h>
 
 #include <asm/pgtable.h>
 
@@ -63,6 +64,16 @@ static inline pte_t gup_get_pte(pte_t *ptep)
 #endif
 }
 
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+	while ((*nr) - nr_start) {
+		struct page *page = pages[--(*nr)];
+
+		ClearPageReferenced(page);
+		put_page(page);
+	}
+}
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ static inline pte_t gup_get_pte(pte_t *ptep)
 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
+	struct dev_pagemap *pgmap = NULL;
 	unsigned long mask;
+	int nr_start = *nr;
 	pte_t *ptep;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 			return 0;
 		}
 
-		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+		page = pte_page(pte);
+		if (pte_devmap(pte)) {
+			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+			if (unlikely(!pgmap)) {
+				undo_dev_pagemap(nr, nr_start, pages);
+				pte_unmap(ptep);
+				return 0;
+			}
+		} else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
 			pte_unmap(ptep);
 			return 0;
 		}
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-		page = pte_page(pte);
 		get_page(page);
+		put_dev_pagemap(pgmap);
 		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 	SetPageReferenced(page);
 }
 
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, struct page **pages, int *nr)
+{
+	int nr_start = *nr;
+	unsigned long pfn = pmd_pfn(pmd);
+	struct dev_pagemap *pgmap = NULL;
+
+	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+	do {
+		struct page *page = pfn_to_page(pfn);
+
+		pgmap = get_dev_pagemap(pfn, pgmap);
+		if (unlikely(!pgmap)) {
+			undo_dev_pagemap(nr, nr_start, pages);
+			return 0;
+		}
+		SetPageReferenced(page);
+		pages[*nr] = page;
+		get_page(page);
+		put_dev_pagemap(pgmap);
+		(*nr)++;
+		pfn++;
+	} while (addr += PAGE_SIZE, addr != end);
+	return 1;
+}
+
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 		mask |= _PAGE_RW;
 	if ((pmd_flags(pmd) & mask) != mask)
 		return 0;
+
+	VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
+	if (pmd_devmap(pmd))
+		return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
+
 	/* hugepages are never "special" */
 	VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-	VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
 
 	refs = 0;
 	head = pmd_page(pmd);
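
With __gup_device_huge_pmd() in place, the x86 fast path can pin dax pages without taking mmap_sem; a failed get_dev_pagemap() lookup (the mapping racing with teardown) unwinds via undo_dev_pagemap() and returns 0 so the caller falls back to the slow path. A sketch of a typical caller (illustrative; pin_io_buffer() is a made-up name):

/* Sketch: a driver pinning a user buffer for I/O. After this commit
 * the same call succeeds when buf lies inside a dax mapping. */
static int pin_io_buffer(unsigned long buf, int n, struct page **pages)
{
	int i, nr;

	nr = get_user_pages_fast(buf, n, 1, pages);	/* 1 == write */
	if (nr <= 0)
		return nr;
	/* ... perform I/O against pages[0..nr-1] ... */
	for (i = 0; i < nr; i++)
		put_page(pages[i]);	/* also drops the pgmap pin */
	return nr;
}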

include/linux/huge_mm.h

Lines changed: 9 additions & 1 deletion
@@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			int prot_numa);
 int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
 			pfn_t pfn, bool write);
-
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -55,6 +54,9 @@ enum transparent_hugepage_flag {
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, int flags);
+
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
 #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
@@ -205,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
+
+static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t *pmd, int flags)
+{
+	return NULL;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
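
Only the declaration and the !CONFIG_TRANSPARENT_HUGEPAGE stub appear here; the body lands in mm/huge_memory.c, one of the two changed files not shown in this excerpt. A hedged sketch of what it has to do, consistent with the pte-level logic above (not the verbatim implementation):

/* Sketch of follow_devmap_pmd()'s job; the real body lives in
 * mm/huge_memory.c (not shown). Called with the pmd lock held. */
static struct page *follow_devmap_pmd_sketch(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmd, int flags)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct dev_pagemap *pgmap;
	struct page *page;

	if ((flags & FOLL_WRITE) && !pmd_write(*pmd))
		return NULL;
	/* device pages are only valid while a reference is held */
	if (!(flags & FOLL_GET))
		return NULL;

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return NULL;
	page = pfn_to_page(pfn);
	get_page(page);		/* pins the pgmap via get_zone_device_page() */
	put_dev_pagemap(pgmap);
	return page;
}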

include/linux/mm.h

Lines changed: 41 additions & 18 deletions
@@ -16,6 +16,7 @@
 #include <linux/mm_types.h>
 #include <linux/range.h>
 #include <linux/pfn.h>
+#include <linux/percpu-refcount.h>
 #include <linux/bit_spinlock.h>
 #include <linux/shrinker.h>
 #include <linux/resource.h>
@@ -465,17 +466,6 @@ static inline int page_count(struct page *page)
 	return atomic_read(&compound_head(page)->_count);
 }
 
-static inline void get_page(struct page *page)
-{
-	page = compound_head(page);
-	/*
-	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_count.
-	 */
-	VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
-	atomic_inc(&page->_count);
-}
-
 static inline struct page *virt_to_head_page(const void *x)
 {
 	struct page *page = virt_to_page(x);
@@ -494,13 +484,6 @@ static inline void init_page_count(struct page *page)
 
 void __put_page(struct page *page);
 
-static inline void put_page(struct page *page)
-{
-	page = compound_head(page);
-	if (put_page_testzero(page))
-		__put_page(page);
-}
-
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
@@ -682,17 +665,50 @@ static inline enum zone_type page_zonenum(const struct page *page)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
+void get_zone_device_page(struct page *page);
+void put_zone_device_page(struct page *page);
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return page_zonenum(page) == ZONE_DEVICE;
 }
 #else
+static inline void get_zone_device_page(struct page *page)
+{
+}
+static inline void put_zone_device_page(struct page *page)
+{
+}
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return false;
 }
 #endif
 
+static inline void get_page(struct page *page)
+{
+	page = compound_head(page);
+	/*
+	 * Getting a normal page or the head of a compound page
+	 * requires to already have an elevated page->_count.
+	 */
+	VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+	atomic_inc(&page->_count);
+
+	if (unlikely(is_zone_device_page(page)))
+		get_zone_device_page(page);
+}
+
+static inline void put_page(struct page *page)
+{
+	page = compound_head(page);
+
+	if (put_page_testzero(page))
+		__put_page(page);
+
+	if (unlikely(is_zone_device_page(page)))
+		put_zone_device_page(page);
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define SECTION_IN_PAGE_FLAGS
 #endif
@@ -1444,6 +1460,13 @@ static inline void sync_mm_rss(struct mm_struct *mm)
 }
 #endif
 
+#ifndef __HAVE_ARCH_PTE_DEVMAP
+static inline int pte_devmap(pte_t pte)
+{
+	return 0;
+}
+#endif
+
 int vma_wants_writenotify(struct vm_area_struct *vma);
 
 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
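
get_page() and put_page() move below the ZONE_DEVICE helpers so they can call them. The resulting invariant: every elevated reference on a ZONE_DEVICE page also holds a dev_pagemap reference, so driver teardown cannot complete while pins remain; this is what lets the gup paths drop their temporary pgmap pin right after get_page(). A minimal illustration (hypothetical caller):

/* Sketch: the invariant the reworked helpers provide. */
static void use_device_page(struct page *page)
{
	get_page(page);		/* on ZONE_DEVICE: also pins page->pgmap */
	/* ... pfn_to_page() results stay valid here ... */
	put_page(page);		/* releases the pgmap pin as well */
}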

kernel/memremap.c

Lines changed: 12 additions & 0 deletions
@@ -169,6 +169,18 @@ struct page_map {
 	struct vmem_altmap altmap;
 };
 
+void get_zone_device_page(struct page *page)
+{
+	percpu_ref_get(page->pgmap->ref);
+}
+EXPORT_SYMBOL(get_zone_device_page);
+
+void put_zone_device_page(struct page *page)
+{
+	put_dev_pagemap(page->pgmap);
+}
+EXPORT_SYMBOL(put_zone_device_page);
+
 static void pgmap_radix_release(struct resource *res)
 {
 	resource_size_t key;
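
The asymmetry above is deliberate: get_zone_device_page() takes the percpu ref directly, while the put side routes through put_dev_pagemap(), which tolerates a NULL pgmap; that NULL check is what lets gup_pte_range() earlier call put_dev_pagemap(pgmap) unconditionally even when no devmap pte was seen. For context, the helper as assumed from include/linux/memremap.h in the companion patches (not part of this diff):

/* Assumed from include/linux/memremap.h (companion patch), shown
 * for context only: */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{
	if (pgmap)
		percpu_ref_put(pgmap->ref);
}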

mm/gup.c

Lines changed: 28 additions & 2 deletions
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 
 #include <linux/mm.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, unsigned int flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap = NULL;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t *ptep, pte;
@@ -98,7 +100,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	}
 
 	page = vm_normal_page(vma, address, pte);
-	if (unlikely(!page)) {
+	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+		/*
+		 * Only return device mapping pages in the FOLL_GET case since
+		 * they are only valid while holding the pgmap reference.
+		 */
+		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+		if (pgmap)
+			page = pte_page(pte);
+		else
+			goto no_page;
+	} else if (unlikely(!page)) {
 		if (flags & FOLL_DUMP) {
 			/* Avoid special (like zero) pages in core dumps */
 			page = ERR_PTR(-EFAULT);
@@ -129,8 +141,15 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		goto retry;
 	}
 
-	if (flags & FOLL_GET)
+	if (flags & FOLL_GET) {
 		get_page(page);
+
+		/* drop the pgmap reference now that we hold the page */
+		if (pgmap) {
+			put_dev_pagemap(pgmap);
+			pgmap = NULL;
+		}
+	}
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
@@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	}
 	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 		return no_page_table(vma, flags);
+	if (pmd_devmap(*pmd)) {
+		ptl = pmd_lock(mm, pmd);
+		page = follow_devmap_pmd(vma, address, pmd, flags);
+		spin_unlock(ptl);
+		if (page)
+			return page;
+	}
 	if (likely(!pmd_trans_huge(*pmd)))
 		return follow_page_pte(vma, address, pmd, flags);
