
Commit b756a3b

apopple-nvidia authored and torvalds committed
mm: device exclusive memory access
Some devices require exclusive write access to shared virtual memory (SVM) ranges to perform atomic operations on that memory. This requires CPU page tables to be updated to deny access whilst atomic operations are occurring.

In order to do this introduce a new swap entry type (SWP_DEVICE_EXCLUSIVE). When an SVM range needs to be marked for exclusive access by a device all page table mappings for the particular range are replaced with device exclusive swap entries. This causes any CPU access to the page to result in a fault.

Faults are resolved by replacing the faulting entry with the original mapping. This results in MMU notifiers being called, which a driver uses to update access permissions such as revoking atomic access. After notifiers have been called the device will no longer have exclusive access to the region.

Walking of the page tables to find the target pages is handled by get_user_pages() rather than a direct page table walk. A direct page table walk similar to what migrate_vma_collect()/unmap() does could also have been utilised. However this resulted in more code similar in functionality to what get_user_pages() provides, as page faulting is required to make the PTEs present and to break COW.

[[email protected]: fix signedness bug in make_device_exclusive_range()]
  Link: https://lkml.kernel.org/r/YNIz5NVnZ5GiZ3u1@mwanda
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Alistair Popple <[email protected]>
Signed-off-by: Dan Carpenter <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Cc: Ben Skeggs <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: "Matthew Wilcox (Oracle)" <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: Shakeel Butt <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 9a5cc85 commit b756a3b
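As an editorial aside (not part of the commit), here is a minimal sketch of how a driver might use the new interface, based only on the make_device_exclusive_range() declaration added to include/linux/rmap.h below. my_device_map_atomic() is a hypothetical stand-in for device-specific programming and error handling is simplified; the mmap lock is held because the walk is built on get_user_pages().

```c
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>

/* Hypothetical helper: program the device's atomic mapping for @page. */
static void my_device_map_atomic(struct page *page, unsigned long addr);

static int my_driver_make_atomic(struct mm_struct *mm, unsigned long addr,
                                 void *owner)
{
        unsigned long start = addr & PAGE_MASK;
        struct page *page = NULL;
        int ret;

        /* The walk uses get_user_pages(), so hold the mmap lock. */
        mmap_read_lock(mm);
        ret = make_device_exclusive_range(mm, start, start + PAGE_SIZE,
                                          &page, owner);
        mmap_read_unlock(mm);
        if (ret <= 0 || !page)
                return -EBUSY;

        /* CPU mappings are now device exclusive swap entries. */
        my_device_map_atomic(page, start);

        /*
         * Dropping the page lock and reference ends the exclusivity
         * guarantee: a later CPU fault may restore the original mapping,
         * which the driver sees as an MMU_NOTIFY_EXCLUSIVE invalidation.
         */
        unlock_page(page);
        put_page(page);
        return 0;
}
```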

File tree

10 files changed: +405, -10 lines changed


Documentation/vm/hmm.rst

Lines changed: 17 additions & 0 deletions
@@ -405,6 +405,23 @@ between device driver specific code and shared common code:
 
 The lock can now be released.
 
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to implement
+atomic access to system memory. To support atomic operations to a shared virtual
+memory page such a device needs access to that page which is exclusive of any
+userspace access from the CPU. The ``make_device_exclusive_range()`` function
+can be used to make a memory range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resolved by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page. Exclusive access is
+guaranteed to last until the driver drops the page lock and page reference, at
+which point any CPU faults on the page may proceed as described.
+
 Memory cgroup (memcg) and rss accounting
 ========================================
 
include/linux/mmu_notifier.h

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,11 @@ struct mmu_interval_notifier;
  * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
  * a device driver to possibly ignore the invalidation if the
  * owner field matches the driver's device private pgmap owner.
+ *
+ * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
+ * longer have exclusive access to the page. When sent during creation of an
+ * exclusive range the owner will be initialised to the value provided by the
+ * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
  */
 enum mmu_notifier_event {
         MMU_NOTIFY_UNMAP = 0,
@@ -51,6 +56,7 @@ enum mmu_notifier_event {
         MMU_NOTIFY_SOFT_DIRTY,
         MMU_NOTIFY_RELEASE,
         MMU_NOTIFY_MIGRATE,
+        MMU_NOTIFY_EXCLUSIVE,
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
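For illustration, a hedged sketch of how a driver's interval notifier callback might react to the new event. struct my_device, my_device_revoke_atomic() and the owner-matching policy are assumptions for the example (blockable handling is omitted); this is not code from this series.

```c
#include <linux/mmu_notifier.h>

struct my_device {                              /* hypothetical driver state */
        struct mmu_interval_notifier notifier;
};

/* Hypothetical helper: tear down the device's atomic mapping of a range. */
static void my_device_revoke_atomic(struct my_device *mydev,
                                    unsigned long start, unsigned long end);

static bool my_invalidate(struct mmu_interval_notifier *mni,
                          const struct mmu_notifier_range *range,
                          unsigned long cur_seq)
{
        struct my_device *mydev = container_of(mni, struct my_device, notifier);

        /*
         * The invalidation raised while make_device_exclusive_range() is
         * creating entries on this device's behalf carries the owner value
         * it passed in and can be ignored.
         */
        if (range->event == MMU_NOTIFY_EXCLUSIVE && range->owner == mydev)
                return true;

        /*
         * Anything else means exclusive access is going away (for example a
         * CPU fault restoring the original pte), so revoke atomic access
         * before the invalidation completes.
         */
        mmu_interval_set_seq(mni, cur_seq);
        my_device_revoke_atomic(mydev, range->start, range->end);
        return true;
}
```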

include/linux/rmap.h

Lines changed: 4 additions & 0 deletions
@@ -194,6 +194,10 @@ int page_referenced(struct page *, int is_locked,
 void try_to_migrate(struct page *page, enum ttu_flags flags);
 void try_to_unmap(struct page *, enum ttu_flags flags);
 
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+                                unsigned long end, struct page **pages,
+                                void *arg);
+
 /* Avoid racy checks */
 #define PVMW_SYNC (1 << 0)
 /* Look for migration entries rather than present PTEs */

include/linux/swap.h

Lines changed: 7 additions & 2 deletions
@@ -62,12 +62,17 @@ static inline int current_is_kswapd(void)
  * migrate part of a process memory to device memory.
  *
  * When a page is migrated from CPU to device, we set the CPU page table entry
- * to a special SWP_DEVICE_* entry.
+ * to a special SWP_DEVICE_{READ|WRITE} entry.
+ *
+ * When a page is mapped by the device for exclusive access we set the CPU page
+ * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
  */
 #ifdef CONFIG_DEVICE_PRIVATE
-#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_NUM 4
 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
 #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
+#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
 #else
 #define SWP_DEVICE_NUM 0
 #endif

include/linux/swapops.h

Lines changed: 43 additions & 1 deletion
@@ -127,6 +127,27 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
 {
         return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
 }
+
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+        return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
+}
+
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+        return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+        return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
+                swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
+}
+
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+        return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
+}
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
 {
@@ -147,6 +168,26 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
 {
         return false;
 }
+
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+        return swp_entry(0, 0);
+}
+
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+        return swp_entry(0, 0);
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+        return false;
+}
+
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+        return false;
+}
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
@@ -226,7 +267,8 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
  */
 static inline bool is_pfn_swap_entry(swp_entry_t entry)
 {
-        return is_migration_entry(entry) || is_device_private_entry(entry);
+        return is_migration_entry(entry) || is_device_private_entry(entry) ||
+                is_device_exclusive_entry(entry);
 }
 
 struct page_vma_mapped_walk;
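A purely illustrative sketch of how these helpers compose for a single pte, assuming the page frame number is used as the swap offset as the pfn-swap-entry helpers expect; the real conversion happens inside the rmap walk that backs make_device_exclusive_range(), so this only demonstrates the entry arithmetic.

```c
#include <linux/mm.h>
#include <linux/swapops.h>

static pte_t example_exclusive_pte(struct page *page, pte_t orig_pte)
{
        swp_entry_t entry;

        /* The offset of a pfn swap entry is the page frame number. */
        if (pte_write(orig_pte))
                entry = make_writable_device_exclusive_entry(page_to_pfn(page));
        else
                entry = make_readable_device_exclusive_entry(page_to_pfn(page));

        /* The page can be recovered from the entry later. */
        WARN_ON_ONCE(pfn_swap_entry_to_page(entry) != page);

        return swp_entry_to_pte(entry);
}
```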

mm/hmm.c

Lines changed: 5 additions & 0 deletions
@@ -26,6 +26,8 @@
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
+#include "internal.h"
+
 struct hmm_vma_walk {
         struct hmm_range *range;
         unsigned long last;
@@ -271,6 +273,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                 if (!non_swap_entry(entry))
                         goto fault;
 
+                if (is_device_exclusive_entry(entry))
+                        goto fault;
+
                 if (is_migration_entry(entry)) {
                         pte_unmap(ptep);
                         hmm_vma_walk->last = addr;

mm/memory.c

Lines changed: 123 additions & 4 deletions
@@ -699,6 +699,68 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 #endif
 
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+                                  struct page *page, unsigned long address,
+                                  pte_t *ptep)
+{
+        pte_t pte;
+        swp_entry_t entry;
+
+        pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+        if (pte_swp_soft_dirty(*ptep))
+                pte = pte_mksoft_dirty(pte);
+
+        entry = pte_to_swp_entry(*ptep);
+        if (pte_swp_uffd_wp(*ptep))
+                pte = pte_mkuffd_wp(pte);
+        else if (is_writable_device_exclusive_entry(entry))
+                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+        set_pte_at(vma->vm_mm, address, ptep, pte);
+
+        /*
+         * No need to take a page reference as one was already
+         * created when the swap entry was made.
+         */
+        if (PageAnon(page))
+                page_add_anon_rmap(page, vma, address, false);
+        else
+                /*
+                 * Currently device exclusive access only supports anonymous
+                 * memory so the entry shouldn't point to a filebacked page.
+                 */
+                WARN_ON_ONCE(!PageAnon(page));
+
+        if (vma->vm_flags & VM_LOCKED)
+                mlock_vma_page(page);
+
+        /*
+         * No need to invalidate - it was non-present before. However
+         * secondary CPUs may have mappings that need invalidating.
+         */
+        update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+                          unsigned long addr)
+{
+        swp_entry_t entry = pte_to_swp_entry(*src_pte);
+        struct page *page = pfn_swap_entry_to_page(entry);
+
+        if (trylock_page(page)) {
+                restore_exclusive_pte(vma, page, addr, src_pte);
+                unlock_page(page);
+                return 0;
+        }
+
+        return -EBUSY;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -780,6 +842,17 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                 pte = pte_swp_mkuffd_wp(pte);
                         set_pte_at(src_mm, addr, src_pte, pte);
                 }
+        } else if (is_device_exclusive_entry(entry)) {
+                /*
+                 * Make device exclusive entries present by restoring the
+                 * original entry then copying as for a present pte. Device
+                 * exclusive entries currently only support private writable
+                 * (ie. COW) mappings.
+                 */
+                VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+                if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+                        return -EBUSY;
+                return -ENOENT;
         }
         if (!userfaultfd_wp(dst_vma))
                 pte = pte_swp_clear_uffd_wp(pte);
@@ -980,9 +1053,18 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                         if (ret == -EIO) {
                                 entry = pte_to_swp_entry(*src_pte);
                                 break;
+                        } else if (ret == -EBUSY) {
+                                break;
+                        } else if (!ret) {
+                                progress += 8;
+                                continue;
                         }
-                        progress += 8;
-                        continue;
+
+                        /*
+                         * Device exclusive entry restored, continue by copying
+                         * the now present pte.
+                         */
+                        WARN_ON_ONCE(ret != -ENOENT);
                 }
                 /* copy_present_pte() will clear `*prealloc' if consumed */
                 ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -1020,6 +1102,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                                 goto out;
                         }
                         entry.val = 0;
+                } else if (ret == -EBUSY) {
+                        goto out;
                 } else if (ret == -EAGAIN) {
                         prealloc = page_copy_prealloc(src_mm, src_vma, addr);
                         if (!prealloc)
@@ -1287,7 +1371,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                 }
 
                 entry = pte_to_swp_entry(ptent);
-                if (is_device_private_entry(entry)) {
+                if (is_device_private_entry(entry) ||
+                    is_device_exclusive_entry(entry)) {
                         struct page *page = pfn_swap_entry_to_page(entry);
 
                         if (unlikely(details && details->check_mapping)) {
@@ -1303,7 +1388,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 
                         pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                         rss[mm_counter(page)]--;
-                        page_remove_rmap(page, false);
+
+                        if (is_device_private_entry(entry))
+                                page_remove_rmap(page, false);
+
                         put_page(page);
                         continue;
                 }
@@ -3351,6 +3439,34 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
+/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+        struct page *page = vmf->page;
+        struct vm_area_struct *vma = vmf->vma;
+        struct mmu_notifier_range range;
+
+        if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+                return VM_FAULT_RETRY;
+        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+                                vma->vm_mm, vmf->address & PAGE_MASK,
+                                (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+        mmu_notifier_invalidate_range_start(&range);
+
+        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                                &vmf->ptl);
+        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+                restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+        pte_unmap_unlock(vmf->pte, vmf->ptl);
+        unlock_page(page);
+
+        mmu_notifier_invalidate_range_end(&range);
+        return 0;
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3379,6 +3495,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 if (is_migration_entry(entry)) {
                         migration_entry_wait(vma->vm_mm, vmf->pmd,
                                              vmf->address);
+                } else if (is_device_exclusive_entry(entry)) {
+                        vmf->page = pfn_swap_entry_to_page(entry);
+                        ret = remove_device_exclusive_entry(vmf);
                 } else if (is_device_private_entry(entry)) {
                         vmf->page = pfn_swap_entry_to_page(entry);
                         ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);

mm/mprotect.c

Lines changed: 8 additions & 0 deletions
@@ -165,6 +165,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                 newpte = swp_entry_to_pte(entry);
                                 if (pte_swp_uffd_wp(oldpte))
                                         newpte = pte_swp_mkuffd_wp(newpte);
+                        } else if (is_writable_device_exclusive_entry(entry)) {
+                                entry = make_readable_device_exclusive_entry(
+                                                        swp_offset(entry));
+                                newpte = swp_entry_to_pte(entry);
+                                if (pte_swp_soft_dirty(oldpte))
+                                        newpte = pte_swp_mksoft_dirty(newpte);
+                                if (pte_swp_uffd_wp(oldpte))
+                                        newpte = pte_swp_mkuffd_wp(newpte);
                         } else {
                                 newpte = oldpte;
                         }

mm/page_vma_mapped.c

Lines changed: 6 additions & 3 deletions
@@ -41,7 +41,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
 
                 /* Handle un-addressable ZONE_DEVICE memory */
                 entry = pte_to_swp_entry(*pvmw->pte);
-                if (!is_device_private_entry(entry))
+                if (!is_device_private_entry(entry) &&
+                    !is_device_exclusive_entry(entry))
                         return false;
         } else if (!pte_present(*pvmw->pte))
                 return false;
@@ -93,7 +94,8 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
                         return false;
                 entry = pte_to_swp_entry(*pvmw->pte);
 
-                if (!is_migration_entry(entry))
+                if (!is_migration_entry(entry) &&
+                    !is_device_exclusive_entry(entry))
                         return false;
 
                 pfn = swp_offset(entry);
@@ -102,7 +104,8 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
 
                 /* Handle un-addressable ZONE_DEVICE memory */
                 entry = pte_to_swp_entry(*pvmw->pte);
-                if (!is_device_private_entry(entry))
+                if (!is_device_private_entry(entry) &&
+                    !is_device_exclusive_entry(entry))
                         return false;
 
                 pfn = swp_offset(entry);
