Skip to content

Commit fc71884

Browse files
CmdrMoozy authored and akpm00 committed
mm: userfaultfd: add new UFFDIO_POISON ioctl
The basic idea here is to "simulate" memory poisoning for VMs. A VM running on some host might encounter a memory error, after which some page(s) are poisoned (i.e., future accesses SIGBUS). They expect that once poisoned, pages can never become "un-poisoned". So, when we live migrate the VM, we need to preserve the poisoned status of these pages.

When live migrating, we try to get the guest running on its new host as quickly as possible. So, we start it running before all memory has been copied, and before we're certain which pages should be poisoned or not.

So the basic way to use this new feature is:

- On the new host, the guest's memory is registered with userfaultfd, in either MISSING or MINOR mode (doesn't really matter for this purpose).
- On any first access, we get a userfaultfd event. At this point we can communicate with the old host to find out if the page was poisoned.
- If so, we can respond with a UFFDIO_POISON - this places a swap marker so any future accesses will SIGBUS. Because the pte is now "present", future accesses won't generate more userfaultfd events, they'll just SIGBUS directly.

UFFDIO_POISON does not handle unmapping previously-present PTEs. This isn't needed, because during live migration we want to intercept all accesses with userfaultfd (not just writes, so WP mode isn't useful for this). So whether minor or missing mode is being used (or both), the PTE won't be present in any case, so handling that case isn't needed.

Similarly, UFFDIO_POISON won't replace existing PTE markers. This might be okay to do, but it seems to be safer to just refuse to overwrite any existing entry (like a UFFD_WP PTE marker).
Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Axel Rasmussen <[email protected]> Acked-by: Peter Xu <[email protected]> Cc: Al Viro <[email protected]> Cc: Brian Geffon <[email protected]> Cc: Christian Brauner <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Gaosheng Cui <[email protected]> Cc: Huang, Ying <[email protected]> Cc: Hugh Dickins <[email protected]> Cc: James Houghton <[email protected]> Cc: Jan Alexander Steffens (heftig) <[email protected]> Cc: Jiaqi Yan <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Kefeng Wang <[email protected]> Cc: Liam R. Howlett <[email protected]> Cc: Miaohe Lin <[email protected]> Cc: Mike Kravetz <[email protected]> Cc: Mike Rapoport (IBM) <[email protected]> Cc: Muchun Song <[email protected]> Cc: Nadav Amit <[email protected]> Cc: Naoya Horiguchi <[email protected]> Cc: Ryan Roberts <[email protected]> Cc: Shuah Khan <[email protected]> Cc: Suleiman Souhlal <[email protected]> Cc: Suren Baghdasaryan <[email protected]> Cc: T.J. Alumbaugh <[email protected]> Cc: Yu Zhao <[email protected]> Cc: ZhangPeng <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 435cdb4 commit fc71884

File tree

4 files changed

+125
-1
lines changed

4 files changed

+125
-1
lines changed

fs/userfaultfd.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1967,6 +1967,61 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
19671967
return ret;
19681968
}
19691969

1970+
static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
1971+
{
1972+
__s64 ret;
1973+
struct uffdio_poison uffdio_poison;
1974+
struct uffdio_poison __user *user_uffdio_poison;
1975+
struct userfaultfd_wake_range range;
1976+
1977+
user_uffdio_poison = (struct uffdio_poison __user *)arg;
1978+
1979+
ret = -EAGAIN;
1980+
if (atomic_read(&ctx->mmap_changing))
1981+
goto out;
1982+
1983+
ret = -EFAULT;
1984+
if (copy_from_user(&uffdio_poison, user_uffdio_poison,
1985+
/* don't copy the output fields */
1986+
sizeof(uffdio_poison) - (sizeof(__s64))))
1987+
goto out;
1988+
1989+
ret = validate_range(ctx->mm, uffdio_poison.range.start,
1990+
uffdio_poison.range.len);
1991+
if (ret)
1992+
goto out;
1993+
1994+
ret = -EINVAL;
1995+
if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
1996+
goto out;
1997+
1998+
if (mmget_not_zero(ctx->mm)) {
1999+
ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
2000+
uffdio_poison.range.len,
2001+
&ctx->mmap_changing, 0);
2002+
mmput(ctx->mm);
2003+
} else {
2004+
return -ESRCH;
2005+
}
2006+
2007+
if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
2008+
return -EFAULT;
2009+
if (ret < 0)
2010+
goto out;
2011+
2012+
/* len == 0 would wake all */
2013+
BUG_ON(!ret);
2014+
range.len = ret;
2015+
if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
2016+
range.start = uffdio_poison.range.start;
2017+
wake_userfault(ctx, &range);
2018+
}
2019+
ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
2020+
2021+
out:
2022+
return ret;
2023+
}
2024+
19702025
static inline unsigned int uffd_ctx_features(__u64 user_features)
19712026
{
19722027
/*
@@ -2068,6 +2123,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
20682123
case UFFDIO_CONTINUE:
20692124
ret = userfaultfd_continue(ctx, arg);
20702125
break;
2126+
case UFFDIO_POISON:
2127+
ret = userfaultfd_poison(ctx, arg);
2128+
break;
20712129
}
20722130
return ret;
20732131
}

include/linux/userfaultfd_k.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ enum mfill_atomic_mode {
4646
MFILL_ATOMIC_COPY,
4747
MFILL_ATOMIC_ZEROPAGE,
4848
MFILL_ATOMIC_CONTINUE,
49+
MFILL_ATOMIC_POISON,
4950
NR_MFILL_ATOMIC_MODES,
5051
};
5152

@@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
8384
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
8485
unsigned long len, atomic_t *mmap_changing,
8586
uffd_flags_t flags);
87+
/*
 * UFFDIO_POISON counterpart of the other mfill_atomic_*() entry points:
 * runs mfill_atomic() over [start, start + len) in MFILL_ATOMIC_POISON mode.
 */
extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
88+
unsigned long len, atomic_t *mmap_changing,
89+
uffd_flags_t flags);
8690
extern int mwriteprotect_range(struct mm_struct *dst_mm,
8791
unsigned long start, unsigned long len,
8892
bool enable_wp, atomic_t *mmap_changing);

include/uapi/linux/userfaultfd.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
#define _UFFDIO_ZEROPAGE (0x04)
7272
#define _UFFDIO_WRITEPROTECT (0x06)
7373
#define _UFFDIO_CONTINUE (0x07)
74+
#define _UFFDIO_POISON (0x08)
7475
#define _UFFDIO_API (0x3F)
7576

7677
/* userfaultfd ioctl ids */
@@ -91,6 +92,8 @@
9192
struct uffdio_writeprotect)
9293
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
9394
struct uffdio_continue)
95+
#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \
96+
struct uffdio_poison)
9497

9598
/* read() structure */
9699
struct uffd_msg {
@@ -225,6 +228,7 @@ struct uffdio_api {
225228
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
226229
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
227230
#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
231+
#define UFFD_FEATURE_POISON (1<<14)
228232
__u64 features;
229233

230234
__u64 ioctls;
@@ -321,6 +325,18 @@ struct uffdio_continue {
321325
__s64 mapped;
322326
};
323327

328+
/* Argument structure for the UFFDIO_POISON ioctl. */
struct uffdio_poison {
329+
/* Address range the ioctl acts on. */
struct uffdio_range range;
330+
/* When set in 'mode', the ioctl does not wake waiters on the range. */
#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0)
331+
/* Bitmask of mode flags; bits other than DONTWAKE are rejected with -EINVAL. */
__u64 mode;
332+
333+
/*
334+
* Fields below here are written by the ioctl and must be at the end:
335+
* the copy_from_user will not read past here.
336+
*/
337+
/* Result reported back by the ioctl (value returned by mfill_atomic_poison()). */
__s64 updated;
338+
};
339+
324340
/*
325341
* Flags for the userfaultfd(2) system call itself.
326342
*/

mm/userfaultfd.c

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,40 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
288288
goto out;
289289
}
290290

291+
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
292+
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
293+
struct vm_area_struct *dst_vma,
294+
unsigned long dst_addr,
295+
uffd_flags_t flags)
296+
{
297+
int ret;
298+
struct mm_struct *dst_mm = dst_vma->vm_mm;
299+
pte_t _dst_pte, *dst_pte;
300+
spinlock_t *ptl;
301+
302+
_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
303+
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
304+
305+
if (mfill_file_over_size(dst_vma, dst_addr)) {
306+
ret = -EFAULT;
307+
goto out_unlock;
308+
}
309+
310+
ret = -EEXIST;
311+
/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
312+
if (!pte_none(*dst_pte))
313+
goto out_unlock;
314+
315+
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
316+
317+
/* No need to invalidate - it was non-present before */
318+
update_mmu_cache(dst_vma, dst_addr, dst_pte);
319+
ret = 0;
320+
out_unlock:
321+
pte_unmap_unlock(dst_pte, ptl);
322+
return ret;
323+
}
324+
291325
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
292326
{
293327
pgd_t *pgd;
@@ -339,7 +373,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
339373
* by THP. Since we can not reliably insert a zero page, this
340374
* feature is not supported.
341375
*/
342-
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
376+
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) ||
377+
uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
343378
mmap_read_unlock(dst_mm);
344379
return -EINVAL;
345380
}
@@ -483,6 +518,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
483518
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
484519
return mfill_atomic_pte_continue(dst_pmd, dst_vma,
485520
dst_addr, flags);
521+
} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
522+
return mfill_atomic_pte_poison(dst_pmd, dst_vma,
523+
dst_addr, flags);
486524
}
487525

488526
/*
@@ -704,6 +742,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
704742
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
705743
}
706744

745+
ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
746+
unsigned long len, atomic_t *mmap_changing,
747+
uffd_flags_t flags)
748+
{
749+
return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
750+
uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
751+
}
752+
707753
long uffd_wp_range(struct vm_area_struct *dst_vma,
708754
unsigned long start, unsigned long len, bool enable_wp)
709755
{

0 commit comments

Comments
 (0)