Skip to content

Commit 893e26e

Browse files
xemultorvalds
authored and committed
userfaultfd: non-cooperative: Add fork() event
When the mm with uffd-ed vmas fork()-s the respective vmas notify their uffds with the event which contains a descriptor with new uffd. This new descriptor can then be used to get events from the child and populate its mm with data. Note, that there can be different uffd-s controlling different vmas within one mm, so first we should collect all those uffds (and ctx-s) in a list and then notify them all one by one but only once per fork(). The context is created at fork() time but the descriptor, file struct and anon inode object is created at event read time. So some trickery is added to the userfaultfd_ctx_read() to handle the ctx queues' locking vs file creation. Another thing worth noticing is that the task that fork()-s waits for the uffd event to get processed WITHOUT the mmap sem. [[email protected]: build warning fix] Link: http://lkml.kernel.org/r/[email protected] Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Pavel Emelyanov <[email protected]> Signed-off-by: Mike Rapoport <[email protected]> Signed-off-by: Andrea Arcangeli <[email protected]> Cc: "Dr. David Alan Gilbert" <[email protected]> Cc: Hillf Danton <[email protected]> Cc: Michael Rapoport <[email protected]> Cc: Mike Kravetz <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 6560314 commit 893e26e

File tree

4 files changed

+170
-16
lines changed

4 files changed

+170
-16
lines changed

fs/userfaultfd.c

Lines changed: 145 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@ struct userfaultfd_ctx {
6464
struct mm_struct *mm;
6565
};
6666

67+
struct userfaultfd_fork_ctx {
68+
struct userfaultfd_ctx *orig;
69+
struct userfaultfd_ctx *new;
70+
struct list_head list;
71+
};
72+
6773
struct userfaultfd_wait_queue {
6874
struct uffd_msg msg;
6975
wait_queue_t wq;
@@ -465,9 +471,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
465471
return ret;
466472
}
467473

468-
static int __maybe_unused userfaultfd_event_wait_completion(
469-
struct userfaultfd_ctx *ctx,
470-
struct userfaultfd_wait_queue *ewq)
474+
static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
475+
struct userfaultfd_wait_queue *ewq)
471476
{
472477
int ret = 0;
473478

@@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
518523
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
519524
}
520525

526+
int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
527+
{
528+
struct userfaultfd_ctx *ctx = NULL, *octx;
529+
struct userfaultfd_fork_ctx *fctx;
530+
531+
octx = vma->vm_userfaultfd_ctx.ctx;
532+
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
533+
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
534+
vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
535+
return 0;
536+
}
537+
538+
list_for_each_entry(fctx, fcs, list)
539+
if (fctx->orig == octx) {
540+
ctx = fctx->new;
541+
break;
542+
}
543+
544+
if (!ctx) {
545+
fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
546+
if (!fctx)
547+
return -ENOMEM;
548+
549+
ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
550+
if (!ctx) {
551+
kfree(fctx);
552+
return -ENOMEM;
553+
}
554+
555+
atomic_set(&ctx->refcount, 1);
556+
ctx->flags = octx->flags;
557+
ctx->state = UFFD_STATE_RUNNING;
558+
ctx->features = octx->features;
559+
ctx->released = false;
560+
ctx->mm = vma->vm_mm;
561+
atomic_inc(&ctx->mm->mm_users);
562+
563+
userfaultfd_ctx_get(octx);
564+
fctx->orig = octx;
565+
fctx->new = ctx;
566+
list_add_tail(&fctx->list, fcs);
567+
}
568+
569+
vma->vm_userfaultfd_ctx.ctx = ctx;
570+
return 0;
571+
}
572+
573+
static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
574+
{
575+
struct userfaultfd_ctx *ctx = fctx->orig;
576+
struct userfaultfd_wait_queue ewq;
577+
578+
msg_init(&ewq.msg);
579+
580+
ewq.msg.event = UFFD_EVENT_FORK;
581+
ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
582+
583+
return userfaultfd_event_wait_completion(ctx, &ewq);
584+
}
585+
586+
void dup_userfaultfd_complete(struct list_head *fcs)
587+
{
588+
int ret = 0;
589+
struct userfaultfd_fork_ctx *fctx, *n;
590+
591+
list_for_each_entry_safe(fctx, n, fcs, list) {
592+
if (!ret)
593+
ret = dup_fctx(fctx);
594+
list_del(&fctx->list);
595+
kfree(fctx);
596+
}
597+
}
598+
521599
static int userfaultfd_release(struct inode *inode, struct file *file)
522600
{
523601
struct userfaultfd_ctx *ctx = file->private_data;
@@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
653731
}
654732
}
655733

734+
static const struct file_operations userfaultfd_fops;
735+
736+
static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
737+
struct userfaultfd_ctx *new,
738+
struct uffd_msg *msg)
739+
{
740+
int fd;
741+
struct file *file;
742+
unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
743+
744+
fd = get_unused_fd_flags(flags);
745+
if (fd < 0)
746+
return fd;
747+
748+
file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
749+
O_RDWR | flags);
750+
if (IS_ERR(file)) {
751+
put_unused_fd(fd);
752+
return PTR_ERR(file);
753+
}
754+
755+
fd_install(fd, file);
756+
msg->arg.reserved.reserved1 = 0;
757+
msg->arg.fork.ufd = fd;
758+
759+
return 0;
760+
}
761+
656762
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
657763
struct uffd_msg *msg)
658764
{
659765
ssize_t ret;
660766
DECLARE_WAITQUEUE(wait, current);
661767
struct userfaultfd_wait_queue *uwq;
768+
/*
769+
* Handling fork event requires sleeping operations, so
770+
* we drop the event_wqh lock, then do these ops, then
771+
* lock it back and wake up the waiter. While the lock is
772+
* dropped the ewq may go away so we keep track of it
773+
* carefully.
774+
*/
775+
LIST_HEAD(fork_event);
776+
struct userfaultfd_ctx *fork_nctx = NULL;
662777

663778
/* always take the fd_wqh lock before the fault_pending_wqh lock */
664779
spin_lock(&ctx->fd_wqh.lock);
@@ -716,6 +831,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
716831
if (uwq) {
717832
*msg = uwq->msg;
718833

834+
if (uwq->msg.event == UFFD_EVENT_FORK) {
835+
fork_nctx = (struct userfaultfd_ctx *)
836+
(unsigned long)
837+
uwq->msg.arg.reserved.reserved1;
838+
list_move(&uwq->wq.task_list, &fork_event);
839+
spin_unlock(&ctx->event_wqh.lock);
840+
ret = 0;
841+
break;
842+
}
843+
719844
userfaultfd_event_complete(ctx, uwq);
720845
spin_unlock(&ctx->event_wqh.lock);
721846
ret = 0;
@@ -739,6 +864,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
739864
__set_current_state(TASK_RUNNING);
740865
spin_unlock(&ctx->fd_wqh.lock);
741866

867+
if (!ret && msg->event == UFFD_EVENT_FORK) {
868+
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
869+
870+
if (!ret) {
871+
spin_lock(&ctx->event_wqh.lock);
872+
if (!list_empty(&fork_event)) {
873+
uwq = list_first_entry(&fork_event,
874+
typeof(*uwq),
875+
wq.task_list);
876+
list_del(&uwq->wq.task_list);
877+
__add_wait_queue(&ctx->event_wqh, &uwq->wq);
878+
userfaultfd_event_complete(ctx, uwq);
879+
}
880+
spin_unlock(&ctx->event_wqh.lock);
881+
}
882+
}
883+
742884
return ret;
743885
}
744886

include/linux/userfaultfd_k.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
5252
return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
5353
}
5454

55+
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
56+
extern void dup_userfaultfd_complete(struct list_head *);
57+
5558
#else /* CONFIG_USERFAULTFD */
5659

5760
/* mm helpers */
@@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
7679
return false;
7780
}
7881

82+
static inline int dup_userfaultfd(struct vm_area_struct *vma,
83+
struct list_head *l)
84+
{
85+
return 0;
86+
}
87+
88+
static inline void dup_userfaultfd_complete(struct list_head *l)
89+
{
90+
}
91+
7992
#endif /* CONFIG_USERFAULTFD */
8093

8194
#endif /* _LINUX_USERFAULTFD_K_H */

include/uapi/linux/userfaultfd.h

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,7 @@
1818
* means the userland is reading).
1919
*/
2020
#define UFFD_API ((__u64)0xAA)
21-
/*
22-
* After implementing the respective features it will become:
23-
* #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
24-
* UFFD_FEATURE_EVENT_FORK)
25-
*/
26-
#define UFFD_API_FEATURES (0)
21+
#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
2722
#define UFFD_API_IOCTLS \
2823
((__u64)1 << _UFFDIO_REGISTER | \
2924
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -77,6 +72,10 @@ struct uffd_msg {
7772
__u64 address;
7873
} pagefault;
7974

75+
struct {
76+
__u32 ufd;
77+
} fork;
78+
8079
struct {
8180
/* unused reserved fields */
8281
__u64 reserved1;
@@ -90,9 +89,7 @@ struct uffd_msg {
9089
* Start at 0x12 and not at 0 to be more strict against bugs.
9190
*/
9291
#define UFFD_EVENT_PAGEFAULT 0x12
93-
#if 0 /* not available yet */
9492
#define UFFD_EVENT_FORK 0x13
95-
#endif
9693

9794
/* flags for UFFD_EVENT_PAGEFAULT */
9895
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
@@ -111,10 +108,8 @@ struct uffdio_api {
111108
* are to be considered implicitly always enabled in all kernels as
112109
* long as the uffdio_api.api requested matches UFFD_API.
113110
*/
114-
#if 0 /* not available yet */
115111
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
116112
#define UFFD_FEATURE_EVENT_FORK (1<<1)
117-
#endif
118113
__u64 features;
119114

120115
__u64 ioctls;

kernel/fork.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
#include <linux/rmap.h>
5656
#include <linux/ksm.h>
5757
#include <linux/acct.h>
58+
#include <linux/userfaultfd_k.h>
5859
#include <linux/tsacct_kern.h>
5960
#include <linux/cn_proc.h>
6061
#include <linux/freezer.h>
@@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
561562
struct rb_node **rb_link, *rb_parent;
562563
int retval;
563564
unsigned long charge;
565+
LIST_HEAD(uf);
564566

565567
uprobe_start_dup_mmap();
566568
if (down_write_killable(&oldmm->mmap_sem)) {
@@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
617619
if (retval)
618620
goto fail_nomem_policy;
619621
tmp->vm_mm = mm;
622+
retval = dup_userfaultfd(tmp, &uf);
623+
if (retval)
624+
goto fail_nomem_anon_vma_fork;
620625
if (anon_vma_fork(tmp, mpnt))
621626
goto fail_nomem_anon_vma_fork;
622-
tmp->vm_flags &=
623-
~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
627+
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
624628
tmp->vm_next = tmp->vm_prev = NULL;
625-
tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
626629
file = tmp->vm_file;
627630
if (file) {
628631
struct inode *inode = file_inode(file);
@@ -678,6 +681,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
678681
up_write(&mm->mmap_sem);
679682
flush_tlb_mm(oldmm);
680683
up_write(&oldmm->mmap_sem);
684+
dup_userfaultfd_complete(&uf);
681685
fail_uprobe_end:
682686
uprobe_end_dup_mmap();
683687
return retval;

0 commit comments

Comments
 (0)