Skip to content

Commit c56e022

Browse files
committed
io_uring: add support for user mapped provided buffer ring
The ring mapped provided buffer rings rely on the application allocating the memory for the ring, and then the kernel will map it. This generally works fine, but runs into issues on some architectures where we need to be able to ensure that the kernel and application virtual address for the ring play nicely together. This at least impacts architectures that set SHM_COLOUR, but potentially also anyone setting SHMLBA. To use this variant of ring provided buffers, the application need not allocate any memory for the ring. Instead the kernel will do so, and the application must subsequently call mmap(2) on the ring with the offset set to: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get a virtual address for the buffer ring. Normally the application would allocate a suitable (and correctly aligned) piece of memory and simply pass that in via io_uring_buf_reg.ring_addr and the kernel would map it. Outside of the setup differences, the kernel allocated + user mapped provided buffer ring works exactly the same. Acked-by: Helge Deller <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent 81cf17c commit c56e022

File tree

4 files changed

+109
-24
lines changed

4 files changed

+109
-24
lines changed

include/uapi/linux/io_uring.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,9 @@ enum {
389389
#define IORING_OFF_SQ_RING 0ULL
390390
#define IORING_OFF_CQ_RING 0x8000000ULL
391391
#define IORING_OFF_SQES 0x10000000ULL
392+
#define IORING_OFF_PBUF_RING 0x80000000ULL
393+
#define IORING_OFF_PBUF_SHIFT 16
394+
#define IORING_OFF_MMAP_MASK 0xf8000000ULL
392395

393396
/*
394397
* Filled with the offset for mmap(2)
@@ -635,6 +638,20 @@ struct io_uring_buf_ring {
635638
};
636639
};
637640

641+
/*
642+
* Flags for IORING_REGISTER_PBUF_RING.
643+
*
644+
* IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
645+
* The application must not set a ring_addr in struct
646+
* io_uring_buf_reg, instead it must subsequently call
647+
* mmap(2) with the offset set as:
648+
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
649+
* to get a virtual mapping for the ring.
650+
*/
651+
enum {
652+
/*
 * Tested against io_uring_buf_reg.flags when a buffer ring is registered;
 * registration rejects any other flag bit with -EINVAL, and rejects a
 * non-zero ring_addr with -EINVAL when this flag is set.
 */
IOU_PBUF_RING_MMAP = 1,
653+
};
654+
638655
/* argument for IORING_(UN)REGISTER_PBUF_RING */
639656
struct io_uring_buf_reg {
640657
__u64 ring_addr;

io_uring/io_uring.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3289,14 +3289,25 @@ static void *io_uring_validate_mmap_request(struct file *file,
32893289
struct page *page;
32903290
void *ptr;
32913291

3292-
switch (offset) {
3292+
switch (offset & IORING_OFF_MMAP_MASK) {
32933293
case IORING_OFF_SQ_RING:
32943294
case IORING_OFF_CQ_RING:
32953295
ptr = ctx->rings;
32963296
break;
32973297
case IORING_OFF_SQES:
32983298
ptr = ctx->sq_sqes;
32993299
break;
3300+
case IORING_OFF_PBUF_RING: {
3301+
unsigned int bgid;
3302+
3303+
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
3304+
mutex_lock(&ctx->uring_lock);
3305+
ptr = io_pbuf_get_address(ctx, bgid);
3306+
mutex_unlock(&ctx->uring_lock);
3307+
if (!ptr)
3308+
return ERR_PTR(-EINVAL);
3309+
break;
3310+
}
33003311
default:
33013312
return ERR_PTR(-EINVAL);
33023313
}

io_uring/kbuf.c

Lines changed: 76 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
137137
return NULL;
138138

139139
head &= bl->mask;
140-
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
140+
/* mmaped buffers are always contig */
141+
if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
141142
buf = &br->bufs[head];
142143
} else {
143144
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
@@ -214,15 +215,27 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
214215
if (!nbufs)
215216
return 0;
216217

217-
if (bl->is_mapped && bl->buf_nr_pages) {
218-
int j;
219-
218+
if (bl->is_mapped) {
220219
i = bl->buf_ring->tail - bl->head;
221-
for (j = 0; j < bl->buf_nr_pages; j++)
222-
unpin_user_page(bl->buf_pages[j]);
223-
kvfree(bl->buf_pages);
224-
bl->buf_pages = NULL;
225-
bl->buf_nr_pages = 0;
220+
if (bl->is_mmap) {
221+
if (bl->buf_ring) {
222+
struct page *page;
223+
224+
page = virt_to_head_page(bl->buf_ring);
225+
if (put_page_testzero(page))
226+
free_compound_page(page);
227+
bl->buf_ring = NULL;
228+
}
229+
bl->is_mmap = 0;
230+
} else if (bl->buf_nr_pages) {
231+
int j;
232+
233+
for (j = 0; j < bl->buf_nr_pages; j++)
234+
unpin_user_page(bl->buf_pages[j]);
235+
kvfree(bl->buf_pages);
236+
bl->buf_pages = NULL;
237+
bl->buf_nr_pages = 0;
238+
}
226239
/* make sure it's seen as empty */
227240
INIT_LIST_HEAD(&bl->buf_list);
228241
bl->is_mapped = 0;
@@ -482,6 +495,25 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
482495
bl->buf_nr_pages = nr_pages;
483496
bl->buf_ring = br;
484497
bl->is_mapped = 1;
498+
bl->is_mmap = 0;
499+
return 0;
500+
}
501+
502+
/*
 * Allocate kernel memory for a provided buffer ring instead of pinning
 * application memory, and attach it to @bl.
 *
 * @reg: registration request; only reg->ring_entries is read here.
 * @bl:  buffer list that receives the ring; on success bl->buf_ring points
 *       at the allocation and both bl->is_mapped and bl->is_mmap are set.
 *
 * Returns 0 on success, or -ENOMEM if the pages cannot be allocated (in
 * which case @bl is left untouched).
 */
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
503+
struct io_buffer_list *bl)
504+
{
505+
/*
 * __GFP_COMP pairs with the teardown in __io_remove_buffers(), which
 * looks up the head page and frees it with free_compound_page().
 */
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
506+
size_t ring_size;
507+
void *ptr;
508+
509+
/*
 * NOTE(review): the per-entry size is taken from
 * sizeof(struct io_uring_buf_ring) -- confirm it matches the size of one
 * ring entry.
 */
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
510+
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
511+
if (!ptr)
512+
return -ENOMEM;
513+
514+
bl->buf_ring = ptr;
515+
bl->is_mapped = 1;
516+
/* marks the ring as kernel allocated, checked by io_pbuf_get_address() */
bl->is_mmap = 1;
485517
return 0;
486518
}
487519

@@ -496,12 +528,18 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
496528

497529
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
498530
return -EINVAL;
499-
if (reg.flags)
500-
return -EINVAL;
501-
if (!reg.ring_addr)
502-
return -EFAULT;
503-
if (reg.ring_addr & ~PAGE_MASK)
531+
if (reg.flags & ~IOU_PBUF_RING_MMAP)
504532
return -EINVAL;
533+
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
534+
if (!reg.ring_addr)
535+
return -EFAULT;
536+
if (reg.ring_addr & ~PAGE_MASK)
537+
return -EINVAL;
538+
} else {
539+
if (reg.ring_addr)
540+
return -EINVAL;
541+
}
542+
505543
if (!is_power_of_2(reg.ring_entries))
506544
return -EINVAL;
507545

@@ -526,17 +564,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
526564
return -ENOMEM;
527565
}
528566

529-
ret = io_pin_pbuf_ring(&reg, bl);
530-
if (ret) {
531-
kfree(free_bl);
532-
return ret;
533-
}
567+
if (!(reg.flags & IOU_PBUF_RING_MMAP))
568+
ret = io_pin_pbuf_ring(&reg, bl);
569+
else
570+
ret = io_alloc_pbuf_ring(&reg, bl);
534571

535-
bl->nr_entries = reg.ring_entries;
536-
bl->mask = reg.ring_entries - 1;
572+
if (!ret) {
573+
bl->nr_entries = reg.ring_entries;
574+
bl->mask = reg.ring_entries - 1;
537575

538-
io_buffer_add_list(ctx, bl, reg.bgid);
539-
return 0;
576+
io_buffer_add_list(ctx, bl, reg.bgid);
577+
return 0;
578+
}
579+
580+
kfree(free_bl);
581+
return ret;
540582
}
541583

542584
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
@@ -564,3 +606,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
564606
}
565607
return 0;
566608
}
609+
610+
/*
 * Return the kernel address of the buffer ring registered for group @bgid,
 * for use when servicing an mmap request.
 *
 * Returns NULL if no buffer list exists for @bgid or if the list was not
 * set up as a kernel allocated (is_mmap) ring. The mmap validation path
 * calls this with ctx->uring_lock held.
 */
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
611+
{
612+
struct io_buffer_list *bl;
613+
614+
bl = io_buffer_get_list(ctx, bgid);
615+
/* only kernel allocated rings may be handed out for mmap */
if (!bl || !bl->is_mmap)
616+
return NULL;
617+
618+
return bl->buf_ring;
619+
}

io_uring/kbuf.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ struct io_buffer_list {
2626

2727
/* ring mapped provided buffers */
2828
__u8 is_mapped;
29+
/* ring mapped provided buffers, but mmap'ed by application */
30+
__u8 is_mmap;
2931
};
3032

3133
struct io_buffer {
@@ -53,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
5355

5456
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
5557

58+
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
59+
5660
static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
5761
{
5862
/*

0 commit comments

Comments
 (0)