
Commit 53ae7e1

Merge tag 'io_uring-6.3-2023-03-03' of git://git.kernel.dk/linux
Pull more io_uring updates from Jens Axboe:
 "Here's a set of fixes/changes that didn't make the first cut, either
  because they got queued before I sent the early merge request, or
  fixes that came in afterwards. In detail:

   - Don't set MSG_NOSIGNAL on recv/recvmsg opcodes, as AF_PACKET will
     error out (David)

   - Fix for spurious poll wakeups (me)

   - Fix for a file leak for buffered reads in certain conditions (Joseph)

   - Don't allow registered buffers of mixed types (Pavel)

   - Improve handling of huge pages for registered buffers (Pavel)

   - Provided buffer ring size calculation fix (Wojciech)

   - Minor cleanups (me)"

* tag 'io_uring-6.3-2023-03-03' of git://git.kernel.dk/linux:
  io_uring/poll: don't pass in wake func to io_init_poll_iocb()
  io_uring: fix fget leak when fs don't support nowait buffered read
  io_uring/poll: allow some retries for poll triggering spuriously
  io_uring: remove MSG_NOSIGNAL from recvmsg
  io_uring/rsrc: always initialize 'folio' to NULL
  io_uring/rsrc: optimise registered huge pages
  io_uring/rsrc: optimise single entry advance
  io_uring/rsrc: disallow multi-source reg buffers
  io_uring: remove unused wq_list_merge
  io_uring: fix size calculation when registering buf ring
  io_uring/rsrc: fix a comment in io_import_fixed()
  io_uring: rename 'in_idle' to 'in_cancel'
  io_uring: consolidate the put_ref-and-return section of adding work
2 parents 9d0281b + 1947ddf commit 53ae7e1

9 files changed: +85, -62 lines

include/linux/io_uring_types.h

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ struct io_uring_task {
 
 	struct xarray		xa;
 	struct wait_queue_head	wait;
-	atomic_t		in_idle;
+	atomic_t		in_cancel;
 	atomic_t		inflight_tracked;
 	struct percpu_counter	inflight;
 

io_uring/io_uring.c

Lines changed: 16 additions & 16 deletions
@@ -719,7 +719,7 @@ static void io_put_task_remote(struct task_struct *task, int nr)
 	struct io_uring_task *tctx = task->io_uring;
 
 	percpu_counter_sub(&tctx->inflight, nr);
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		wake_up(&tctx->wait);
 	put_task_struct_many(task, nr);
 }
@@ -1258,8 +1258,8 @@ void tctx_task_work(struct callback_head *cb)
 
 	ctx_flush_and_put(ctx, &uring_locked);
 
-	/* relaxed read is enough as only the task itself sets ->in_idle */
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	/* relaxed read is enough as only the task itself sets ->in_cancel */
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		io_uring_drop_tctx_refs(current);
 
 	trace_io_uring_task_work_run(tctx, count, loops);
@@ -1285,17 +1285,15 @@ static void io_req_local_work_add(struct io_kiocb *req)
 
 	percpu_ref_get(&ctx->refs);
 
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
-		percpu_ref_put(&ctx->refs);
-		return;
-	}
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+		goto put_ref;
+
 	/* needed for the following wake up */
 	smp_mb__after_atomic();
 
-	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
+	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
 		io_move_task_work_from_local(ctx);
-		percpu_ref_put(&ctx->refs);
-		return;
+		goto put_ref;
 	}
 
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
@@ -1305,6 +1303,8 @@ static void io_req_local_work_add(struct io_kiocb *req)
 
 	if (READ_ONCE(ctx->cq_waiting))
 		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+
+put_ref:
 	percpu_ref_put(&ctx->refs);
 }
 
@@ -1777,7 +1777,7 @@ int io_req_prep_async(struct io_kiocb *req)
 	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 
 	/* assign early for deferred execution for non-fixed file */
-	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
 		req->file = io_file_get_normal(req, req->cqe.fd);
 	if (!cdef->prep_async)
 		return 0;
@@ -2937,12 +2937,12 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb)
 
 	work = container_of(cb, struct io_tctx_exit, task_work);
 	/*
-	 * When @in_idle, we're in cancellation and it's racy to remove the
+	 * When @in_cancel, we're in cancellation and it's racy to remove the
 	 * node. It'll be removed by the end of cancellation, just ignore it.
 	 * tctx can be NULL if the queueing of this task_work raced with
 	 * work cancelation off the exec path.
 	 */
-	if (tctx && !atomic_read(&tctx->in_idle))
+	if (tctx && !atomic_read(&tctx->in_cancel))
 		io_uring_del_tctx_node((unsigned long)work->ctx);
 	complete(&work->completion);
 }
@@ -3210,7 +3210,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (tctx->io_wq)
 		io_wq_exit_start(tctx->io_wq);
 
-	atomic_inc(&tctx->in_idle);
+	atomic_inc(&tctx->in_cancel);
 	do {
 		bool loop = false;
 
@@ -3261,9 +3261,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (cancel_all) {
 		/*
 		 * We shouldn't run task_works after cancel, so just leave
-		 * ->in_idle set for normal exit.
+		 * ->in_cancel set for normal exit.
 		 */
-		atomic_dec(&tctx->in_idle);
+		atomic_dec(&tctx->in_cancel);
 		/* for exec all current's requests should be gone, kill tctx */
 		__io_uring_free(current);
 	}

io_uring/kbuf.c

Lines changed: 1 addition & 1 deletion
@@ -505,7 +505,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	}
 
 	pages = io_pin_pages(reg.ring_addr,
-			     struct_size(br, bufs, reg.ring_entries),
+			     flex_array_size(br, bufs, reg.ring_entries),
			     &nr_pages);
 	if (IS_ERR(pages)) {
 		kfree(free_bl);
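
The size fix matters because struct io_uring_buf_ring keeps its tail word in a union that overlays bufs[0], so the ring occupies exactly ring_entries * sizeof(struct io_uring_buf) bytes; struct_size() added another sizeof(*br) on top and could make io_uring try to pin one page more than userspace actually mapped. Below is a minimal userspace sketch of the two calculations, assuming the uapi definitions from <linux/io_uring.h>; it is illustrative only and not part of this commit.

#include <linux/io_uring.h>
#include <stdio.h>

int main(void)
{
	unsigned int ring_entries = 4096;

	/* what flex_array_size(br, bufs, ring_entries) computes: entries only */
	size_t ring_bytes = ring_entries * sizeof(struct io_uring_buf);

	/* what struct_size() computed: an extra sizeof(*br) on top */
	size_t overcount = sizeof(struct io_uring_buf_ring) + ring_bytes;

	printf("correct pin length: %zu bytes, previous over-count: %zu bytes\n",
	       ring_bytes, overcount);
	return 0;
}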

io_uring/net.c

Lines changed: 1 addition & 1 deletion
@@ -567,7 +567,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->flags = READ_ONCE(sqe->ioprio);
 	if (sr->flags & ~(RECVMSG_FLAGS))
 		return -EINVAL;
-	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	sr->msg_flags = READ_ONCE(sqe->msg_flags);
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	if (sr->msg_flags & MSG_ERRQUEUE)
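
For context, MSG_NOSIGNAL only affects the send path (it suppresses SIGPIPE), so forcing it on receives brought no benefit and made AF_PACKET sockets fail recvmsg with -EINVAL, as noted in the pull request. A minimal sketch of driving IORING_OP_RECVMSG from userspace follows, assuming liburing is available (not part of this commit); with the fix, whatever flags the application puts in the SQE are exactly what sock_recvmsg() sees.

#include <liburing.h>
#include <sys/socket.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[64];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv))
		return 1;
	send(sv[1], "ping", 4, 0);

	if (io_uring_queue_init(4, &ring, 0))
		return 1;
	sqe = io_uring_get_sqe(&ring);
	/* flags == 0: nothing (e.g. MSG_NOSIGNAL) is OR'd in behind our back */
	io_uring_prep_recvmsg(sqe, sv[0], &msg, 0);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("recvmsg res=%d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}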

io_uring/poll.c

Lines changed: 19 additions & 7 deletions
@@ -51,6 +51,9 @@ struct io_poll_table {
 
 #define IO_WQE_F_DOUBLE 1
 
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key);
+
 static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
 {
 	unsigned long priv = (unsigned long)wqe->private;
@@ -164,15 +167,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
 	}
 }
 
-static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
-			      wait_queue_func_t wake_func)
+static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
 {
 	poll->head = NULL;
 #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 	/* mask in events that we always want/need */
 	poll->events = events | IO_POLL_UNMASK;
 	INIT_LIST_HEAD(&poll->wait.entry);
-	init_waitqueue_func_entry(&poll->wait, wake_func);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }
 
 static inline void io_poll_remove_entry(struct io_poll *poll)
@@ -508,7 +510,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
 
 		/* mark as double wq entry */
 		wqe_private |= IO_WQE_F_DOUBLE;
-		io_init_poll_iocb(poll, first->events, first->wait.func);
+		io_init_poll_iocb(poll, first->events);
 		if (!io_poll_double_prepare(req)) {
 			/* the request is completing, just back off */
 			kfree(poll);
@@ -569,7 +571,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 
 	INIT_HLIST_NODE(&req->hash_node);
 	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
-	io_init_poll_iocb(poll, mask, io_poll_wake);
+	io_init_poll_iocb(poll, mask);
 	poll->file = req->file;
 	req->apoll_events = poll->events;
 
@@ -650,6 +652,14 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 }
 
+/*
+ * We can't reliably detect loops in repeated poll triggers and issue
+ * subsequently failing. But rather than fail these immediately, allow a
+ * certain amount of retries before we give up. Given that this condition
+ * should _rarely_ trigger even once, we should be fine with a larger value.
+ */
+#define APOLL_MAX_RETRY 128
+
 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 					     unsigned issue_flags)
 {
@@ -665,14 +675,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 		if (entry == NULL)
 			goto alloc_apoll;
 		apoll = container_of(entry, struct async_poll, cache);
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	} else {
alloc_apoll:
 		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 		if (unlikely(!apoll))
 			return NULL;
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	}
 	apoll->double_poll = NULL;
 	req->apoll = apoll;
+	if (unlikely(!--apoll->poll.retries))
+		return NULL;
 	return apoll;
 }
 
@@ -694,8 +708,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 		return IO_APOLL_ABORTED;
 	if (!file_can_poll(req->file))
 		return IO_APOLL_ABORTED;
-	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
-		return IO_APOLL_ABORTED;
 	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
 		mask |= EPOLLONESHOT;
 

io_uring/poll.h

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ struct io_poll {
 	struct file *file;
 	struct wait_queue_head *head;
 	__poll_t events;
+	int retries;
 	struct wait_queue_entry wait;
 };
 

io_uring/rsrc.c

Lines changed: 45 additions & 13 deletions
@@ -1162,14 +1162,17 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
 	if (pret == nr_pages) {
+		struct file *file = vmas[0]->vm_file;
+
 		/* don't support file backed memory */
 		for (i = 0; i < nr_pages; i++) {
-			struct vm_area_struct *vma = vmas[i];
-
-			if (vma_is_shmem(vma))
+			if (vmas[i]->vm_file != file) {
+				ret = -EINVAL;
+				break;
+			}
+			if (!file)
 				continue;
-			if (vma->vm_file &&
-			    !is_file_hugepages(vma->vm_file)) {
+			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
 				ret = -EOPNOTSUPP;
 				break;
 			}
@@ -1207,6 +1210,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
+	struct folio *folio = NULL;
 
 	*pimu = ctx->dummy_ubuf;
 	if (!iov->iov_base)
@@ -1221,6 +1225,21 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
+	/* If it's a huge page, try to coalesce them into a single bvec entry */
+	if (nr_pages > 1) {
+		folio = page_folio(pages[0]);
+		for (i = 1; i < nr_pages; i++) {
+			if (page_folio(pages[i]) != folio) {
+				folio = NULL;
+				break;
+			}
+		}
+		if (folio) {
+			folio_put_refs(folio, nr_pages - 1);
+			nr_pages = 1;
+		}
+	}
+
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
 		goto done;
@@ -1233,6 +1252,17 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 
 	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
+	/* store original address for later verification */
+	imu->ubuf = (unsigned long) iov->iov_base;
+	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->nr_bvecs = nr_pages;
+	*pimu = imu;
+	ret = 0;
+
+	if (folio) {
+		bvec_set_page(&imu->bvec[0], pages[0], size, off);
+		goto done;
+	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
 
@@ -1241,12 +1271,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		off = 0;
 		size -= vec_len;
 	}
-	/* store original address for later verification */
-	imu->ubuf = (unsigned long) iov->iov_base;
-	imu->ubuf_end = imu->ubuf + iov->iov_len;
-	imu->nr_bvecs = nr_pages;
-	*pimu = imu;
-	ret = 0;
 done:
 	if (ret)
 		kvfree(imu);
@@ -1335,7 +1359,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		return -EFAULT;
 
 	/*
-	 * May not be a start of buffer, set size appropriately
+	 * Might not be a start of buffer, set size appropriately
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
@@ -1361,7 +1385,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset <= bvec->bv_len) {
-			iov_iter_advance(iter, offset);
+			/*
+			 * Note, huge pages buffers consists of one large
+			 * bvec entry and should always go this way. The other
+			 * branch doesn't expect non PAGE_SIZE'd chunks.
+			 */
+			iter->bvec = bvec;
+			iter->nr_segs = bvec->bv_len;
+			iter->count -= offset;
+			iter->iov_offset = offset;
 		} else {
 			unsigned long seg_skip;
 
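
The coalescing above targets registered buffers that sit inside a single folio, typically a huge page: instead of one bvec per PAGE_SIZE chunk, the whole buffer is stored as a single large bvec, and io_import_fixed() takes the new single-entry path. A minimal userspace sketch of registering such a buffer follows, assuming liburing and a reserved 2 MiB hugetlb page (not part of this commit).

#include <liburing.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;
	size_t len = 2 * 1024 * 1024;	/* one 2 MiB huge page */
	void *buf;
	int ret;

	/* needs hugepages reserved, e.g. sysctl vm.nr_hugepages=1 */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	iov.iov_base = buf;
	iov.iov_len = len;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	printf("io_uring_register_buffers: %d\n", ret);

	io_uring_queue_exit(&ring);
	munmap(buf, len);
	return 0;
}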

io_uring/slist.h

Lines changed: 0 additions & 22 deletions
@@ -27,28 +27,6 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
 		list->last = node;
 }
 
-/**
- * wq_list_merge - merge the second list to the first one.
- * @list0: the first list
- * @list1: the second list
- * Return the first node after mergence.
- */
-static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
-						    struct io_wq_work_list *list1)
-{
-	struct io_wq_work_node *ret;
-
-	if (!list0->first) {
-		ret = list1->first;
-	} else {
-		ret = list0->first;
-		list0->last->next = list1->first;
-	}
-	INIT_WQ_LIST(list0);
-	INIT_WQ_LIST(list1);
-	return ret;
-}
-
 static inline void wq_list_add_tail(struct io_wq_work_node *node,
				    struct io_wq_work_list *list)
 {

io_uring/tctx.c

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
-	atomic_set(&tctx->in_idle, 0);
+	atomic_set(&tctx->in_cancel, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
 	init_llist_head(&tctx->task_list);
