Skip to content

Commit 734551d

Browse files
isilence authored and axboe committed
io_uring: fix shared sqpoll cancellation hangs
[ 736.982891] INFO: task iou-sqp-4294:4295 blocked for more than 122 seconds.
[ 736.982897] Call Trace:
[ 736.982901]  schedule+0x68/0xe0
[ 736.982903]  io_uring_cancel_sqpoll+0xdb/0x110
[ 736.982908]  io_sqpoll_cancel_cb+0x24/0x30
[ 736.982911]  io_run_task_work_head+0x28/0x50
[ 736.982913]  io_sq_thread+0x4e3/0x720

We call io_uring_cancel_sqpoll() one by one for each ctx either in sq_thread() itself or via task works, and it's intended to cancel all requests of a specified context. However the function uses per-task counters to track the number of inflight requests, so it counts more requests than available via the current io_uring ctx and goes to sleep waiting for them to appear (e.g. from IRQ), which will never happen.

Cancel a bit more than before, i.e. all ctxs that share sqpoll, and continue to use shared counters. Don't forget that we should not remove ctx from the list before running that task_work sqpoll-cancel, otherwise the function wouldn't be able to find the context and will hang.

Reported-by: Joakim Hassila <[email protected]>
Reported-by: Jens Axboe <[email protected]>
Fixes: 37d1e2e ("io_uring: move SQPOLL thread io-wq forked worker")
Cc: [email protected]
Signed-off-by: Pavel Begunkov <[email protected]>
Link: https://lore.kernel.org/r/1bded7e6c6b32e0bae25fce36be2868e46b116a0.1618752958.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <[email protected]>
1 parent 3b763ba commit 734551d

File tree

1 file changed

+14
-13
lines changed

1 file changed

+14
-13
lines changed

fs/io_uring.c

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,7 +1022,7 @@ static void io_uring_del_task_file(unsigned long index);
10221022
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
10231023
struct task_struct *task,
10241024
struct files_struct *files);
1025-
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
1025+
static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
10261026
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
10271027

10281028
static bool io_cqring_fill_event(struct io_kiocb *req, long res, unsigned cflags);
@@ -6870,15 +6870,14 @@ static int io_sq_thread(void *data)
68706870
timeout = jiffies + sqd->sq_thread_idle;
68716871
}
68726872

6873-
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6874-
io_uring_cancel_sqpoll(ctx);
6873+
io_uring_cancel_sqpoll(sqd);
68756874
sqd->thread = NULL;
68766875
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
68776876
io_ring_set_wakeup_flag(ctx);
6878-
mutex_unlock(&sqd->lock);
6879-
68806877
io_run_task_work();
68816878
io_run_task_work_head(&sqd->park_task_work);
6879+
mutex_unlock(&sqd->lock);
6880+
68826881
complete(&sqd->exited);
68836882
do_exit(0);
68846883
}
@@ -8870,11 +8869,11 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
88708869
static void io_sqpoll_cancel_cb(struct callback_head *cb)
88718870
{
88728871
struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
8873-
struct io_ring_ctx *ctx = work->ctx;
8874-
struct io_sq_data *sqd = ctx->sq_data;
8872+
struct io_sq_data *sqd = work->ctx->sq_data;
88758873

88768874
if (sqd->thread)
8877-
io_uring_cancel_sqpoll(ctx);
8875+
io_uring_cancel_sqpoll(sqd);
8876+
list_del_init(&work->ctx->sqd_list);
88788877
complete(&work->completion);
88798878
}
88808879

@@ -8885,14 +8884,15 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
88858884
struct task_struct *task;
88868885

88878886
io_sq_thread_park(sqd);
8888-
list_del_init(&ctx->sqd_list);
88898887
io_sqd_update_thread_idle(sqd);
88908888
task = sqd->thread;
88918889
if (task) {
88928890
init_completion(&work.completion);
88938891
init_task_work(&work.task_work, io_sqpoll_cancel_cb);
88948892
io_task_work_add_head(&sqd->park_task_work, &work.task_work);
88958893
wake_up_process(task);
8894+
} else {
8895+
list_del_init(&ctx->sqd_list);
88968896
}
88978897
io_sq_thread_unpark(sqd);
88988898

@@ -8918,22 +8918,23 @@ static void io_uring_try_cancel(struct files_struct *files)
89188918
}
89198919

89208920
/* should only be called by SQPOLL task */
8921-
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
8921+
static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
89228922
{
8923-
struct io_sq_data *sqd = ctx->sq_data;
89248923
struct io_uring_task *tctx = current->io_uring;
8924+
struct io_ring_ctx *ctx;
89258925
s64 inflight;
89268926
DEFINE_WAIT(wait);
89278927

8928-
WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
8928+
WARN_ON_ONCE(!sqd || sqd->thread != current);
89298929

89308930
atomic_inc(&tctx->in_idle);
89318931
do {
89328932
/* read completions before cancelations */
89338933
inflight = tctx_inflight(tctx, false);
89348934
if (!inflight)
89358935
break;
8936-
io_uring_try_cancel_requests(ctx, current, NULL);
8936+
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
8937+
io_uring_try_cancel_requests(ctx, current, NULL);
89378938

89388939
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
89398940
/*

0 commit comments

Comments
 (0)