Commit e34ecee

Author: Kent Overstreet
aio: Fix a trinity splat
aio kiocb refcounting was broken - it was relying on keeping track of the number of available ring buffer entries, which it needs to do anyways; then at shutdown time it'd wait for completions to be delivered until the # of available ring buffer entries equalled what it was initialized to.

Problem with that is that the ring buffer is mapped writable into userspace, so userspace could futz with the head and tail pointers to cause the kernel to see extra completions, and cause free_ioctx() to return while there were still outstanding kiocbs. Which would be bad.

Fix is just to directly refcount the kiocbs - which is more straightforward, and with the new percpu refcounting code doesn't cost us any cacheline bouncing which was the whole point of the original scheme.

Also clean up ioctx_alloc()'s error path and fix a bug where it wasn't subtracting from aio_nr if ioctx_add_table() failed.

Signed-off-by: Kent Overstreet <[email protected]>
1 parent d0e639c commit e34ecee
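
For context, the percpu-ref lifecycle the fix relies on follows the pattern sketched below. This is a minimal illustration only, not code from fs/aio.c: the names my_ctx, my_ctx_alloc(), my_request_start(), my_request_done() and my_ctx_kill() are hypothetical, while the percpu_ref_init()/get()/put()/kill() calls follow the 3.13-era API the commit uses (release callback passed to percpu_ref_init(), no gfp/flags arguments).

/* Sketch of the percpu_ref pattern used by this commit (hypothetical names). */
#include <linux/percpu-refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_ctx {
	struct percpu_ref	reqs;		/* one reference per in-flight request */
	struct work_struct	free_work;
};

static void my_ctx_free_work(struct work_struct *work)
{
	struct my_ctx *ctx = container_of(work, struct my_ctx, free_work);

	kfree(ctx);
}

/* Release callback: runs once the last reference has been dropped. */
static void my_ctx_release(struct percpu_ref *ref)
{
	struct my_ctx *ctx = container_of(ref, struct my_ctx, reqs);

	/* May be called from atomic context; defer the freeing to a workqueue. */
	INIT_WORK(&ctx->free_work, my_ctx_free_work);
	schedule_work(&ctx->free_work);
}

static struct my_ctx *my_ctx_alloc(void)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;

	/* The initial reference belongs to the context itself. */
	if (percpu_ref_init(&ctx->reqs, my_ctx_release)) {
		kfree(ctx);
		return NULL;
	}

	return ctx;
}

/* Per request: take a reference at submission, drop it at completion. */
static void my_request_start(struct my_ctx *ctx)
{
	percpu_ref_get(&ctx->reqs);
}

static void my_request_done(struct my_ctx *ctx)
{
	percpu_ref_put(&ctx->reqs);
}

/*
 * Teardown: kill the percpu counter (falling back to the atomic counter),
 * then drop the initial reference. The release callback only fires after
 * every outstanding request has dropped its reference.
 */
static void my_ctx_kill(struct my_ctx *ctx)
{
	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
}

In the percpu (fast-path) state, gets and puts touch only per-cpu counters, so refcounting every kiocb costs no cross-CPU cacheline bouncing; the expensive accounting happens only once, at percpu_ref_kill() time.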

1 file changed: +48 -81 lines changed

fs/aio.c

Lines changed: 48 additions & 81 deletions
@@ -80,6 +80,8 @@ struct kioctx {
 	struct percpu_ref	users;
 	atomic_t		dead;
 
+	struct percpu_ref	reqs;
+
 	unsigned long		user_id;
 
 	struct __percpu kioctx_cpu *cpu;
@@ -107,7 +109,6 @@ struct kioctx {
 	struct page		**ring_pages;
 	long			nr_pages;
 
-	struct rcu_head		rcu_head;
 	struct work_struct	free_work;
 
 	struct {
@@ -412,26 +413,34 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
 	return cancel(kiocb);
 }
 
-static void free_ioctx_rcu(struct rcu_head *head)
+static void free_ioctx(struct work_struct *work)
 {
-	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+
+	pr_debug("freeing %p\n", ctx);
 
+	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
+static void free_ioctx_reqs(struct percpu_ref *ref)
+{
+	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+
+	INIT_WORK(&ctx->free_work, free_ioctx);
+	schedule_work(&ctx->free_work);
+}
+
 /*
  * When this function runs, the kioctx has been removed from the "hash table"
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx(struct work_struct *work)
+static void free_ioctx_users(struct percpu_ref *ref)
 {
-	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
-	struct aio_ring *ring;
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
 	struct kiocb *req;
-	unsigned cpu, avail;
-	DEFINE_WAIT(wait);
 
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -445,54 +454,8 @@ static void free_ioctx(struct work_struct *work)
 
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	for_each_possible_cpu(cpu) {
-		struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
-
-		atomic_add(kcpu->reqs_available, &ctx->reqs_available);
-		kcpu->reqs_available = 0;
-	}
-
-	while (1) {
-		prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
-		ring = kmap_atomic(ctx->ring_pages[0]);
-		avail = (ring->head <= ring->tail)
-			 ? ring->tail - ring->head
-			 : ctx->nr_events - ring->head + ring->tail;
-
-		atomic_add(avail, &ctx->reqs_available);
-		ring->head = ring->tail;
-		kunmap_atomic(ring);
-
-		if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
-			break;
-
-		schedule();
-	}
-	finish_wait(&ctx->wait, &wait);
-
-	WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
-
-	aio_free_ring(ctx);
-
-	pr_debug("freeing %p\n", ctx);
-
-	/*
-	 * Here the call_rcu() is between the wait_event() for reqs_active to
-	 * hit 0, and freeing the ioctx.
-	 *
-	 * aio_complete() decrements reqs_active, but it has to touch the ioctx
-	 * after to issue a wakeup so we use rcu.
-	 */
-	call_rcu(&ctx->rcu_head, free_ioctx_rcu);
-}
-
-static void free_ioctx_ref(struct percpu_ref *ref)
-{
-	struct kioctx *ctx = container_of(ref, struct kioctx, users);
-
-	INIT_WORK(&ctx->free_work, free_ioctx);
-	schedule_work(&ctx->free_work);
+	percpu_ref_kill(&ctx->reqs);
+	percpu_ref_put(&ctx->reqs);
 }
 
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
@@ -551,6 +514,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	}
 }
 
+static void aio_nr_sub(unsigned nr)
+{
+	spin_lock(&aio_nr_lock);
+	if (WARN_ON(aio_nr - nr > aio_nr))
+		aio_nr = 0;
+	else
+		aio_nr -= nr;
+	spin_unlock(&aio_nr_lock);
+}
+
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
  */
@@ -588,8 +561,11 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_ref))
-		goto out_freectx;
+	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+		goto err;
+
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+		goto err;
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -600,10 +576,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
 	if (!ctx->cpu)
-		goto out_freeref;
+		goto err;
 
 	if (aio_setup_ring(ctx) < 0)
-		goto out_freepcpu;
+		goto err;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
 	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
@@ -615,7 +591,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
-		goto out_cleanup;
+		err = -EAGAIN;
+		goto err;
 	}
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
@@ -624,23 +601,19 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	err = ioctx_add_table(ctx, mm);
 	if (err)
-		goto out_cleanup_put;
+		goto err_cleanup;
 
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
-out_cleanup_put:
-	percpu_ref_put(&ctx->users);
-out_cleanup:
-	err = -EAGAIN;
+err_cleanup:
+	aio_nr_sub(ctx->max_reqs);
+err:
 	aio_free_ring(ctx);
-out_freepcpu:
 	free_percpu(ctx->cpu);
-out_freeref:
+	free_percpu(ctx->reqs.pcpu_count);
 	free_percpu(ctx->users.pcpu_count);
-out_freectx:
-	put_aio_ring_file(ctx);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
@@ -675,10 +648,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 	 * -EAGAIN with no ioctxs actually in use (as far as userspace
 	 * could tell).
 	 */
-	spin_lock(&aio_nr_lock);
-	BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
-	aio_nr -= ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	aio_nr_sub(ctx->max_reqs);
 
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
@@ -810,6 +780,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 	if (unlikely(!req))
 		goto out_put;
 
+	percpu_ref_get(&ctx->reqs);
+
 	req->ki_ctx = ctx;
 	return req;
 out_put:
@@ -879,12 +851,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 		return;
 	}
 
-	/*
-	 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
-	 * need to issue a wakeup after incrementing reqs_available.
-	 */
-	rcu_read_lock();
-
 	if (iocb->ki_list.next) {
 		unsigned long flags;
 
@@ -959,7 +925,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
 
-	rcu_read_unlock();
+	percpu_ref_put(&ctx->reqs);
 }
 EXPORT_SYMBOL(aio_complete);
 
@@ -1370,6 +1336,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	return 0;
 out_put_req:
 	put_reqs_available(ctx, 1);
+	percpu_ref_put(&ctx->reqs);
 	kiocb_free(req);
 	return ret;
 }
