Skip to content

Commit 811c668

Browse files
committed
rbd: fix rbd map vs notify races
A while ago, commit 9875201 ("rbd: fix use-after free of rbd_dev->disk") fixed rbd unmap vs notify race by introducing an exported wrapper for flushing notifies and sticking it into do_rbd_remove().

A similar problem exists on the rbd map path, though: the watch is registered in rbd_dev_image_probe(), while the disk is set up quite a few steps later, in rbd_dev_device_setup(). Nothing prevents a notify from coming in and crashing on a NULL rbd_dev->disk:

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000050
    Call Trace:
     [<ffffffffa0508344>] rbd_watch_cb+0x34/0x180 [rbd]
     [<ffffffffa04bd290>] do_event_work+0x40/0xb0 [libceph]
     [<ffffffff8109d5db>] process_one_work+0x17b/0x470
     [<ffffffff8109e3ab>] worker_thread+0x11b/0x400
     [<ffffffff8109e290>] ? rescuer_thread+0x400/0x400
     [<ffffffff810a5acf>] kthread+0xcf/0xe0
     [<ffffffff810b41b3>] ? finish_task_switch+0x53/0x170
     [<ffffffff810a5a00>] ? kthread_create_on_node+0x140/0x140
     [<ffffffff81645dd8>] ret_from_fork+0x58/0x90
     [<ffffffff810a5a00>] ? kthread_create_on_node+0x140/0x140
    RIP [<ffffffffa050828a>] rbd_dev_refresh+0xfa/0x180 [rbd]

If an error occurs during rbd map, we have to error out, potentially tearing down a watch. Just like on rbd unmap, notifies have to be flushed, otherwise rbd_watch_cb() may end up trying to read in the image header after rbd_dev_image_release() has run:

    Assertion failure in rbd_dev_header_info() at line 4722:

     rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

    Call Trace:
     [<ffffffff81cccee0>] ? rbd_parent_request_create+0x150/0x150
     [<ffffffff81cd4e59>] rbd_dev_refresh+0x59/0x390
     [<ffffffff81cd5229>] rbd_watch_cb+0x69/0x290
     [<ffffffff81fde9bf>] do_event_work+0x10f/0x1c0
     [<ffffffff81107799>] process_one_work+0x689/0x1a80
     [<ffffffff811076f7>] ? process_one_work+0x5e7/0x1a80
     [<ffffffff81132065>] ? finish_task_switch+0x225/0x640
     [<ffffffff81107110>] ? pwq_dec_nr_in_flight+0x2b0/0x2b0
     [<ffffffff81108c69>] worker_thread+0xd9/0x1320
     [<ffffffff81108b90>] ? process_one_work+0x1a80/0x1a80
     [<ffffffff8111b02d>] kthread+0x21d/0x2e0
     [<ffffffff8111ae10>] ? kthread_stop+0x550/0x550
     [<ffffffff82022802>] ret_from_fork+0x22/0x40
     [<ffffffff8111ae10>] ? kthread_stop+0x550/0x550
    RIP [<ffffffff81ccd8f9>] rbd_dev_header_info+0xa19/0x1e30

To fix this, a) check if RBD_DEV_FLAG_EXISTS is set before calling revalidate_disk(), b) move ceph_osdc_flush_notifies() call into rbd_dev_header_unwatch_sync() to cover rbd map error paths and c) turn header read-in into a critical section. The latter also happens to take care of rbd map foo@bar vs rbd snap rm foo@bar race.

Fixes: http://tracker.ceph.com/issues/15490

Signed-off-by: Ilya Dryomov <[email protected]>
Reviewed-by: Josh Durgin <[email protected]>
1 parent 6c1ea26 commit 811c668

File tree

1 file changed

+19
-24
lines changed

1 file changed

+19
-24
lines changed

drivers/block/rbd.c

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,6 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
538538
u8 *order, u64 *snap_size);
539539
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
540540
u64 *snap_features);
541-
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
542541

543542
static int rbd_open(struct block_device *bdev, fmode_t mode)
544543
{
@@ -3127,9 +3126,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
31273126
struct rbd_device *rbd_dev = (struct rbd_device *)data;
31283127
int ret;
31293128

3130-
if (!rbd_dev)
3131-
return;
3132-
31333129
dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
31343130
rbd_dev->header_name, (unsigned long long)notify_id,
31353131
(unsigned int)opcode);
@@ -3263,6 +3259,9 @@ static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
32633259

32643260
ceph_osdc_cancel_event(rbd_dev->watch_event);
32653261
rbd_dev->watch_event = NULL;
3262+
3263+
dout("%s flushing notifies\n", __func__);
3264+
ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
32663265
}
32673266

32683267
/*
@@ -3642,21 +3641,14 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev)
36423641
static void rbd_dev_update_size(struct rbd_device *rbd_dev)
36433642
{
36443643
sector_t size;
3645-
bool removing;
36463644

36473645
/*
3648-
* Don't hold the lock while doing disk operations,
3649-
* or lock ordering will conflict with the bdev mutex via:
3650-
* rbd_add() -> blkdev_get() -> rbd_open()
3646+
* If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3647+
* try to update its size. If REMOVING is set, updating size
3648+
* is just useless work since the device can't be opened.
36513649
*/
3652-
spin_lock_irq(&rbd_dev->lock);
3653-
removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
3654-
spin_unlock_irq(&rbd_dev->lock);
3655-
/*
3656-
* If the device is being removed, rbd_dev->disk has
3657-
* been destroyed, so don't try to update its size
3658-
*/
3659-
if (!removing) {
3650+
if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3651+
!test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
36603652
size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
36613653
dout("setting size to %llu sectors", (unsigned long long)size);
36623654
set_capacity(rbd_dev->disk, size);
@@ -5187,6 +5179,10 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
51875179
return ret;
51885180
}
51895181

5182+
/*
5183+
* rbd_dev->header_rwsem must be locked for write and will be unlocked
5184+
* upon return.
5185+
*/
51905186
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
51915187
{
51925188
int ret;
@@ -5195,7 +5191,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
51955191

51965192
ret = rbd_dev_id_get(rbd_dev);
51975193
if (ret)
5198-
return ret;
5194+
goto err_out_unlock;
51995195

52005196
BUILD_BUG_ON(DEV_NAME_LEN
52015197
< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
@@ -5236,8 +5232,9 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
52365232
/* Everything's ready. Announce the disk to the world. */
52375233

52385234
set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5239-
add_disk(rbd_dev->disk);
5235+
up_write(&rbd_dev->header_rwsem);
52405236

5237+
add_disk(rbd_dev->disk);
52415238
pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
52425239
(unsigned long long) rbd_dev->mapping.size);
52435240

@@ -5252,6 +5249,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
52525249
unregister_blkdev(rbd_dev->major, rbd_dev->name);
52535250
err_out_id:
52545251
rbd_dev_id_put(rbd_dev);
5252+
err_out_unlock:
5253+
up_write(&rbd_dev->header_rwsem);
52555254
return ret;
52565255
}
52575256

@@ -5442,6 +5441,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
54425441
spec = NULL; /* rbd_dev now owns this */
54435442
rbd_opts = NULL; /* rbd_dev now owns this */
54445443

5444+
down_write(&rbd_dev->header_rwsem);
54455445
rc = rbd_dev_image_probe(rbd_dev, 0);
54465446
if (rc < 0)
54475447
goto err_out_rbd_dev;
@@ -5471,6 +5471,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
54715471
return rc;
54725472

54735473
err_out_rbd_dev:
5474+
up_write(&rbd_dev->header_rwsem);
54745475
rbd_dev_destroy(rbd_dev);
54755476
err_out_client:
54765477
rbd_put_client(rbdc);
@@ -5577,12 +5578,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
55775578
return ret;
55785579

55795580
rbd_dev_header_unwatch_sync(rbd_dev);
5580-
/*
5581-
* flush remaining watch callbacks - these must be complete
5582-
* before the osd_client is shutdown
5583-
*/
5584-
dout("%s: flushing notifies", __func__);
5585-
ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
55865581

55875582
/*
55885583
* Don't free anything from rbd_dev->disk until after all

0 commit comments

Comments
 (0)