Skip to content

Commit f78c436

Browse files
committed
Btrfs: fix race between block group relocation and nocow writes
Relocation of a block group waits for all existing tasks flushing delalloc, starting direct IO writes and any ordered extents before starting the relocation process. However for direct IO writes that end up doing nocow (inode either has the flag nodatacow set or the write is against a prealloc extent) we have a short time window that allows for a race that makes relocation proceed without waiting for the direct IO write to complete first, resulting in data loss after the relocation finishes. This is illustrated by the following diagram:

           CPU 1                                       CPU 2

 btrfs_relocate_block_group(bg X)

                                           direct IO write starts against
                                           an extent in block group X
                                           using nocow mode (inode has the
                                           nodatacow flag or the write is
                                           for a prealloc extent)

                                           btrfs_direct_IO()
                                             btrfs_get_blocks_direct()
                                               --> can_nocow_extent() returns 1

   btrfs_inc_block_group_ro(bg X)
     --> turns block group into RO mode

   btrfs_wait_ordered_roots()
     --> returns and does not know about
         the DIO write happening at CPU 2
         (the task there has not created
          yet an ordered extent)

   relocate_block_group(bg X)
     --> rc->stage == MOVE_DATA_EXTENTS

     find_next_extent()
       --> returns extent that the DIO
           write is going to write to

     relocate_data_extent()

       relocate_file_extent_cluster()

         --> reads the extent from disk into
             pages belonging to the relocation
             inode and dirties them

                                               --> creates DIO ordered extent

                                             btrfs_submit_direct()
                                               --> submits bio against a location
                                                   on disk obtained from an extent
                                                   map before the relocation started

   btrfs_wait_ordered_range()
     --> writes all the pages read before
         to disk (belonging to the
         relocation inode)

   relocation finishes

                                             bio completes and wrote new data
                                             to the old location of the block
                                             group

So fix this by tracking the number of nocow writers for a block group and make sure relocation waits for that number to go down to 0 before starting to move the extents.
The same race can also happen with buffered writes in nocow mode since the patch I recently made titled "Btrfs: don't do unnecessary delalloc flushes when relocating", because we are no longer flushing all delalloc which served as a synchronization mechanism (due to page locking) and ensured the ordered extents for nocow buffered writes were created before we called btrfs_wait_ordered_roots(). The race with direct IO writes in nocow mode existed before that patch (no pages are locked or used during direct IO) and that fixed only races with direct IO writes that do cow.

Signed-off-by: Filipe Manana <[email protected]>
Reviewed-by: Josef Bacik <[email protected]>
1 parent 0b90191 commit f78c436

File tree

4 files changed

+81
-1
lines changed

4 files changed

+81
-1
lines changed

fs/btrfs/ctree.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,6 +1419,16 @@ struct btrfs_block_group_cache {
14191419
*/
14201420
atomic_t reservations;
14211421

1422+
/*
1423+
* Incremented while holding the spinlock *lock* by a task checking if
1424+
* it can perform a nocow write (incremented if the value for the *ro*
1425+
* field is 0). Decremented by such tasks once they create an ordered
1426+
* extent or before that if some error happens before reaching that step.
1427+
* This is to prevent races between block group relocation and nocow
1428+
* writes through direct IO.
1429+
*/
1430+
atomic_t nocow_writers;
1431+
14221432
/* Lock for free space tree operations. */
14231433
struct mutex free_space_lock;
14241434

@@ -3513,6 +3523,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
35133523
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
35143524
const u64 start);
35153525
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
3526+
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
3527+
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
3528+
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
35163529
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
35173530
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
35183531
struct btrfs_root *root, unsigned long count);

fs/btrfs/extent-tree.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3824,6 +3824,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
38243824
return readonly;
38253825
}
38263826

3827+
/*
 * Register the caller as a nocow writer on the block group that contains
 * @bytenr.
 *
 * Returns true if the writer was counted. In that case the block group
 * reference obtained by the lookup is intentionally kept and is released
 * later by the matching btrfs_dec_nocow_writers() call.
 *
 * Returns false if no block group is found for @bytenr or if the block
 * group is read-only; nothing is left held in either case.
 */
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;
	bool counted;

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (cache == NULL)
		return false;

	/* The ro check and the increment happen under the group's lock. */
	spin_lock(&cache->lock);
	counted = !cache->ro;
	if (counted)
		atomic_inc(&cache->nocow_writers);
	spin_unlock(&cache->lock);

	/* Success: keep the reference for btrfs_dec_nocow_writers(). */
	if (counted)
		return true;

	/* Read-only group: give back the reference from our lookup. */
	btrfs_put_block_group(cache);
	return false;
}
3850+
3851+
/*
 * Undo a successful btrfs_inc_nocow_writers() for the block group that
 * contains @bytenr: decrement the nocow writer count and, when it reaches
 * zero, wake up tasks waiting on that counter.
 */
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache =
		btrfs_lookup_block_group(fs_info, bytenr);

	ASSERT(cache);

	/* Last writer gone: notify anyone sleeping on the counter. */
	if (atomic_dec_and_test(&cache->nocow_writers))
		wake_up_atomic_t(&cache->nocow_writers);

	/* Drop the reference taken by the lookup just above ... */
	btrfs_put_block_group(cache);
	/* ... and the one left held by the earlier btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(cache);
}
3866+
3867+
static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
3868+
{
3869+
schedule();
3870+
return 0;
3871+
}
3872+
3873+
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3874+
{
3875+
wait_on_atomic_t(&bg->nocow_writers,
3876+
btrfs_wait_nocow_writers_atomic_t,
3877+
TASK_UNINTERRUPTIBLE);
3878+
}
3879+
38273880
static const char *alloc_name(u64 flags)
38283881
{
38293882
switch (flags) {

fs/btrfs/inode.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1382,6 +1382,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
13821382
*/
13831383
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
13841384
goto out_check;
1385+
if (!btrfs_inc_nocow_writers(root->fs_info,
1386+
disk_bytenr))
1387+
goto out_check;
13851388
nocow = 1;
13861389
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
13871390
extent_end = found_key.offset +
@@ -1396,6 +1399,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
13961399
path->slots[0]++;
13971400
if (!nolock && nocow)
13981401
btrfs_end_write_no_snapshoting(root);
1402+
if (nocow)
1403+
btrfs_dec_nocow_writers(root->fs_info,
1404+
disk_bytenr);
13991405
goto next_slot;
14001406
}
14011407
if (!nocow) {
@@ -1416,6 +1422,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
14161422
if (ret) {
14171423
if (!nolock && nocow)
14181424
btrfs_end_write_no_snapshoting(root);
1425+
if (nocow)
1426+
btrfs_dec_nocow_writers(root->fs_info,
1427+
disk_bytenr);
14191428
goto error;
14201429
}
14211430
cow_start = (u64)-1;
@@ -1458,6 +1467,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
14581467

14591468
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
14601469
num_bytes, num_bytes, type);
1470+
if (nocow)
1471+
btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
14611472
BUG_ON(ret); /* -ENOMEM */
14621473

14631474
if (root->root_key.objectid ==
@@ -7657,7 +7668,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
76577668
block_start = em->block_start + (start - em->start);
76587669

76597670
if (can_nocow_extent(inode, start, &len, &orig_start,
7660-
&orig_block_len, &ram_bytes) == 1) {
7671+
&orig_block_len, &ram_bytes) == 1 &&
7672+
btrfs_inc_nocow_writers(root->fs_info, block_start)) {
76617673

76627674
/*
76637675
* Create the ordered extent before the extent map. This
@@ -7672,6 +7684,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
76727684
*/
76737685
ret = btrfs_add_ordered_extent_dio(inode, start,
76747686
block_start, len, len, type);
7687+
btrfs_dec_nocow_writers(root->fs_info, block_start);
76757688
if (ret) {
76767689
free_extent_map(em);
76777690
goto unlock_err;

fs/btrfs/relocation.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4255,6 +4255,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
42554255
rc->block_group->key.objectid, rc->block_group->flags);
42564256

42574257
btrfs_wait_block_group_reservations(rc->block_group);
4258+
btrfs_wait_nocow_writers(rc->block_group);
42584259
btrfs_wait_ordered_roots(fs_info, -1,
42594260
rc->block_group->key.objectid,
42604261
rc->block_group->key.offset);

0 commit comments

Comments
 (0)