Skip to content

Commit 3cd24c6

Browse files
Ethan Lien authored and kdave committed
btrfs: use tagged writepage to mitigate livelock of snapshot
Snapshot is expected to be fast. But if there are writers steadily creating dirty pages in our subvolume, the snapshot may take a very long time to complete. To fix the problem, we use tagged writepage for the snapshot flusher as we do in the generic write_cache_pages(), so we can omit pages dirtied after the snapshot command.

This does not change the semantics regarding which data get to the snapshot if there are pages being dirtied during the snapshotting operation. There's a sync called before the snapshot is taken in both the old and the new case; any IO in flight just after that may be in the snapshot, but this depends on other system effects that might still sync the IO.

We do a simple snapshot speed test on an Intel D-1531 box:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 1m58sec
patched: 6.54sec

This is the best case for this patch since, for a sequential write case, we omit nearly all pages dirtied after the snapshot command.

For a multi-writer, random write test:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 15.83sec
patched: 10.35sec

The improvement is smaller compared to the sequential write case, since we omit only half of the pages dirtied after the snapshot command.

Reviewed-by: Nikolay Borisov <[email protected]>
Signed-off-by: Ethan Lien <[email protected]>
Reviewed-by: David Sterba <[email protected]>
Signed-off-by: David Sterba <[email protected]>
1 parent c629732 commit 3cd24c6

File tree

5 files changed

+25
-8
lines changed

5 files changed

+25
-8
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enum {
2929
BTRFS_INODE_IN_DELALLOC_LIST,
3030
BTRFS_INODE_READDIO_NEED_LOCK,
3131
BTRFS_INODE_HAS_PROPS,
32+
BTRFS_INODE_SNAPSHOT_FLUSH,
3233
};
3334

3435
/* in memory btrfs inode */

fs/btrfs/ctree.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3170,7 +3170,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
31703170
struct inode *inode, u64 new_size,
31713171
u32 min_type);
31723172

3173-
int btrfs_start_delalloc_inodes(struct btrfs_root *root);
3173+
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
31743174
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
31753175
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
31763176
unsigned int extra_bits,

fs/btrfs/extent_io.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3911,12 +3911,25 @@ static int extent_write_cache_pages(struct address_space *mapping,
39113911
range_whole = 1;
39123912
scanned = 1;
39133913
}
3914-
if (wbc->sync_mode == WB_SYNC_ALL)
3914+
3915+
/*
3916+
* We do the tagged writepage as long as the snapshot flush bit is set
3917+
* and we are the first one who do the filemap_flush() on this inode.
3918+
*
3919+
* The nr_to_write == LONG_MAX is needed to make sure other flushers do
3920+
* not race in and drop the bit.
3921+
*/
3922+
if (range_whole && wbc->nr_to_write == LONG_MAX &&
3923+
test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
3924+
&BTRFS_I(inode)->runtime_flags))
3925+
wbc->tagged_writepages = 1;
3926+
3927+
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
39153928
tag = PAGECACHE_TAG_TOWRITE;
39163929
else
39173930
tag = PAGECACHE_TAG_DIRTY;
39183931
retry:
3919-
if (wbc->sync_mode == WB_SYNC_ALL)
3932+
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
39203933
tag_pages_for_writeback(mapping, index, end);
39213934
done_index = index;
39223935
while (!done && !nr_to_write_done && (index <= end) &&

fs/btrfs/inode.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9961,7 +9961,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
99619961
* some fairly slow code that needs optimization. This walks the list
99629962
* of all the inodes with pending delalloc and forces them to disk.
99639963
*/
9964-
static int start_delalloc_inodes(struct btrfs_root *root, int nr)
9964+
static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
99659965
{
99669966
struct btrfs_inode *binode;
99679967
struct inode *inode;
@@ -9989,6 +9989,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
99899989
}
99909990
spin_unlock(&root->delalloc_lock);
99919991

9992+
if (snapshot)
9993+
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9994+
&binode->runtime_flags);
99929995
work = btrfs_alloc_delalloc_work(inode);
99939996
if (!work) {
99949997
iput(inode);
@@ -10022,15 +10025,15 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
1002210025
return ret;
1002310026
}
1002410027

10025-
int btrfs_start_delalloc_inodes(struct btrfs_root *root)
10028+
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
1002610029
{
1002710030
struct btrfs_fs_info *fs_info = root->fs_info;
1002810031
int ret;
1002910032

1003010033
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
1003110034
return -EROFS;
1003210035

10033-
ret = start_delalloc_inodes(root, -1);
10036+
ret = start_delalloc_inodes(root, -1, true);
1003410037
if (ret > 0)
1003510038
ret = 0;
1003610039
return ret;
@@ -10059,7 +10062,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
1005910062
&fs_info->delalloc_roots);
1006010063
spin_unlock(&fs_info->delalloc_root_lock);
1006110064

10062-
ret = start_delalloc_inodes(root, nr);
10065+
ret = start_delalloc_inodes(root, nr, false);
1006310066
btrfs_put_fs_root(root);
1006410067
if (ret < 0)
1006510068
goto out;

fs/btrfs/ioctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -788,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
788788
wait_event(root->subv_writers->wait,
789789
percpu_counter_sum(&root->subv_writers->counter) == 0);
790790

791-
ret = btrfs_start_delalloc_inodes(root);
791+
ret = btrfs_start_delalloc_snapshot(root);
792792
if (ret)
793793
goto dec_and_free;
794794

0 commit comments

Comments
 (0)