
Commit 49d766f

Merge tag 'for-5.17-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "Several fixes for defragmentation that got broken in 5.16 after
  refactoring and added subpage support. The observed bugs are excessive
  IO or uninterruptible ioctl.

  All stable material"

* tag 'for-5.17-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: update writeback index when starting defrag
  btrfs: add back missing dirty page rate limiting to defrag
  btrfs: fix deadlock when reserving space during defrag
  btrfs: defrag: properly update range->start for autodefrag
  btrfs: defrag: fix wrong number of defragged sectors
  btrfs: allow defrag to be interruptible
  btrfs: fix too long loop when defragging a 1 byte file
2 parents: a08b41a + 27cdfde

1 file changed (+75, -9)

fs/btrfs/ioctl.c

Lines changed: 75 additions & 9 deletions
@@ -1213,6 +1213,35 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		if (em->generation < newer_than)
 			goto next;
 
+		/*
+		 * Our start offset might be in the middle of an existing extent
+		 * map, so take that into account.
+		 */
+		range_len = em->len - (cur - em->start);
+		/*
+		 * If this range of the extent map is already flagged for delalloc,
+		 * skip it, because:
+		 *
+		 * 1) We could deadlock later, when trying to reserve space for
+		 *    delalloc, because in case we can't immediately reserve space
+		 *    the flusher can start delalloc and wait for the respective
+		 *    ordered extents to complete. The deadlock would happen
+		 *    because we do the space reservation while holding the range
+		 *    locked, and starting writeback, or finishing an ordered
+		 *    extent, requires locking the range;
+		 *
+		 * 2) If there's delalloc there, it means there's dirty pages for
+		 *    which writeback has not started yet (we clean the delalloc
+		 *    flag when starting writeback and after creating an ordered
+		 *    extent). If we mark pages in an adjacent range for defrag,
+		 *    then we will have a larger contiguous range for delalloc,
+		 *    very likely resulting in a larger extent after writeback is
+		 *    triggered (except in a case of free space fragmentation).
+		 */
+		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+				   EXTENT_DELALLOC, 0, NULL))
+			goto next;
+
 		/*
 		 * For do_compress case, we want to compress all valid file
 		 * extents, thus no @extent_thresh or mergeable check.
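
An aside to make the arithmetic above concrete: a compilable toy model of the range_len computation, with hypothetical numbers (struct em_stub stands in for the two struct extent_map fields used here):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the struct extent_map fields used above. */
    struct em_stub {
            uint64_t start; /* file offset where the extent map begins */
            uint64_t len;   /* length of the extent map, in bytes */
    };

    int main(void)
    {
            struct em_stub em = { .start = 0, .len = 128 * 1024 };
            uint64_t cur = 64 * 1024; /* defrag cursor in the middle of the map */

            assert(cur >= em.start && cur < em.start + em.len);

            /* Same arithmetic as the patch: only the tail past @cur counts. */
            uint64_t range_len = em.len - (cur - em.start);
            printf("range_len = %llu\n", (unsigned long long)range_len); /* 65536 */
            return 0;
    }
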
@@ -1221,7 +1250,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 			goto add;
 
 		/* Skip too large extent */
-		if (em->len >= extent_thresh)
+		if (range_len >= extent_thresh)
 			goto next;
 
 		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
@@ -1442,9 +1471,11 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 	list_for_each_entry(entry, &target_list, list) {
 		u32 range_len = entry->len;
 
-		/* Reached the limit */
-		if (max_sectors && max_sectors == *sectors_defragged)
+		/* Reached or beyond the limit */
+		if (max_sectors && *sectors_defragged >= max_sectors) {
+			ret = 1;
 			break;
+		}
 
 		if (max_sectors)
 			range_len = min_t(u32, range_len,
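
The switch from "==" to ">=" matters because *sectors_defragged advances by a whole target's worth of sectors at a time, so it can step over an exact match. A minimal userspace sketch with hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical walk: each defrag target adds 64 sectors at once,
             * so the counter goes 0 -> 64 -> 128 and never equals 100 exactly.
             * The old "== max_sectors" test misses the limit; ">=" catches it. */
            unsigned long sectors_defragged = 0;
            const unsigned long max_sectors = 100;

            for (int target = 0; target < 16; target++) {
                    if (max_sectors && sectors_defragged >= max_sectors) {
                            printf("limit reached at %lu sectors\n", sectors_defragged);
                            break; /* the patch also sets ret = 1 here */
                    }
                    sectors_defragged += 64;
            }
            return 0;
    }
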
@@ -1465,7 +1496,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 					      extent_thresh, newer_than, do_compress);
 		if (ret < 0)
 			break;
-		*sectors_defragged += range_len;
+		*sectors_defragged += range_len >>
+				      inode->root->fs_info->sectorsize_bits;
 	}
 out:
 	list_for_each_entry_safe(entry, tmp, &target_list, list) {
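
This is the change from "btrfs: defrag: fix wrong number of defragged sectors": range_len is in bytes, so adding it directly overcounted the sector total by a factor of the sector size. A hedged worked example, assuming 4K sectors (so sectorsize_bits == 12):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical 4K sectors, so sectorsize_bits == 12. */
            const uint32_t sectorsize_bits = 12;
            const uint32_t range_len = 256 * 1024; /* one 256K cluster, in bytes */

            /* The old code added range_len (bytes) straight into a sector
             * counter, overcounting 4096x; the shift converts bytes to
             * sectors first. */
            printf("sectors = %u\n", range_len >> sectorsize_bits); /* 64 */
            return 0;
    }
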
@@ -1484,6 +1516,12 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 * @newer_than:    minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where next defrag should be started at.
+ * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+ * defragging all the range).
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
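
A toy model of the return contract documented above (names and numbers are hypothetical, not kernel code): a caller such as autodefrag can resume from the updated start offset on its next pass:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of the contract: defrag at most @max_sectors starting at
     * *start, return sectors done (>= 0) and advance *start so the next
     * pass can resume. Purely illustrative. */
    static int defrag_file_stub(uint64_t *start, uint64_t isize, uint64_t max_sectors)
    {
            const uint64_t sectorsize = 4096;
            uint64_t done = 0;

            while (*start < isize && done < max_sectors) {
                    *start += sectorsize; /* pretend one sector was defragged */
                    done++;
            }
            return (int)done;
    }

    int main(void)
    {
            uint64_t start = 0;
            const uint64_t isize = 64 * 4096;

            /* Two limited passes, resuming from the updated offset, the way
             * autodefrag can pick up where the previous run stopped. */
            for (int pass = 1; pass <= 2; pass++) {
                    int ret = defrag_file_stub(&start, isize, 16);
                    printf("pass %d: %d sectors, resume at %llu\n",
                           pass, ret, (unsigned long long)start);
            }
            return 0;
    }
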
@@ -1499,6 +1537,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	int ret = 0;
 	u32 extent_thresh = range->extent_thresh;
+	pgoff_t start_index;
 
 	if (isize == 0)
 		return 0;
@@ -1518,12 +1557,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 
 	if (range->start + range->len > range->start) {
 		/* Got a specific range */
-		last_byte = min(isize, range->start + range->len) - 1;
+		last_byte = min(isize, range->start + range->len);
 	} else {
 		/* Defrag until file end */
-		last_byte = isize - 1;
+		last_byte = isize;
 	}
 
+	/* Align the range */
+	cur = round_down(range->start, fs_info->sectorsize);
+	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
 	/*
 	 * If we were not given a ra, allocate a readahead context. As
 	 * readahead is just an optimization, defrag will work without it so
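
This hunk pairs with "btrfs: fix too long loop when defragging a 1 byte file". A compilable sketch of why the order of the rounding and the trailing "- 1" matters; round_down/round_up are simplified re-definitions of the kernel macros, for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified re-definitions of the kernel helpers, for illustration. */
    #define round_down(x, y) ((x) & ~((uint64_t)(y) - 1))
    #define round_up(x, y)   (((x) + (y) - 1) & ~((uint64_t)(y) - 1))

    int main(void)
    {
            const uint64_t sectorsize = 4096;
            const uint64_t isize = 1; /* the 1 byte file from the fix's subject */

            /* Old code subtracted 1 before aligning: round_up(0, 4096) is 0,
             * so the trailing "- 1" underflows to U64_MAX and the defrag
             * loop runs practically forever. */
            uint64_t old_last = round_up(isize - 1, sectorsize) - 1;

            /* New code aligns the exclusive end first, then subtracts 1. */
            uint64_t new_last = round_up(isize, sectorsize) - 1;

            printf("old last_byte = %llu\n", (unsigned long long)old_last);
            printf("new last_byte = %llu\n", (unsigned long long)new_last); /* 4095 */
            return 0;
    }
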
@@ -1536,16 +1579,26 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 			file_ra_state_init(ra, inode->i_mapping);
 	}
 
-	/* Align the range */
-	cur = round_down(range->start, fs_info->sectorsize);
-	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+	/*
+	 * Make writeback start from the beginning of the range, so that the
+	 * defrag range can be written sequentially.
+	 */
+	start_index = cur >> PAGE_SHIFT;
+	if (start_index < inode->i_mapping->writeback_index)
+		inode->i_mapping->writeback_index = start_index;
 
 	while (cur < last_byte) {
+		const unsigned long prev_sectors_defragged = sectors_defragged;
 		u64 cluster_end;
 
 		/* The cluster size 256K should always be page aligned */
 		BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
 
+		if (btrfs_defrag_cancelled(fs_info)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		/* We want the cluster end at page boundary when possible */
 		cluster_end = (((cur >> PAGE_SHIFT) +
 			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
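
btrfs_defrag_cancelled() is not part of this diff; in the kernel sources it amounts to a pending-signal check, which is what lets the formerly uninterruptible ioctl stop on e.g. Ctrl-C ("btrfs: allow defrag to be interruptible"). A userspace analogue of the same pattern, with hypothetical names:

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Userspace analogue of the check added above: a long batch loop that
     * polls for a pending signal each iteration and bails out early
     * instead of running uninterruptibly (the kernel returns -EAGAIN). */
    static volatile sig_atomic_t cancelled;

    static void on_sigint(int sig)
    {
            (void)sig;
            cancelled = 1;
    }

    int main(void)
    {
            signal(SIGINT, on_sigint);

            for (long cluster = 0; cluster < 1000000; cluster++) {
                    if (cancelled) {
                            fprintf(stderr, "interrupted, stopping early\n");
                            return 1;
                    }
                    usleep(1000); /* stand-in for defragging one 256K cluster */
            }
            return 0;
    }
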
@@ -1567,14 +1620,27 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 				      cluster_end + 1 - cur, extent_thresh,
 				      newer_than, do_compress,
 				      &sectors_defragged, max_to_defrag);
+
+		if (sectors_defragged > prev_sectors_defragged)
+			balance_dirty_pages_ratelimited(inode->i_mapping);
+
 		btrfs_inode_unlock(inode, 0);
 		if (ret < 0)
 			break;
 		cur = cluster_end + 1;
+		if (ret > 0) {
+			ret = 0;
+			break;
+		}
 	}
 
 	if (ra_allocated)
 		kfree(ra);
+	/*
+	 * Update range.start for autodefrag, this will indicate where to start
+	 * in next run.
+	 */
+	range->start = cur;
 	if (sectors_defragged) {
 		/*
 		 * We have defragged some sectors, for compression case they
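
For completeness, the fixed path can be exercised from userspace through the defrag range ioctl; a minimal sketch (the file path and extent threshold are placeholders, error handling trimmed):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/btrfs.h>

    int main(void)
    {
            /* Placeholder path: any file on a mounted btrfs filesystem. */
            int fd = open("/mnt/btrfs/somefile", O_RDWR);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            struct btrfs_ioctl_defrag_range_args range;
            memset(&range, 0, sizeof(range));
            range.start = 0;
            range.len = (uint64_t)-1;         /* whole file */
            range.extent_thresh = 256 * 1024; /* skip extents already >= 256K */

            /* With this series applied, the call can be interrupted by a
             * signal and the sector accounting honors the length limit. */
            if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
                    perror("BTRFS_IOC_DEFRAG_RANGE");

            close(fd);
            return 0;
    }
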
