@@ -1213,6 +1213,35 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		if (em->generation < newer_than)
 			goto next;
 
+		/*
+		 * Our start offset might be in the middle of an existing extent
+		 * map, so take that into account.
+		 */
+		range_len = em->len - (cur - em->start);
+		/*
+		 * If this range of the extent map is already flagged for delalloc,
+		 * skip it, because:
+		 *
+		 * 1) We could deadlock later, when trying to reserve space for
+		 *    delalloc, because in case we can't immediately reserve space
+		 *    the flusher can start delalloc and wait for the respective
+		 *    ordered extents to complete. The deadlock would happen
+		 *    because we do the space reservation while holding the range
+		 *    locked, and starting writeback, or finishing an ordered
+		 *    extent, requires locking the range;
+		 *
+		 * 2) If there's delalloc there, it means there's dirty pages for
+		 *    which writeback has not started yet (we clean the delalloc
+		 *    flag when starting writeback and after creating an ordered
+		 *    extent). If we mark pages in an adjacent range for defrag,
+		 *    then we will have a larger contiguous range for delalloc,
+		 *    very likely resulting in a larger extent after writeback is
+		 *    triggered (except in a case of free space fragmentation).
+		 */
+		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+				   EXTENT_DELALLOC, 0, NULL))
+			goto next;
+
 		/*
 		 * For do_compress case, we want to compress all valid file
 		 * extents, thus no @extent_thresh or mergeable check.
@@ -1221,7 +1250,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 			goto add;
 
 		/* Skip too large extent */
-		if (em->len >= extent_thresh)
+		if (range_len >= extent_thresh)
 			goto next;
 
 		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
@@ -1442,9 +1471,11 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 	list_for_each_entry(entry, &target_list, list) {
 		u32 range_len = entry->len;
 
-		/* Reached the limit */
-		if (max_sectors && max_sectors == *sectors_defragged)
+		/* Reached or beyond the limit */
+		if (max_sectors && *sectors_defragged >= max_sectors) {
+			ret = 1;
 			break;
+		}
 
 		if (max_sectors)
 			range_len = min_t(u32, range_len,
@@ -1465,7 +1496,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 				      extent_thresh, newer_than, do_compress);
 		if (ret < 0)
 			break;
-		*sectors_defragged += range_len;
+		*sectors_defragged += range_len >>
+				      inode->root->fs_info->sectorsize_bits;
 	}
 out:
 	list_for_each_entry_safe(entry, tmp, &target_list, list) {
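To make the unit change explicit, here is a quick sketch of the new accounting, assuming a 4K sector size (so sectorsize_bits == 12); the values are illustrative only.

	/*
	 * Illustrative only, assuming sectorsize_bits == 12:
	 *
	 *   range_len = 128K (bytes)
	 *   128K >> 12 = 32 sectors added to *sectors_defragged
	 *
	 * The counter is now kept in sectors, which matches the comparison
	 * against @max_sectors earlier in the loop.
	 */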
@@ -1484,6 +1516,12 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
  * @newer_than:	   minimum transid to defrag
  * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
  *		   will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where next defrag should be started at.
+ * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+ * defragging all the range).
  */
 int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 		      struct btrfs_ioctl_defrag_range_args *range,
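The documented return semantics suggest a caller can resume a partial defrag from range->start. Below is a minimal sketch of such a caller; it is not part of this patch, and defrag_file_in_steps() is a hypothetical name.

	/* Hypothetical helper, not in the patch: defrag a file in bounded steps. */
	static int defrag_file_in_steps(struct inode *inode, struct file_ra_state *ra,
					struct btrfs_ioctl_defrag_range_args *range,
					u64 newer_than, unsigned long step_sectors)
	{
		int ret;

		do {
			/*
			 * <0: error.  >=0: number of sectors defragged, with
			 * range->start advanced to where the next call should
			 * resume.
			 */
			ret = btrfs_defrag_file(inode, ra, range, newer_than,
						step_sectors);
		} while (ret > 0 && range->start < i_size_read(inode));

		return ret < 0 ? ret : 0;
	}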
@@ -1499,6 +1537,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	int ret = 0;
 	u32 extent_thresh = range->extent_thresh;
+	pgoff_t start_index;
 
 	if (isize == 0)
 		return 0;
@@ -1518,12 +1557,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 
 	if (range->start + range->len > range->start) {
 		/* Got a specific range */
-		last_byte = min(isize, range->start + range->len) - 1;
+		last_byte = min(isize, range->start + range->len);
 	} else {
 		/* Defrag until file end */
-		last_byte = isize - 1;
+		last_byte = isize;
 	}
 
+	/* Align the range */
+	cur = round_down(range->start, fs_info->sectorsize);
+	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
 	/*
 	 * If we were not given a ra, allocate a readahead context. As
 	 * readahead is just an optimization, defrag will work without it so
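A worked example of the new alignment, assuming a 4K fs_info->sectorsize; the offsets are made up.

	/*
	 * Made-up offsets, 4K sectorsize assumed:
	 *
	 *   range->start = 6K, range->start + range->len = 70K
	 *   cur       = round_down(6K, 4K)    = 4K
	 *   last_byte = round_up(70K, 4K) - 1 = 72K - 1
	 *
	 * The range is expanded outward to sector boundaries, and last_byte
	 * stays inclusive now that the "- 1" was dropped from the two
	 * assignments above.
	 */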
@@ -1536,16 +1579,26 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 		file_ra_state_init(ra, inode->i_mapping);
 	}
 
-	/* Align the range */
-	cur = round_down(range->start, fs_info->sectorsize);
-	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+	/*
+	 * Make writeback start from the beginning of the range, so that the
+	 * defrag range can be written sequentially.
+	 */
+	start_index = cur >> PAGE_SHIFT;
+	if (start_index < inode->i_mapping->writeback_index)
+		inode->i_mapping->writeback_index = start_index;
 
 	while (cur < last_byte) {
+		const unsigned long prev_sectors_defragged = sectors_defragged;
 		u64 cluster_end;
 
 		/* The cluster size 256K should always be page aligned */
 		BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
 
+		if (btrfs_defrag_cancelled(fs_info)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		/* We want the cluster end at page boundary when possible */
 		cluster_end = (((cur >> PAGE_SHIFT) +
 			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
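For clarity, a worked example of the cluster end calculation in the context lines above, assuming 4K pages (PAGE_SHIFT == 12).

	/*
	 * Example with 4K pages:
	 *
	 *   cur = 9K  ->  cur >> PAGE_SHIFT = 2 (page index)
	 *   2 + (SZ_256K >> PAGE_SHIFT) = 2 + 64 = 66
	 *   (66 << PAGE_SHIFT) - 1 = 264K - 1  ->  cluster_end
	 *
	 * Each iteration therefore covers at most 256K worth of pages and
	 * ends just before a page boundary.
	 */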
@@ -1567,14 +1620,27 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 					       cluster_end + 1 - cur, extent_thresh,
 					       newer_than, do_compress,
 					       &sectors_defragged, max_to_defrag);
+
+		if (sectors_defragged > prev_sectors_defragged)
+			balance_dirty_pages_ratelimited(inode->i_mapping);
+
 		btrfs_inode_unlock(inode, 0);
 		if (ret < 0)
 			break;
 		cur = cluster_end + 1;
+		if (ret > 0) {
+			ret = 0;
+			break;
+		}
 	}
 
 	if (ra_allocated)
 		kfree(ra);
+	/*
+	 * Update range.start for autodefrag, this will indicate where to start
+	 * in next run.
+	 */
+	range->start = cur;
 	if (sectors_defragged) {
 		/*
 		 * We have defragged some sectors, for compression case they