Skip to content

Commit a7e3b97

Browse files
committed
Btrfs: fix reported number of inode blocks
Currently, when there are buffered writes that were not yet flushed and they fall within allocated ranges of the file (that is, not in holes or beyond eof, assuming there are no prealloc extents beyond eof), btrfs simply reports an incorrect number of used blocks through the stat(2) system call (or any of its variants), regardless of mount options or inode flags (compress, compress-force, nodatacow). This is because the number of used blocks that is reported is based on the current number of bytes in the vfs inode plus the number of delalloc bytes in the btrfs inode. The latter covers bytes that fall both within allocated regions of the file and within holes. Example scenarios where the number of reported blocks is wrong while the buffered writes are not flushed: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt/sdc $ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec) $ sync $ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec) # The following should have reported 64K... $ du -h /mnt/sdc/foo1 128K /mnt/sdc/foo1 $ sync # After flushing the buffered write, it now reports the correct value. $ du -h /mnt/sdc/foo1 64K /mnt/sdc/foo1 $ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec) $ sync $ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2 wrote 65536/65536 bytes at offset 65536 64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec) # The following should have reported 128K... $ du -h /mnt/sdc/foo2 192K /mnt/sdc/foo2 $ sync # After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2 128K /mnt/sdc/foo2 So the number of used file blocks is simply incorrect, unlike in other filesystems such as ext4 and xfs for example, but only while the buffered writes are not flushed. Fix this by tracking the number of delalloc bytes that fall within holes and beyond eof of a file, and use this new counter instead when reporting the number of used blocks for an inode. Another, different problem that exists is that the delalloc bytes counter is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from the respective range in the inode's iotree) and the vfs inode's bytes counter is only incremented when writeback finishes (through insert_reserved_file_extent()). Therefore, while writeback is ongoing we simply report a wrong number of blocks used by an inode if the write operation covers a range previously unallocated. While this change does not fix this problem, it does minimize it a lot by shortening that time window, as the new delalloc bytes counter (new_delalloc_bytes) is only decremented when writeback finishes, right before updating the vfs inode's bytes counter. Fully fixing this second problem is not trivial and will be addressed later by a different patch. Signed-off-by: Filipe Manana <[email protected]>
1 parent e1cbfd7 commit a7e3b97

File tree

4 files changed

+119
-18
lines changed

4 files changed

+119
-18
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,13 @@ struct btrfs_inode {
124124
*/
125125
u64 delalloc_bytes;
126126

127+
/*
128+
* Total number of bytes pending delalloc that fall within a file
129+
* range that is either a hole or beyond EOF (and no prealloc extent
130+
* exists in the range). This is always <= delalloc_bytes.
131+
*/
132+
u64 new_delalloc_bytes;
133+
127134
/*
128135
* total number of bytes pending defrag, used by stat to check whether
129136
* it needs COW.

fs/btrfs/extent_io.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#define EXTENT_NORESERVE (1U << 15)
2222
#define EXTENT_QGROUP_RESERVED (1U << 16)
2323
#define EXTENT_CLEAR_DATA_RESV (1U << 17)
24+
#define EXTENT_DELALLOC_NEW (1U << 18)
2425
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
2526
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
2627
EXTENT_CLEAR_DATA_RESV)

fs/btrfs/file.c

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1404,6 +1404,47 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
14041404

14051405
}
14061406

1407+
/*
 * Flag the parts of the file range [start, start + len - 1] that are
 * currently holes with EXTENT_DELALLOC_NEW in the inode's io_tree.
 *
 * The range is walked one extent map at a time (as returned by
 * btrfs_get_extent()). For every map whose block_start is EXTENT_MAP_HOLE,
 * the portion overlapping the remaining search range gets the
 * EXTENT_DELALLOC_NEW bit set; maps of any other kind are skipped.
 *
 * @inode:        btrfs inode whose io_tree is updated.
 * @start:        first byte of the range to examine.
 * @len:          length of the range in bytes.
 * @cached_state: passed through unchanged to set_extent_bit().
 *
 * Returns 0 on success, the error encoded in the pointer returned by
 * btrfs_get_extent(), or a non-zero value returned by set_extent_bit().
 */
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
1408+
const u64 start,
1409+
const u64 len,
1410+
struct extent_state **cached_state)
1411+
{
1412+
u64 search_start = start;
1413+
/* end is inclusive: the last byte of the range. */
const u64 end = start + len - 1;
1414+
1415+
/*
 * NOTE(review): with an inclusive end this stops once only the final
 * byte would remain; presumably callers pass sector-aligned ranges so
 * that case never arises - confirm against the caller.
 */
while (search_start < end) {
1416+
const u64 search_len = end - search_start + 1;
1417+
struct extent_map *em;
1418+
u64 em_len;
1419+
int ret = 0;
1420+
1421+
em = btrfs_get_extent(inode, NULL, 0, search_start,
1422+
search_len, 0);
1423+
if (IS_ERR(em))
1424+
return PTR_ERR(em);
1425+
1426+
/* Only holes are flagged; anything else is skipped over. */
if (em->block_start != EXTENT_MAP_HOLE)
1427+
goto next;
1428+
1429+
/*
 * Clamp the hole to the part that overlaps the remaining search
 * range: drop what lies before search_start and cap at search_len.
 */
em_len = em->len;
1430+
if (em->start < search_start)
1431+
em_len -= search_start - em->start;
1432+
if (em_len > search_len)
1433+
em_len = search_len;
1434+
1435+
ret = set_extent_bit(&inode->io_tree, search_start,
1436+
search_start + em_len - 1,
1437+
EXTENT_DELALLOC_NEW,
1438+
NULL, cached_state, GFP_NOFS);
1439+
next:
1440+
/*
 * Continue right after this extent map, and release the map before
 * checking ret so it is released on the error path as well.
 */
search_start = extent_map_end(em);
1441+
free_extent_map(em);
1442+
if (ret)
1443+
return ret;
1444+
}
1445+
return 0;
1446+
}
1447+
14071448
/*
14081449
* This function locks the extent and properly waits for data=ordered extents
14091450
* to finish before allowing the pages to be modified if need.
@@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
14321473
+ round_up(pos + write_bytes - start_pos,
14331474
fs_info->sectorsize) - 1;
14341475

1435-
if (start_pos < inode->vfs_inode.i_size) {
1476+
if (start_pos < inode->vfs_inode.i_size ||
1477+
(inode->flags & BTRFS_INODE_PREALLOC)) {
14361478
struct btrfs_ordered_extent *ordered;
1479+
unsigned int clear_bits;
1480+
14371481
lock_extent_bits(&inode->io_tree, start_pos, last_pos,
14381482
cached_state);
14391483
ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
14541498
}
14551499
if (ordered)
14561500
btrfs_put_ordered_extent(ordered);
1457-
1501+
ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
1502+
last_pos - start_pos + 1,
1503+
cached_state);
1504+
clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
1505+
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
1506+
if (ret)
1507+
clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
14581508
clear_extent_bit(&inode->io_tree, start_pos,
1459-
last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
1460-
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1461-
0, 0, cached_state, GFP_NOFS);
1509+
last_pos, clear_bits,
1510+
(clear_bits & EXTENT_LOCKED) ? 1 : 0,
1511+
0, cached_state, GFP_NOFS);
1512+
if (ret)
1513+
return ret;
14621514
*lockstart = start_pos;
14631515
*lockend = last_pos;
14641516
ret = 1;

fs/btrfs/inode.c

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ static noinline void compress_file_range(struct inode *inode,
572572
}
573573
if (ret <= 0) {
574574
unsigned long clear_flags = EXTENT_DELALLOC |
575-
EXTENT_DEFRAG;
575+
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
576576
unsigned long page_error_op;
577577

578578
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
@@ -879,6 +879,7 @@ static noinline void submit_compressed_extents(struct inode *inode,
879879
async_extent->start +
880880
async_extent->ram_size - 1,
881881
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
882+
EXTENT_DELALLOC_NEW |
882883
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
883884
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
884885
PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
@@ -974,6 +975,7 @@ static noinline int cow_file_range(struct inode *inode,
974975
extent_clear_unlock_delalloc(inode, start, end,
975976
delalloc_end, NULL,
976977
EXTENT_LOCKED | EXTENT_DELALLOC |
978+
EXTENT_DELALLOC_NEW |
977979
EXTENT_DEFRAG, PAGE_UNLOCK |
978980
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
979981
PAGE_END_WRITEBACK);
@@ -1086,8 +1088,8 @@ static noinline int cow_file_range(struct inode *inode,
10861088
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10871089
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10881090
out_unlock:
1089-
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG |
1090-
EXTENT_CLEAR_META_RESV;
1091+
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1092+
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
10911093
page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
10921094
PAGE_END_WRITEBACK;
10931095
/*
@@ -1775,6 +1777,14 @@ static void btrfs_set_bit_hook(struct inode *inode,
17751777
btrfs_add_delalloc_inodes(root, inode);
17761778
spin_unlock(&BTRFS_I(inode)->lock);
17771779
}
1780+
1781+
if (!(state->state & EXTENT_DELALLOC_NEW) &&
1782+
(*bits & EXTENT_DELALLOC_NEW)) {
1783+
spin_lock(&BTRFS_I(inode)->lock);
1784+
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1785+
state->start;
1786+
spin_unlock(&BTRFS_I(inode)->lock);
1787+
}
17781788
}
17791789

17801790
/*
@@ -1840,6 +1850,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
18401850
btrfs_del_delalloc_inode(root, inode);
18411851
spin_unlock(&inode->lock);
18421852
}
1853+
1854+
if ((state->state & EXTENT_DELALLOC_NEW) &&
1855+
(*bits & EXTENT_DELALLOC_NEW)) {
1856+
spin_lock(&inode->lock);
1857+
ASSERT(inode->new_delalloc_bytes >= len);
1858+
inode->new_delalloc_bytes -= len;
1859+
spin_unlock(&inode->lock);
1860+
}
18431861
}
18441862

18451863
/*
@@ -2872,6 +2890,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
28722890
u64 logical_len = ordered_extent->len;
28732891
bool nolock;
28742892
bool truncated = false;
2893+
bool range_locked = false;
2894+
bool clear_new_delalloc_bytes = false;
2895+
2896+
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2897+
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2898+
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2899+
clear_new_delalloc_bytes = true;
28752900

28762901
nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
28772902

@@ -2920,6 +2945,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
29202945
goto out;
29212946
}
29222947

2948+
range_locked = true;
29232949
lock_extent_bits(io_tree, ordered_extent->file_offset,
29242950
ordered_extent->file_offset + ordered_extent->len - 1,
29252951
&cached_state);
@@ -2945,7 +2971,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
29452971
if (IS_ERR(trans)) {
29462972
ret = PTR_ERR(trans);
29472973
trans = NULL;
2948-
goto out_unlock;
2974+
goto out;
29492975
}
29502976

29512977
trans->block_rsv = &fs_info->delalloc_block_rsv;
@@ -2977,7 +3003,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
29773003
trans->transid);
29783004
if (ret < 0) {
29793005
btrfs_abort_transaction(trans, ret);
2980-
goto out_unlock;
3006+
goto out;
29813007
}
29823008

29833009
add_pending_csums(trans, inode, &ordered_extent->list);
@@ -2986,14 +3012,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
29863012
ret = btrfs_update_inode_fallback(trans, root, inode);
29873013
if (ret) { /* -ENOMEM or corruption */
29883014
btrfs_abort_transaction(trans, ret);
2989-
goto out_unlock;
3015+
goto out;
29903016
}
29913017
ret = 0;
2992-
out_unlock:
2993-
unlock_extent_cached(io_tree, ordered_extent->file_offset,
2994-
ordered_extent->file_offset +
2995-
ordered_extent->len - 1, &cached_state, GFP_NOFS);
29963018
out:
3019+
if (range_locked || clear_new_delalloc_bytes) {
3020+
unsigned int clear_bits = 0;
3021+
3022+
if (range_locked)
3023+
clear_bits |= EXTENT_LOCKED;
3024+
if (clear_new_delalloc_bytes)
3025+
clear_bits |= EXTENT_DELALLOC_NEW;
3026+
clear_extent_bit(&BTRFS_I(inode)->io_tree,
3027+
ordered_extent->file_offset,
3028+
ordered_extent->file_offset +
3029+
ordered_extent->len - 1,
3030+
clear_bits,
3031+
(clear_bits & EXTENT_LOCKED) ? 1 : 0,
3032+
0, &cached_state, GFP_NOFS);
3033+
}
3034+
29973035
if (root != fs_info->tree_root)
29983036
btrfs_delalloc_release_metadata(BTRFS_I(inode),
29993037
ordered_extent->len);
@@ -8906,6 +8944,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
89068944
if (!inode_evicting)
89078945
clear_extent_bit(tree, start, end,
89088946
EXTENT_DIRTY | EXTENT_DELALLOC |
8947+
EXTENT_DELALLOC_NEW |
89098948
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
89108949
EXTENT_DEFRAG, 1, 0, &cached_state,
89118950
GFP_NOFS);
@@ -8963,8 +9002,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
89639002
if (!inode_evicting) {
89649003
clear_extent_bit(tree, page_start, page_end,
89659004
EXTENT_LOCKED | EXTENT_DIRTY |
8966-
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8967-
EXTENT_DEFRAG, 1, 1,
9005+
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
9006+
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
89689007
&cached_state, GFP_NOFS);
89699008

89709009
__btrfs_releasepage(page, GFP_NOFS);
@@ -9335,6 +9374,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
93359374
ei->last_sub_trans = 0;
93369375
ei->logged_trans = 0;
93379376
ei->delalloc_bytes = 0;
9377+
ei->new_delalloc_bytes = 0;
93389378
ei->defrag_bytes = 0;
93399379
ei->disk_i_size = 0;
93409380
ei->flags = 0;
@@ -9400,6 +9440,7 @@ void btrfs_destroy_inode(struct inode *inode)
94009440
WARN_ON(BTRFS_I(inode)->outstanding_extents);
94019441
WARN_ON(BTRFS_I(inode)->reserved_extents);
94029442
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9443+
WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
94039444
WARN_ON(BTRFS_I(inode)->csum_bytes);
94049445
WARN_ON(BTRFS_I(inode)->defrag_bytes);
94059446

@@ -9523,7 +9564,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
95239564
stat->dev = BTRFS_I(inode)->root->anon_dev;
95249565

95259566
spin_lock(&BTRFS_I(inode)->lock);
9526-
delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
9567+
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
95279568
spin_unlock(&BTRFS_I(inode)->lock);
95289569
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
95299570
ALIGN(delalloc_bytes, blocksize)) >> 9;

0 commit comments

Comments
 (0)