Skip to content

Commit bc42bda

Browse files
Qu Wenruo and kdave
authored and committed
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG] For the following case, btrfs can underflow qgroup reserved space at an error path: (Page size 4K, function names without "btrfs_" prefix) Task A | Task B ---------------------------------------------------------------------- Buffered_write [0, 2K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) <<< | | 4K bytes reserved <<< | |- copy pages to page cache | | Buffered_write [2K, 4K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) | | Already reserved by A <<< | | 0 bytes reserved <<< | |- delalloc_reserve_metadata() | | And it *FAILED* (Maybe EDQUOT) | |- free_reserved_data_space() |- qgroup_free_data() Range aligned to page range [0, 4K) Freeing 4K (Special thanks to Chandan for the detailed report and analysis) [CAUSE] Above, Task B is freeing reserved data range [0, 4K), which was actually reserved by Task A. And at writeback time, the page dirtied by Task A will go through the writeback routine, which will free 4K of reserved data space at file extent insert time, causing the qgroup underflow. [FIX] For btrfs_qgroup_free_data(), add a @reserved parameter to only free data ranges reserved by a previous btrfs_qgroup_reserve_data(). So in the above case, Task B will try to free 0 bytes, so no underflow. Reported-by: Chandan Rajendra <[email protected]> Signed-off-by: Qu Wenruo <[email protected]> Reviewed-by: Chandan Rajendra <[email protected]> Tested-by: Chandan Rajendra <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 364ecf3 commit bc42bda

File tree

8 files changed

+117
-46
lines changed

8 files changed

+117
-46
lines changed

fs/btrfs/ctree.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2711,7 +2711,10 @@ enum btrfs_flush_state {
27112711
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
27122712
int btrfs_check_data_free_space(struct inode *inode,
27132713
struct extent_changeset **reserved, u64 start, u64 len);
2714-
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
2714+
void btrfs_free_reserved_data_space(struct inode *inode,
2715+
struct extent_changeset *reserved, u64 start, u64 len);
2716+
void btrfs_delalloc_release_space(struct inode *inode,
2717+
struct extent_changeset *reserved, u64 start, u64 len);
27152718
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
27162719
u64 len);
27172720
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2730,7 +2733,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
27302733
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
27312734
int btrfs_delalloc_reserve_space(struct inode *inode,
27322735
struct extent_changeset **reserved, u64 start, u64 len);
2733-
void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
27342736
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
27352737
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
27362738
unsigned short type);

fs/btrfs/extent-tree.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4389,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
43894389
* This one will handle the per-inode data rsv map for accurate reserved
43904390
* space framework.
43914391
*/
4392-
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4392+
void btrfs_free_reserved_data_space(struct inode *inode,
4393+
struct extent_changeset *reserved, u64 start, u64 len)
43934394
{
43944395
struct btrfs_root *root = BTRFS_I(inode)->root;
43954396

@@ -4399,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
43994400
start = round_down(start, root->fs_info->sectorsize);
44004401

44014402
btrfs_free_reserved_data_space_noquota(inode, start, len);
4402-
btrfs_qgroup_free_data(inode, start, len);
4403+
btrfs_qgroup_free_data(inode, reserved, start, len);
44034404
}
44044405

44054406
static void force_metadata_allocation(struct btrfs_fs_info *info)
@@ -6204,7 +6205,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
62046205
return ret;
62056206
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
62066207
if (ret < 0)
6207-
btrfs_free_reserved_data_space(inode, start, len);
6208+
btrfs_free_reserved_data_space(inode, *reserved, start, len);
62086209
return ret;
62096210
}
62106211

@@ -6223,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
62236224
* list if there are no delalloc bytes left.
62246225
* Also it will handle the qgroup reserved space.
62256226
*/
6226-
void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
6227+
void btrfs_delalloc_release_space(struct inode *inode,
6228+
struct extent_changeset *reserved, u64 start, u64 len)
62276229
{
62286230
btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
6229-
btrfs_free_reserved_data_space(inode, start, len);
6231+
btrfs_free_reserved_data_space(inode, reserved, start, len);
62306232
}
62316233

62326234
static int update_block_group(struct btrfs_trans_handle *trans,

fs/btrfs/file.c

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1660,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
16601660
reserve_bytes);
16611661
if (ret) {
16621662
if (!only_release_metadata)
1663-
btrfs_free_reserved_data_space(inode, pos,
1664-
write_bytes);
1663+
btrfs_free_reserved_data_space(inode,
1664+
data_reserved, pos,
1665+
write_bytes);
16651666
else
16661667
btrfs_end_write_no_snapshoting(root);
16671668
break;
@@ -1743,8 +1744,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
17431744
__pos = round_down(pos,
17441745
fs_info->sectorsize) +
17451746
(dirty_pages << PAGE_SHIFT);
1746-
btrfs_delalloc_release_space(inode, __pos,
1747-
release_bytes);
1747+
btrfs_delalloc_release_space(inode,
1748+
data_reserved, __pos,
1749+
release_bytes);
17481750
}
17491751
}
17501752

@@ -1799,9 +1801,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
17991801
btrfs_delalloc_release_metadata(BTRFS_I(inode),
18001802
release_bytes);
18011803
} else {
1802-
btrfs_delalloc_release_space(inode,
1803-
round_down(pos, fs_info->sectorsize),
1804-
release_bytes);
1804+
btrfs_delalloc_release_space(inode, data_reserved,
1805+
round_down(pos, fs_info->sectorsize),
1806+
release_bytes);
18051807
}
18061808
}
18071809

@@ -2918,8 +2920,8 @@ static long btrfs_fallocate(struct file *file, int mode,
29182920
* range, free reserved data space first, otherwise
29192921
* it'll result in false ENOSPC error.
29202922
*/
2921-
btrfs_free_reserved_data_space(inode, cur_offset,
2922-
last_byte - cur_offset);
2923+
btrfs_free_reserved_data_space(inode, data_reserved,
2924+
cur_offset, last_byte - cur_offset);
29232925
}
29242926
free_extent_map(em);
29252927
cur_offset = last_byte;
@@ -2938,8 +2940,9 @@ static long btrfs_fallocate(struct file *file, int mode,
29382940
range->len, i_blocksize(inode),
29392941
offset + len, &alloc_hint);
29402942
else
2941-
btrfs_free_reserved_data_space(inode, range->start,
2942-
range->len);
2943+
btrfs_free_reserved_data_space(inode,
2944+
data_reserved, range->start,
2945+
range->len);
29432946
list_del(&range->list);
29442947
kfree(range);
29452948
}
@@ -2977,8 +2980,8 @@ static long btrfs_fallocate(struct file *file, int mode,
29772980
inode_unlock(inode);
29782981
/* Let go of our reservation. */
29792982
if (ret != 0)
2980-
btrfs_free_reserved_data_space(inode, alloc_start,
2981-
alloc_end - cur_offset);
2983+
btrfs_free_reserved_data_space(inode, data_reserved,
2984+
alloc_start, alloc_end - cur_offset);
29822985
extent_changeset_free(data_reserved);
29832986
return ret;
29842987
}

fs/btrfs/inode.c

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
345345
* And at reserve time, it's always aligned to page size, so
346346
* just free one page here.
347347
*/
348-
btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
348+
btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
349349
btrfs_free_path(path);
350350
btrfs_end_transaction(trans);
351351
return ret;
@@ -2935,7 +2935,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
29352935
* space for NOCOW range.
29362936
* As NOCOW won't cause a new delayed ref, just free the space
29372937
*/
2938-
btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
2938+
btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
29392939
ordered_extent->len);
29402940
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
29412941
if (nolock)
@@ -4794,7 +4794,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
47944794
again:
47954795
page = find_or_create_page(mapping, index, mask);
47964796
if (!page) {
4797-
btrfs_delalloc_release_space(inode,
4797+
btrfs_delalloc_release_space(inode, data_reserved,
47984798
round_down(from, blocksize),
47994799
blocksize);
48004800
ret = -ENOMEM;
@@ -4866,7 +4866,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
48664866

48674867
out_unlock:
48684868
if (ret)
4869-
btrfs_delalloc_release_space(inode, block_start,
4869+
btrfs_delalloc_release_space(inode, data_reserved, block_start,
48704870
blocksize);
48714871
unlock_page(page);
48724872
put_page(page);
@@ -5266,7 +5266,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
52665266
* Note, end is the bytenr of last byte, so we need + 1 here.
52675267
*/
52685268
if (state->state & EXTENT_DELALLOC)
5269-
btrfs_qgroup_free_data(inode, start, end - start + 1);
5269+
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
52705270

52715271
clear_extent_bit(io_tree, start, end,
52725272
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8792,8 +8792,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
87928792
current->journal_info = NULL;
87938793
if (ret < 0 && ret != -EIOCBQUEUED) {
87948794
if (dio_data.reserve)
8795-
btrfs_delalloc_release_space(inode, offset,
8796-
dio_data.reserve);
8795+
btrfs_delalloc_release_space(inode, data_reserved,
8796+
offset, dio_data.reserve);
87978797
/*
87988798
* On error we might have left some ordered extents
87998799
* without submitting corresponding bios for them, so
@@ -8808,8 +8808,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
88088808
dio_data.unsubmitted_oe_range_start,
88098809
false);
88108810
} else if (ret >= 0 && (size_t)ret < count)
8811-
btrfs_delalloc_release_space(inode, offset,
8812-
count - (size_t)ret);
8811+
btrfs_delalloc_release_space(inode, data_reserved,
8812+
offset, count - (size_t)ret);
88138813
}
88148814
out:
88158815
if (wakeup)
@@ -9008,7 +9008,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
90089008
* free the entire extent.
90099009
*/
90109010
if (PageDirty(page))
9011-
btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
9011+
btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
90129012
if (!inode_evicting) {
90139013
clear_extent_bit(tree, page_start, page_end,
90149014
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -9130,8 +9130,8 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
91309130
spin_lock(&BTRFS_I(inode)->lock);
91319131
BTRFS_I(inode)->outstanding_extents++;
91329132
spin_unlock(&BTRFS_I(inode)->lock);
9133-
btrfs_delalloc_release_space(inode, page_start,
9134-
PAGE_SIZE - reserved_space);
9133+
btrfs_delalloc_release_space(inode, data_reserved,
9134+
page_start, PAGE_SIZE - reserved_space);
91359135
}
91369136
}
91379137

@@ -9187,7 +9187,8 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
91879187
}
91889188
unlock_page(page);
91899189
out:
9190-
btrfs_delalloc_release_space(inode, page_start, reserved_space);
9190+
btrfs_delalloc_release_space(inode, data_reserved, page_start,
9191+
reserved_space);
91919192
out_noreserve:
91929193
sb_end_pagefault(inode->i_sb);
91939194
extent_changeset_free(data_reserved);
@@ -10557,7 +10558,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
1055710558
btrfs_end_transaction(trans);
1055810559
}
1055910560
if (cur_offset < end)
10560-
btrfs_free_reserved_data_space(inode, cur_offset,
10561+
btrfs_free_reserved_data_space(inode, NULL, cur_offset,
1056110562
end - cur_offset + 1);
1056210563
return ret;
1056310564
}

fs/btrfs/ioctl.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,7 +1227,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
12271227
spin_lock(&BTRFS_I(inode)->lock);
12281228
BTRFS_I(inode)->outstanding_extents++;
12291229
spin_unlock(&BTRFS_I(inode)->lock);
1230-
btrfs_delalloc_release_space(inode,
1230+
btrfs_delalloc_release_space(inode, data_reserved,
12311231
start_index << PAGE_SHIFT,
12321232
(page_cnt - i_done) << PAGE_SHIFT);
12331233
}
@@ -1255,7 +1255,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
12551255
unlock_page(pages[i]);
12561256
put_page(pages[i]);
12571257
}
1258-
btrfs_delalloc_release_space(inode,
1258+
btrfs_delalloc_release_space(inode, data_reserved,
12591259
start_index << PAGE_SHIFT,
12601260
page_cnt << PAGE_SHIFT);
12611261
extent_changeset_free(data_reserved);

fs/btrfs/qgroup.c

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2892,13 +2892,72 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
28922892
return ret;
28932893
}
28942894

2895-
static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
2896-
int free)
2895+
/* Free ranges specified by @reserved, normally in error path */
2896+
static int qgroup_free_reserved_data(struct inode *inode,
2897+
struct extent_changeset *reserved, u64 start, u64 len)
2898+
{
2899+
struct btrfs_root *root = BTRFS_I(inode)->root;
2900+
struct ulist_node *unode;
2901+
struct ulist_iterator uiter;
2902+
struct extent_changeset changeset;
2903+
int freed = 0;
2904+
int ret;
2905+
2906+
extent_changeset_init(&changeset);
2907+
len = round_up(start + len, root->fs_info->sectorsize);
2908+
start = round_down(start, root->fs_info->sectorsize);
2909+
2910+
ULIST_ITER_INIT(&uiter);
2911+
while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
2912+
u64 range_start = unode->val;
2913+
/* unode->aux is the inclusive end */
2914+
u64 range_len = unode->aux - range_start + 1;
2915+
u64 free_start;
2916+
u64 free_len;
2917+
2918+
extent_changeset_release(&changeset);
2919+
2920+
/* Only free range in range [start, start + len) */
2921+
if (range_start >= start + len ||
2922+
range_start + range_len <= start)
2923+
continue;
2924+
free_start = max(range_start, start);
2925+
free_len = min(start + len, range_start + range_len) -
2926+
free_start;
2927+
/*
2928+
* TODO: To also modify reserved->ranges_reserved to reflect
2929+
* the modification.
2930+
*
2931+
* However as long as we free qgroup reserved according to
2932+
* EXTENT_QGROUP_RESERVED, we won't double free.
2933+
* So not need to rush.
2934+
*/
2935+
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
2936+
free_start, free_start + free_len - 1,
2937+
EXTENT_QGROUP_RESERVED, &changeset);
2938+
if (ret < 0)
2939+
goto out;
2940+
freed += changeset.bytes_changed;
2941+
}
2942+
btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed);
2943+
ret = freed;
2944+
out:
2945+
extent_changeset_release(&changeset);
2946+
return ret;
2947+
}
2948+
2949+
static int __btrfs_qgroup_release_data(struct inode *inode,
2950+
struct extent_changeset *reserved, u64 start, u64 len,
2951+
int free)
28972952
{
28982953
struct extent_changeset changeset;
28992954
int trace_op = QGROUP_RELEASE;
29002955
int ret;
29012956

2957+
/* In release case, we shouldn't have @reserved */
2958+
WARN_ON(!free && reserved);
2959+
if (free && reserved)
2960+
return qgroup_free_reserved_data(inode, reserved, start, len);
29022961
extent_changeset_init(&changeset);
29032962
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
29042963
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
@@ -2924,14 +2983,17 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
29242983
*
29252984
* Should be called when a range of pages get invalidated before reaching disk.
29262985
* Or for error cleanup case.
2986+
* if @reserved is given, only reserved range in [@start, @start + @len) will
2987+
* be freed.
29272988
*
29282989
* For data written to disk, use btrfs_qgroup_release_data().
29292990
*
29302991
* NOTE: This function may sleep for memory allocation.
29312992
*/
2932-
int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
2993+
int btrfs_qgroup_free_data(struct inode *inode,
2994+
struct extent_changeset *reserved, u64 start, u64 len)
29332995
{
2934-
return __btrfs_qgroup_release_data(inode, start, len, 1);
2996+
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
29352997
}
29362998

29372999
/*
@@ -2951,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
29513013
*/
29523014
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
29533015
{
2954-
return __btrfs_qgroup_release_data(inode, start, len, 0);
3016+
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
29553017
}
29563018

29573019
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,

fs/btrfs/qgroup.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
245245
int btrfs_qgroup_reserve_data(struct inode *inode,
246246
struct extent_changeset **reserved, u64 start, u64 len);
247247
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
248-
int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
248+
int btrfs_qgroup_free_data(struct inode *inode,
249+
struct extent_changeset *reserved, u64 start, u64 len);
249250

250251
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
251252
bool enforce);

fs/btrfs/relocation.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3114,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
31143114
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
31153115
num_bytes = end + 1 - start;
31163116
if (cur_offset < start)
3117-
btrfs_free_reserved_data_space(inode, cur_offset,
3118-
start - cur_offset);
3117+
btrfs_free_reserved_data_space(inode, data_reserved,
3118+
cur_offset, start - cur_offset);
31193119
ret = btrfs_prealloc_file_range(inode, 0, start,
31203120
num_bytes, num_bytes,
31213121
end + 1, &alloc_hint);
@@ -3126,8 +3126,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
31263126
nr++;
31273127
}
31283128
if (cur_offset < prealloc_end)
3129-
btrfs_free_reserved_data_space(inode, cur_offset,
3130-
prealloc_end + 1 - cur_offset);
3129+
btrfs_free_reserved_data_space(inode, data_reserved,
3130+
cur_offset, prealloc_end + 1 - cur_offset);
31313131
out:
31323132
inode_unlock(inode);
31333133
extent_changeset_free(data_reserved);

0 commit comments

Comments
 (0)