
Commit fcebe45

Josef Bacik authored and Chris Mason (masoncl) committed
Btrfs: rework qgroup accounting
Currently qgroups account for space by intercepting delayed ref updates to fs trees. They do this by adding sequence numbers to delayed ref updates so that they can figure out how the tree looked before the update and adjust the counters properly. The problem with this is that it does not allow delayed refs to be merged, so if, say, you are defragging an extent with 5k snapshots pointing to it, we will thrash the delayed ref lock because we need to go back and manually merge these things together.

Instead we want to process quota changes when we know they are going to happen, like when we first allocate an extent, when we free a reference for an extent, when we add new references, etc. This patch accomplishes this by only adding qgroup operations for real ref changes. We only modify the sequence number when we need to look up roots for bytenrs; this reduces the amount of churn on the sequence number and allows us to merge delayed refs as we add them most of the time. This patch encompasses a bunch of architectural changes:

1) qgroup ref operations: instead of tracking qgroup operations through the delayed refs, we simply add new ref operations whenever we notice that we need to, that is, when we've modified the refs themselves.

2) tree mod seq: we no longer have the separation of major/minor counters. This makes the sequence number handling much saner and lets us remove some locking that was needed to protect the counter.

3) delayed ref seq: we now read the tree mod seq number and use that as our sequence. This means each new delayed ref doesn't have its own unique sequence number; rather, whenever we go to look up backrefs we increment the sequence number so we can be sure to keep any new operations from screwing up our world view at that given point. This allows us to merge delayed refs at runtime.

With all of these changes the delayed ref code is a little saner and the qgroup accounting no longer goes negative in some cases like it did before. Thanks,

Signed-off-by: Josef Bacik <[email protected]>
Signed-off-by: Chris Mason <[email protected]>
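As a rough userspace illustration of change 2) above (a sketch only: C11 atomics stand in for the kernel's atomic64_t and spinlock, and the function names here are made up), the old scheme packed a major and a minor counter into one 64-bit value and needed a lock around the non-atomic read-modify-write of the major half, while the reworked scheme treats the whole value as a single monotonically increasing sequence, so one atomic increment is enough:

#include <stdatomic.h>
#include <stdint.h>

static atomic_uint_least64_t tree_mod_seq;	/* stand-in for fs_info->tree_mod_seq */

/* Old style (simplified): bump the upper 32 bits and zero the lower 32 bits.
 * The load/modify/store is not atomic as a whole, which is why the old code
 * held tree_mod_seq_lock around it. */
static uint64_t inc_tree_mod_seq_major_old(void)
{
	uint64_t seq = atomic_load(&tree_mod_seq);

	seq &= 0xffffffff00000000ULL;
	seq += 1ULL << 32;
	atomic_store(&tree_mod_seq, seq);
	return seq;
}

/* New style: one sequence space, one atomic increment, no lock needed
 * (mirrors the reworked btrfs_inc_tree_mod_seq() shown in the ctree.c diff). */
static uint64_t inc_tree_mod_seq_new(void)
{
	return atomic_fetch_add(&tree_mod_seq, 1) + 1;
}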
1 parent 5dca6ee commit fcebe45

File tree
12 files changed: +1044 −479 lines


fs/btrfs/backref.h

Lines changed: 2 additions & 2 deletions
@@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
-			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 time_seq, struct ulist **roots);
+			struct btrfs_fs_info *fs_info, u64 bytenr,
+			u64 time_seq, struct ulist **roots);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
 			struct extent_buffer *eb_in, u64 parent,

fs/btrfs/ctree.c

Lines changed: 5 additions & 40 deletions
@@ -356,43 +356,13 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 }
 
 /*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
- */
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
-	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
-	seq &= 0xffffffff00000000ull;
-	seq += 1ull << 32;
-	atomic64_set(&fs_info->tree_mod_seq, seq);
-	return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
+ * Pull a new tree mod seq number for our operation.
  */
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 {
 	return atomic64_inc_return(&fs_info->tree_mod_seq);
 }
 
-/*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
-	return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
 /*
  * This adds a new blocker to the tree mod log's blocker list if the @elem
  * passed does not already have a sequence number set. So when a caller expects
@@ -404,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem)
 {
-	u64 seq;
-
 	tree_mod_log_write_lock(fs_info);
 	spin_lock(&fs_info->tree_mod_seq_lock);
 	if (!elem->seq) {
-		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 	}
-	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 	tree_mod_log_write_unlock(fs_info);
 
-	return seq;
+	return elem->seq;
 }
 
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -489,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 
 	BUG_ON(!tm);
 
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 
 	tm_root = &fs_info->tree_mod_log;
 	new = &tm_root->rb_node;

fs/btrfs/ctree.h

Lines changed: 7 additions & 52 deletions
@@ -1648,7 +1648,10 @@ struct btrfs_fs_info {
 
 	/* holds configuration and tracking. Protected by qgroup_lock */
 	struct rb_root qgroup_tree;
+	struct rb_root qgroup_op_tree;
 	spinlock_t qgroup_lock;
+	spinlock_t qgroup_op_lock;
+	atomic_t qgroup_op_seq;
 
 	/*
 	 * used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -3300,17 +3303,17 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
 			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
 			 struct btrfs_key *ins, int is_data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int for_cow);
+		  struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int for_cow);
+		  struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
 				int level, int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset, int for_cow);
+		      u64 owner, u64 offset, int no_quota);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -3322,7 +3325,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset, int for_cow);
+			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
@@ -3410,7 +3413,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-
 int btrfs_start_nocow_write(struct btrfs_root *root);
 void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
@@ -3586,7 +3588,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem);
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			    struct seq_list *elem);
-u64 btrfs_tree_mod_seq_prev(u64 seq);
 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
@@ -4094,52 +4095,6 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 			 u64 start, int err);
 
-/* qgroup.c */
-struct qgroup_update {
-	struct list_head list;
-	struct btrfs_delayed_ref_node *node;
-	struct btrfs_delayed_extent_op *extent_op;
-};
-
-int btrfs_quota_enable(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info);
-int btrfs_quota_disable(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
-void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid,
-			char *name);
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid);
-int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info, u64 qgroupid,
-		       struct btrfs_qgroup_limit *limit);
-int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
-void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_delayed_ref_node *node,
-			    struct btrfs_delayed_extent_op *extent_op);
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info,
-			     struct btrfs_delayed_ref_node *node,
-			     struct btrfs_delayed_extent_op *extent_op);
-int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
-		      struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
-			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
-			 struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
-
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||

fs/btrfs/delayed-ref.c

Lines changed: 23 additions & 16 deletions
@@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 		return -1;
 	if (ref1->type > ref2->type)
 		return 1;
+	if (ref1->no_quota > ref2->no_quota)
+		return 1;
+	if (ref1->no_quota < ref2->no_quota)
+		return -1;
 	/* merging of sequenced refs is not allowed */
 	if (compare_seq) {
 		if (ref1->seq < ref2->seq)
@@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, int level,
-		     int action, int for_cow)
+		     int action, int no_quota)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
@@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
 
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
 	delayed_refs = &trans->transaction->delayed_refs;
 
 	/* first set the basic ref node struct up */
@@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-
-	if (need_ref_seq(for_cow, ref_root))
-		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-		     u64 offset, int action, int for_cow)
+		     u64 offset, int action, int no_quota)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
@@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	delayed_refs = &trans->transaction->delayed_refs;
 
+	if (is_fstree(ref_root))
+		seq = atomic64_read(&fs_info->tree_mod_seq);
+
 	/* first set the basic ref node struct up */
 	atomic_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
@@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-
-	if (need_ref_seq(for_cow, ref_root))
-		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow)
+			       int no_quota)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, level, action,
-			     for_cow);
+			     no_quota);
 	spin_unlock(&delayed_refs->lock);
-	if (need_ref_seq(for_cow, ref_root))
-		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }
@@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow)
+			       int no_quota)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		no_quota = 0;
+
 	BUG_ON(extent_op && !extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, owner, offset,
-			     action, for_cow);
+			     action, no_quota);
 	spin_unlock(&delayed_refs->lock);
-	if (need_ref_seq(for_cow, ref_root))
-		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }

fs/btrfs/delayed-ref.h

Lines changed: 3 additions & 21 deletions
@@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {
 
 	unsigned int action:8;
 	unsigned int type:8;
+	unsigned int no_quota:1;
 	/* is this node still in the rbtree? */
 	unsigned int is_head:1;
 	unsigned int in_tree:1;
@@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow);
+			       int no_quota);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
-			       int for_cow);
+			       int no_quota);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
@@ -230,25 +231,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq);
 
-/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
-	if (for_cow)
-		return 0;
-
-	if (rootid == BTRFS_FS_TREE_OBJECTID)
-		return 1;
-
-	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
-		return 1;
-
-	return 0;
-}
-
 /*
  * a node might live in a head or a regular ref, this lets you
 * test for the proper type to use.
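Taken together, the delayed-ref.c and delayed-ref.h hunks above drop need_ref_seq() in favour of a per-ref no_quota bit plus a read-only snapshot of tree_mod_seq. A minimal sketch of that gating logic, assuming simplified stand-in types and hypothetical helper names rather than the real kernel structures:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define FS_TREE_OBJECTID	  5ULL	/* illustrative values mirroring btrfs */
#define FIRST_FREE_OBJECTID	256ULL

struct fs_info_sketch {
	atomic_uint_least64_t tree_mod_seq;
	bool quota_enabled;
};

/* Same test the removed need_ref_seq()/the kernel's is_fstree() perform:
 * only fs-tree roots are candidates for quota accounting. */
static bool is_fstree_sketch(uint64_t rootid)
{
	return rootid == FS_TREE_OBJECTID ||
	       (int64_t)rootid >= (int64_t)FIRST_FREE_OBJECTID;
}

/*
 * Mirror of the new setup in the delayed ref paths: btrfs_add_delayed_*_ref()
 * clears the no_quota flag for non-fs-tree roots or when quotas are disabled,
 * and add_delayed_tree_ref()/add_delayed_data_ref() only read the current
 * tree mod seq for fs-tree roots instead of pulling a fresh one, so newly
 * added refs can still be merged.
 */
static uint64_t ref_seq_for_root(struct fs_info_sketch *fs, uint64_t ref_root,
				 int *no_quota)
{
	if (!is_fstree_sketch(ref_root) || !fs->quota_enabled)
		*no_quota = 0;

	if (is_fstree_sketch(ref_root))
		return atomic_load(&fs->tree_mod_seq);	/* snapshot, no increment */

	return 0;
}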
