Commit 1cee528

fdmanana authored and kdave committed
btrfs: track delayed ref heads in an xarray
Currently we use a red black tree (rb-tree) to track the delayed ref
heads (in struct btrfs_delayed_ref_root::href_root). This however is
not very efficient when the number of delayed ref heads is large (and
it's very common to be at least in the order of thousands) since
rb-trees are binary trees. For example for 10K delayed ref heads, the
tree has a depth of 13. Besides that, inserting into the tree requires
navigating through it and pulling useless cache lines in the process
since the red black tree nodes are embedded within the delayed ref head
structure - on the other hand, by being embedded, it requires no extra
memory allocations.

We can improve this by using an xarray instead which has a much higher
branching factor than a red black tree (binary balanced tree) and is
more cache friendly and behaves like a resizable array, with a much
better search and insertion complexity than a red black tree. This only
has one small disadvantage which is that insertion will sometimes
require allocating memory for the xarray - which may fail (not that
often since it uses a kmem_cache) - but on the other hand we can reduce
the delayed ref head structure size by 24 bytes (from 152 down to 128
bytes) after removing the embedded red black tree node, meaning that we
can now fit 32 delayed ref heads per 4K page instead of 26, and that
gain compensates for the occasional memory allocations needed for the
xarray nodes. We also end up using only 2 cache lines instead of 3 per
delayed ref head.

Running the following fs_mark test showed some improvements:

$ cat test.sh
#!/bin/bash

DEV=/dev/nullb0
MNT=/mnt/nullb0
MOUNT_OPTIONS="-o ssd"
FILES=100000
THREADS=$(nproc --all)

echo "performance" | \
    tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

mkfs.btrfs -f $DEV
mount $MOUNT_OPTIONS $DEV $MNT

OPTS="-S 0 -L 5 -n $FILES -s 0 -t $THREADS -k"
for ((i = 1; i <= $THREADS; i++)); do
    OPTS="$OPTS -d $MNT/d$i"
done

fs_mark $OPTS

umount $MNT

Before this patch:

FSUse%        Count         Size    Files/sec     App Overhead
    10      1200000            0     171845.7         12253839
    16      2400000            0     230898.7         12308254
    23      3600000            0     212292.9         12467768
    30      4800000            0     195737.8         12627554
    46      6000000            0     171055.2         12783329

After this patch:

FSUse%        Count         Size    Files/sec     App Overhead
    10      1200000            0     173835.0         12246131
    16      2400000            0     233537.8         12271746
    23      3600000            0     220398.7         12307737
    30      4800000            0     204483.6         12392318
    40      6000000            0     182923.3         12771843

Reviewed-by: Boris Burkov <[email protected]>
Reviewed-by: Qu Wenruo <[email protected]>
Signed-off-by: Filipe Manana <[email protected]>
Reviewed-by: David Sterba <[email protected]>
Signed-off-by: David Sterba <[email protected]>
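The change relies on a reserve-then-store pattern: the xarray slot for a head's index (bytenr >> sectorsize_bits) is reserved with xa_reserve() and GFP_NOFS before the delayed refs spinlock is taken, so the later xa_store() under the lock can use GFP_ATOMIC and does not need to allocate. Below is a minimal sketch of that pattern with illustrative names (demo_head, ref_heads and demo_track_head are made up for this example); it is not the btrfs code itself, which appears in the fs/btrfs/delayed-ref.c diff further down.

/* Minimal sketch of the reserve-then-store pattern (illustrative names only). */
#include <linux/xarray.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_head {
	u64 bytenr;
	bool tracked;
};

static DEFINE_XARRAY(ref_heads);	/* indexed by bytenr >> sectorsize_bits */
static DEFINE_SPINLOCK(ref_lock);	/* stands in for delayed_refs->lock */

static int demo_track_head(struct demo_head *head, u32 sectorsize_bits)
{
	const unsigned long index = head->bytenr >> sectorsize_bits;
	void *old;
	int ret;

	/* Reserve the slot while sleeping allocations are still allowed. */
	ret = xa_reserve(&ref_heads, index, GFP_NOFS);
	if (ret)
		return ret;

	spin_lock(&ref_lock);
	if (xa_load(&ref_heads, index)) {
		/* Another head already tracks this bytenr: drop our reservation. */
		spin_unlock(&ref_lock);
		xa_release(&ref_heads, index);
		return -EEXIST;
	}
	/* The slot was reserved above, so this store needs no allocation. */
	old = xa_store(&ref_heads, index, head, GFP_ATOMIC);
	WARN_ON(xa_is_err(old));
	head->tracked = true;
	spin_unlock(&ref_lock);
	return 0;
}

Pre-reserving keeps the sleeping allocation outside the critical section while still letting concurrent inserters detect an existing head under the lock, which is why the patch can drop a reservation with xa_release() whenever the head turns out to already exist or an error path is taken.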
1 parent b2ca1eb commit 1cee528

File tree

4 files changed, +106 -119 lines changed


fs/btrfs/delayed-ref.c

Lines changed: 88 additions & 104 deletions
@@ -314,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
 	return 0;
 }
 
-/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
-						   struct rb_node *node)
-{
-	struct rb_node **p = &root->rb_root.rb_node;
-	struct rb_node *parent_node = NULL;
-	struct btrfs_delayed_ref_head *entry;
-	struct btrfs_delayed_ref_head *ins;
-	u64 bytenr;
-	bool leftmost = true;
-
-	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
-	bytenr = ins->bytenr;
-	while (*p) {
-		parent_node = *p;
-		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
-				 href_node);
-
-		if (bytenr < entry->bytenr) {
-			p = &(*p)->rb_left;
-		} else if (bytenr > entry->bytenr) {
-			p = &(*p)->rb_right;
-			leftmost = false;
-		} else {
-			return entry;
-		}
-	}
-
-	rb_link_node(node, parent_node, p);
-	rb_insert_color_cached(node, root, leftmost);
-	return NULL;
-}
-
 static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
 		struct btrfs_delayed_ref_node *ins)
 {
@@ -381,18 +348,11 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
 static struct btrfs_delayed_ref_head *find_first_ref_head(
 		struct btrfs_delayed_ref_root *dr)
 {
-	struct rb_node *n;
-	struct btrfs_delayed_ref_head *entry;
+	unsigned long from = 0;
 
 	lockdep_assert_held(&dr->lock);
 
-	n = rb_first_cached(&dr->href_root);
-	if (!n)
-		return NULL;
-
-	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
-	return entry;
+	return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT);
 }
 
 /*
@@ -405,35 +365,22 @@ static struct btrfs_delayed_ref_head *find_ref_head(
 		struct btrfs_delayed_ref_root *dr, u64 bytenr,
 		bool return_bigger)
 {
-	struct rb_root *root = &dr->href_root.rb_root;
-	struct rb_node *n;
+	const unsigned long target_index = (bytenr >> fs_info->sectorsize_bits);
+	unsigned long found_index = target_index;
 	struct btrfs_delayed_ref_head *entry;
 
 	lockdep_assert_held(&dr->lock);
 
-	n = root->rb_node;
-	entry = NULL;
-	while (n) {
-		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+	entry = xa_find(&dr->head_refs, &found_index, ULONG_MAX, XA_PRESENT);
+	if (!entry)
+		return NULL;
+
+	ASSERT(found_index >= target_index);
 
-		if (bytenr < entry->bytenr)
-			n = n->rb_left;
-		else if (bytenr > entry->bytenr)
-			n = n->rb_right;
-		else
-			return entry;
-	}
-	if (entry && return_bigger) {
-		if (bytenr > entry->bytenr) {
-			n = rb_next(&entry->href_node);
-			if (!n)
-				return NULL;
-			entry = rb_entry(n, struct btrfs_delayed_ref_head,
-					 href_node);
-		}
-		return entry;
-	}
-	return NULL;
+	if (found_index != target_index && !return_bigger)
+		return NULL;
+
+	return entry;
 }
 
 static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
@@ -448,7 +395,7 @@ static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
 
 	mutex_lock(&head->mutex);
 	spin_lock(&delayed_refs->lock);
-	if (RB_EMPTY_NODE(&head->href_node)) {
+	if (!head->tracked) {
 		mutex_unlock(&head->mutex);
 		btrfs_put_delayed_ref_head(head);
 		return false;
@@ -567,35 +514,27 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
 		struct btrfs_delayed_ref_root *delayed_refs)
 {
 	struct btrfs_delayed_ref_head *head;
+	unsigned long start_index;
+	unsigned long found_index;
+	bool found_head = false;
 	bool locked;
 
 	spin_lock(&delayed_refs->lock);
 again:
-	head = find_ref_head(fs_info, delayed_refs,
-			     delayed_refs->run_delayed_start, true);
-	if (!head && delayed_refs->run_delayed_start != 0) {
-		delayed_refs->run_delayed_start = 0;
-		head = find_first_ref_head(delayed_refs);
-	}
-	if (!head) {
-		spin_unlock(&delayed_refs->lock);
-		return NULL;
+	start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits);
+	xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) {
+		if (!head->processing) {
+			found_head = true;
+			break;
+		}
 	}
-
-	while (head->processing) {
-		struct rb_node *node;
-
-		node = rb_next(&head->href_node);
-		if (!node) {
-			if (delayed_refs->run_delayed_start == 0) {
-				spin_unlock(&delayed_refs->lock);
-				return NULL;
-			}
-			delayed_refs->run_delayed_start = 0;
-			goto again;
+	if (!found_head) {
+		if (delayed_refs->run_delayed_start == 0) {
+			spin_unlock(&delayed_refs->lock);
+			return NULL;
 		}
-		head = rb_entry(node, struct btrfs_delayed_ref_head,
-				href_node);
+		delayed_refs->run_delayed_start = 0;
+		goto again;
 	}
 
 	head->processing = true;
@@ -632,11 +571,13 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
 		struct btrfs_delayed_ref_root *delayed_refs,
 		struct btrfs_delayed_ref_head *head)
 {
+	const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits);
+
 	lockdep_assert_held(&delayed_refs->lock);
 	lockdep_assert_held(&head->lock);
 
-	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
-	RB_CLEAR_NODE(&head->href_node);
+	xa_erase(&delayed_refs->head_refs, index);
+	head->tracked = false;
 	delayed_refs->num_heads--;
 	if (!head->processing)
 		delayed_refs->num_heads_ready--;
@@ -845,7 +786,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
 	head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
 	head_ref->ref_tree = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&head_ref->ref_add_list);
-	RB_CLEAR_NODE(&head_ref->href_node);
+	head_ref->tracked = false;
 	head_ref->processing = false;
 	head_ref->total_ref_mod = count_mod;
 	spin_lock_init(&head_ref->lock);
@@ -883,11 +824,24 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits);
 	bool qrecord_inserted = false;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	lockdep_assert_held(&delayed_refs->lock);
 
+#if BITS_PER_LONG == 32
+	if (head_ref->bytenr >= MAX_LFS_FILESIZE) {
+		if (qrecord)
+			xa_release(&delayed_refs->dirty_extents, index);
+		btrfs_err_rl(fs_info,
+"delayed ref head %llu is beyond 32bit page cache and xarray index limit",
+			     head_ref->bytenr);
+		btrfs_err_32bit_limit(fs_info);
+		return ERR_PTR(-EOVERFLOW);
+	}
+#endif
+
 	/* Record qgroup extent info if provided */
 	if (qrecord) {
 		int ret;
@@ -896,8 +850,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 				  head_ref->bytenr);
 		if (ret) {
 			/* Clean up if insertion fails or item exists. */
-			xa_release(&delayed_refs->dirty_extents,
-				   head_ref->bytenr >> fs_info->sectorsize_bits);
+			xa_release(&delayed_refs->dirty_extents, index);
 			/* Caller responsible for freeing qrecord on error. */
 			if (ret < 0)
 				return ERR_PTR(ret);
@@ -909,8 +862,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 
 	trace_add_delayed_ref_head(fs_info, head_ref, action);
 
-	existing = htree_insert(&delayed_refs->href_root,
-				&head_ref->href_node);
+	existing = xa_load(&delayed_refs->head_refs, index);
 	if (existing) {
 		update_existing_head_ref(trans, existing, head_ref);
 		/*
@@ -920,6 +872,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
+		existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC);
+		if (xa_is_err(existing)) {
+			/* Memory was preallocated by the caller. */
+			ASSERT(xa_err(existing) != -ENOMEM);
+			return ERR_PTR(xa_err(existing));
+		} else if (WARN_ON(existing)) {
+			/*
+			 * Shouldn't happen we just did a lookup before under
+			 * delayed_refs->lock.
+			 */
+			return ERR_PTR(-EEXIST);
+		}
+		head_ref->tracked = true;
 		/*
 		 * We reserve the amount of bytes needed to delete csums when
 		 * adding the ref head and not when adding individual drop refs
@@ -1040,6 +1005,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *new_head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
+	const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits);
+	bool qrecord_reserved = false;
 	bool qrecord_inserted;
 	int action = generic_ref->action;
 	bool merged;
@@ -1055,25 +1022,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
 		goto free_node;
 	}
 
+	delayed_refs = &trans->transaction->delayed_refs;
+
 	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			ret = -ENOMEM;
 			goto free_head_ref;
 		}
-		if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
-			       generic_ref->bytenr >> fs_info->sectorsize_bits,
-			       GFP_NOFS)) {
+		if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
 			ret = -ENOMEM;
 			goto free_record;
 		}
+		qrecord_reserved = true;
+	}
+
+	ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+	if (ret) {
+		if (qrecord_reserved)
+			xa_release(&delayed_refs->dirty_extents, index);
+		goto free_record;
 	}
 
 	init_delayed_ref_common(fs_info, node, generic_ref);
 	init_delayed_ref_head(head_ref, generic_ref, record, reserved);
 	head_ref->extent_op = extent_op;
 
-	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
 	/*
@@ -1083,6 +1057,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
 	new_head_ref = add_delayed_ref_head(trans, head_ref, record,
 					    action, &qrecord_inserted);
 	if (IS_ERR(new_head_ref)) {
+		xa_release(&delayed_refs->head_refs, index);
 		spin_unlock(&delayed_refs->lock);
 		ret = PTR_ERR(new_head_ref);
 		goto free_record;
@@ -1145,6 +1120,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes, u8 level,
 				struct btrfs_delayed_extent_op *extent_op)
 {
+	const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits);
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_head *head_ref_ret;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -1155,6 +1131,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 		.num_bytes = num_bytes,
 		.tree_ref.level = level,
 	};
+	int ret;
 
 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref)
@@ -1164,16 +1141,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	spin_lock(&delayed_refs->lock);
 
+	ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+	if (ret) {
+		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+		return ret;
+	}
+
+	spin_lock(&delayed_refs->lock);
 	head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL,
 					    BTRFS_UPDATE_DELAYED_HEAD, NULL);
-	spin_unlock(&delayed_refs->lock);
-
 	if (IS_ERR(head_ref_ret)) {
+		xa_release(&delayed_refs->head_refs, index);
+		spin_unlock(&delayed_refs->lock);
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		return PTR_ERR(head_ref_ret);
 	}
+	spin_unlock(&delayed_refs->lock);
 
 	/*
 	 * Need to update the delayed_refs_rsv with any changes we may have
