Commit b0643e5

dennisszhou authored and kdave committed
btrfs: add the beginning of async discard, discard workqueue
When discard is enabled, every time a pinned extent is released back to the
block_group's free space cache, a discard is issued for the extent. This is an
overeager approach when it comes to discarding and helping the SSD maintain
enough free space to prevent severe garbage collection situations.

This adds the beginning of async discard. Instead of issuing a discard before
an extent is returned to the free space cache, the extent is just marked as
untrimmed. The block_group is then added to an LRU which feeds into a
workqueue that issues discards at a much slower rate. Full discarding of
unused block groups is still done and will be addressed in a future patch of
the series.

For now, we don't persist the discard state of extents and bitmaps.
Therefore, our failure recovery mode will be to consider extents untrimmed.
This lets us handle failure and unmounting as one and the same.

On a number of Facebook webservers, I collected data every minute accounting
for the time we spent in btrfs_finish_extent_commit() (col. 1) and in
btrfs_commit_transaction() (col. 2). btrfs_finish_extent_commit() is where we
discard extents synchronously before returning them to the free space cache.

discard=sync:
                 p99 total per minute       p99 total per minute
      Drive   |   extent_commit() (ms)  |    commit_trans() (ms)
    ---------------------------------------------------------------
     Drive A  |           434           |          1170
     Drive B  |           880           |          2330
     Drive C  |          2943           |          3920
     Drive D  |          4763           |          5701

discard=async:
                 p99 total per minute       p99 total per minute
      Drive   |   extent_commit() (ms)  |    commit_trans() (ms)
    --------------------------------------------------------------
     Drive A  |           134           |           956
     Drive B  |            64           |          1972
     Drive C  |            59           |          1032
     Drive D  |            62           |          1200

While it's not great that the stats are cumulative over 1m, all of these
servers are running the same workload and the delta between the two is
substantial. We are spending significantly less time in
btrfs_finish_extent_commit(), which is responsible for discarding.

Reviewed-by: Josef Bacik <[email protected]>
Signed-off-by: Dennis Zhou <[email protected]>
Reviewed-by: David Sterba <[email protected]>
Signed-off-by: David Sterba <[email protected]>
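The new fs/btrfs/discard.[ch] files added by this commit are not among the hunks shown below. As a rough illustration of the list-plus-delayed-work shape the message describes (the sketch_* names and the pacing constant are hypothetical, not the actual helpers), a block group could be appended to the control structure's discard list and the worker rearmed to drain it slowly:

/*
 * Minimal sketch of the async discard pattern described above.  The sketch_*
 * names are hypothetical; the real helpers live in fs/btrfs/discard.[ch],
 * which this commit adds but which is not shown on this page.  It assumes the
 * struct btrfs_discard_ctl and the block group discard_list field added by
 * the hunks below.
 */
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#define SKETCH_DISCARD_DELAY_MS	100	/* assumed pacing, not from this commit */

static void sketch_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	spin_lock(&discard_ctl->lock);
	/* LRU-like behavior: a newly dirtied block group goes to the tail. */
	if (list_empty(&block_group->discard_list))
		list_add_tail(&block_group->discard_list,
			      &discard_ctl->discard_list[0]);
	spin_unlock(&discard_ctl->lock);

	/* Kick the worker; each invocation issues one slow round of discards. */
	queue_delayed_work(discard_ctl->discard_workers, &discard_ctl->work,
			   msecs_to_jiffies(SKETCH_DISCARD_DELAY_MS));
}

The actual pacing, the multiple discard lists, and the requeueing policy are fleshed out later in the series; this only sketches the overall shape.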
1 parent da080fe commit b0643e5

File tree: 12 files changed (+468 additions, -13 deletions)

fs/btrfs/Makefile

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
-	   block-rsv.o delalloc-space.o block-group.o
+	   block-rsv.o delalloc-space.o block-group.o discard.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

fs/btrfs/block-group.c

Lines changed: 34 additions & 3 deletions

@@ -14,6 +14,7 @@
 #include "sysfs.h"
 #include "tree-log.h"
 #include "delalloc-space.h"
+#include "discard.h"
 
 /*
  * Return target flags in extended format or 0 if restripe for this chunk_type
@@ -131,6 +132,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
 	WARN_ON(cache->pinned > 0);
 	WARN_ON(cache->reserved > 0);
 
+	/*
+	 * A block_group shouldn't be on the discard_list anymore.
+	 * Remove the block_group from the discard_list to prevent us
+	 * from causing a panic due to NULL pointer dereference.
+	 */
+	if (WARN_ON(!list_empty(&cache->discard_list)))
+		btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+					  cache);
+
 	/*
 	 * If not empty, someone is still holding mutex of
 	 * full_stripe_lock, which can only be released by caller.
@@ -466,8 +476,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
 			total_added += size;
-			ret = btrfs_add_free_space(block_group, start,
-						   size);
+			ret = btrfs_add_free_space_async_trimmed(block_group,
+								 start, size);
 			BUG_ON(ret); /* -ENOMEM or logic error */
 			start = extent_end + 1;
 		} else {
@@ -478,7 +488,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
 	if (start < end) {
 		size = end - start;
 		total_added += size;
-		ret = btrfs_add_free_space(block_group, start, size);
+		ret = btrfs_add_free_space_async_trimmed(block_group, start,
+							 size);
 		BUG_ON(ret); /* -ENOMEM or logic error */
 	}
 
@@ -1258,6 +1269,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		}
 		spin_unlock(&fs_info->unused_bgs_lock);
 
+		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
 
 		/* Don't want to race with allocators so take the groups_sem */
@@ -1333,6 +1346,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		}
 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
+		/*
+		 * At this point, the block_group is read only and should fail
+		 * new allocations.  However, btrfs_finish_extent_commit() can
+		 * cause this block_group to be placed back on the discard
+		 * lists because now the block_group isn't fully discarded.
+		 * Bail here and try again later after discarding everything.
+		 */
+		spin_lock(&fs_info->discard_ctl.lock);
+		if (!list_empty(&block_group->discard_list)) {
+			spin_unlock(&fs_info->discard_ctl.lock);
+			btrfs_dec_block_group_ro(block_group);
+			btrfs_discard_queue_work(&fs_info->discard_ctl,
+						 block_group);
+			goto end_trans;
+		}
+		spin_unlock(&fs_info->discard_ctl.lock);
+
 		/* Reset pinned so btrfs_put_block_group doesn't complain */
 		spin_lock(&space_info->lock);
 		spin_lock(&block_group->lock);
@@ -1603,6 +1633,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
 	INIT_LIST_HEAD(&cache->cluster_list);
 	INIT_LIST_HEAD(&cache->bg_list);
 	INIT_LIST_HEAD(&cache->ro_list);
+	INIT_LIST_HEAD(&cache->discard_list);
 	INIT_LIST_HEAD(&cache->dirty_list);
 	INIT_LIST_HEAD(&cache->io_list);
 	btrfs_init_free_space_ctl(cache);

fs/btrfs/block-group.h

Lines changed: 9 additions & 0 deletions

@@ -116,7 +116,11 @@ struct btrfs_block_group {
 	/* For read-only block groups */
 	struct list_head ro_list;
 
+	/* For discard operations */
 	atomic_t trimming;
+	struct list_head discard_list;
+	int discard_index;
+	u64 discard_eligible_time;
 
 	/* For dirty block groups */
 	struct list_head dirty_list;
@@ -158,6 +162,11 @@ struct btrfs_block_group {
 	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
 };
 
+static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+{
+	return (block_group->start + block_group->length);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 static inline int btrfs_should_fragment_free_space(
 		struct btrfs_block_group *block_group)
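The new btrfs_block_group_end() helper returns the group's exclusive end offset (start + length). A minimal, hypothetical caller (not part of this commit) might use it to clamp a range to the block group's boundaries:

/*
 * Illustrative use of btrfs_block_group_end() (hypothetical caller, not part
 * of this commit).  Assumes the btrfs block-group.h definitions above.
 */
#include <linux/kernel.h>

static u64 sketch_clamp_to_block_group(struct btrfs_block_group *block_group,
				       u64 start, u64 len)
{
	u64 end = btrfs_block_group_end(block_group); /* start + length */

	return min_t(u64, start + len, end);
}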

fs/btrfs/ctree.h

Lines changed: 21 additions & 0 deletions

@@ -440,6 +440,21 @@ struct btrfs_full_stripe_locks_tree {
 	struct mutex lock;
 };
 
+/* Discard control. */
+/*
+ * Async discard uses multiple lists to differentiate the discard filter
+ * parameters.
+ */
+#define BTRFS_NR_DISCARD_LISTS		1
+
+struct btrfs_discard_ctl {
+	struct workqueue_struct *discard_workers;
+	struct delayed_work work;
+	spinlock_t lock;
+	struct btrfs_block_group *block_group;
+	struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
+};
+
 /* delayed seq elem */
 struct seq_list {
 	struct list_head list;
@@ -526,6 +541,9 @@ enum {
 	 * so we don't need to offload checksums to workqueues.
 	 */
 	BTRFS_FS_CSUM_IMPL_FAST,
+
+	/* Indicate that the discard workqueue can service discards. */
+	BTRFS_FS_DISCARD_RUNNING,
 };
 
 struct btrfs_fs_info {
@@ -816,6 +834,8 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *scrub_wr_completion_workers;
 	struct btrfs_workqueue *scrub_parity_workers;
 
+	struct btrfs_discard_ctl discard_ctl;
+
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
 #endif
@@ -1189,6 +1209,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_MOUNT_FREE_SPACE_TREE	(1 << 26)
 #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
 #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
+#define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
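The ctree.h hunks wire in the control structure, a run-state bit, and a new mount option bit. As a rough sketch of how a worker might gate on them (the sketch_* name is hypothetical; it assumes the long-standing btrfs_test_opt() helper and the fs_info->flags bitfield used for the BTRFS_FS_* state bits):

/*
 * Illustrative gate for the discard worker (hypothetical function, not part
 * of this commit).  Uses the new BTRFS_MOUNT_DISCARD_ASYNC and
 * BTRFS_FS_DISCARD_RUNNING bits from the hunks above.
 */
static bool sketch_async_discard_allowed(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
		return false;

	/* Cleared around unmount so the worker stops issuing discards. */
	return test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}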
