Skip to content

Commit c9dc4c6

Browse files
committed
Btrfs: two stage dirty block group writeout
Block group cache writeout is currently waiting on the pages for each block group cache before moving on to writing the next one. This commit switches things around to send down all the caches and then wait on them in batches. The end result is much faster, since we're keeping the disk pipeline full. Signed-off-by: Chris Mason <[email protected]>
1 parent 4c6d1d8 commit c9dc4c6

File tree

4 files changed

+170
-32
lines changed

4 files changed

+170
-32
lines changed

fs/btrfs/ctree.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1261,9 +1261,12 @@ struct btrfs_io_ctl {
12611261
struct page *page;
12621262
struct page **pages;
12631263
struct btrfs_root *root;
1264+
struct inode *inode;
12641265
unsigned long size;
12651266
int index;
12661267
int num_pages;
1268+
int entries;
1269+
int bitmaps;
12671270
unsigned check_crcs:1;
12681271
};
12691272

@@ -1332,6 +1335,9 @@ struct btrfs_block_group_cache {
13321335

13331336
/* For dirty block groups */
13341337
struct list_head dirty_list;
1338+
struct list_head io_list;
1339+
1340+
struct btrfs_io_ctl io_ctl;
13351341
};
13361342

13371343
/* delayed seq elem */

fs/btrfs/extent-tree.c

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3388,7 +3388,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
33883388
struct btrfs_block_group_cache *cache;
33893389
struct btrfs_transaction *cur_trans = trans->transaction;
33903390
int ret = 0;
3391+
int should_put;
33913392
struct btrfs_path *path;
3393+
LIST_HEAD(io);
3394+
int num_started = 0;
3395+
int num_waited = 0;
33923396

33933397
if (list_empty(&cur_trans->dirty_bgs))
33943398
return 0;
@@ -3407,16 +3411,60 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
34073411
cache = list_first_entry(&cur_trans->dirty_bgs,
34083412
struct btrfs_block_group_cache,
34093413
dirty_list);
3414+
3415+
/*
3416+
* this can happen if cache_save_setup re-dirties a block
3417+
* group that is already under IO. Just wait for it to
3418+
* finish and then do it all again
3419+
*/
3420+
if (!list_empty(&cache->io_list)) {
3421+
list_del_init(&cache->io_list);
3422+
btrfs_wait_cache_io(root, trans, cache,
3423+
&cache->io_ctl, path,
3424+
cache->key.objectid);
3425+
btrfs_put_block_group(cache);
3426+
num_waited++;
3427+
}
3428+
34103429
list_del_init(&cache->dirty_list);
3430+
should_put = 1;
3431+
34113432
if (cache->disk_cache_state == BTRFS_DC_CLEAR)
34123433
cache_save_setup(cache, trans, path);
3434+
34133435
if (!ret)
3414-
ret = btrfs_run_delayed_refs(trans, root,
3415-
(unsigned long) -1);
3416-
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
3417-
btrfs_write_out_cache(root, trans, cache, path);
3436+
ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3437+
3438+
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3439+
cache->io_ctl.inode = NULL;
3440+
ret = btrfs_write_out_cache(root, trans, cache, path);
3441+
if (ret == 0 && cache->io_ctl.inode) {
3442+
num_started++;
3443+
should_put = 0;
3444+
list_add_tail(&cache->io_list, &io);
3445+
} else {
3446+
/*
3447+
* if we failed to write the cache, the
3448+
* generation will be bad and life goes on
3449+
*/
3450+
ret = 0;
3451+
}
3452+
}
34183453
if (!ret)
34193454
ret = write_one_cache_group(trans, root, path, cache);
3455+
3456+
/* if it's not on the io list, we need to put the block group */
3457+
if (should_put)
3458+
btrfs_put_block_group(cache);
3459+
}
3460+
3461+
while (!list_empty(&io)) {
3462+
cache = list_first_entry(&io, struct btrfs_block_group_cache,
3463+
io_list);
3464+
list_del_init(&cache->io_list);
3465+
num_waited++;
3466+
btrfs_wait_cache_io(root, trans, cache,
3467+
&cache->io_ctl, path, cache->key.objectid);
34203468
btrfs_put_block_group(cache);
34213469
}
34223470

@@ -9013,6 +9061,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
90139061
INIT_LIST_HEAD(&cache->bg_list);
90149062
INIT_LIST_HEAD(&cache->ro_list);
90159063
INIT_LIST_HEAD(&cache->dirty_list);
9064+
INIT_LIST_HEAD(&cache->io_list);
90169065
btrfs_init_free_space_ctl(cache);
90179066
atomic_set(&cache->trimming, 0);
90189067

fs/btrfs/free-space-cache.c

Lines changed: 104 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -170,13 +170,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
170170
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
171171
key.offset = offset;
172172
key.type = 0;
173-
174173
ret = btrfs_insert_empty_item(trans, root, path, &key,
175174
sizeof(struct btrfs_free_space_header));
176175
if (ret < 0) {
177176
btrfs_release_path(path);
178177
return ret;
179178
}
179+
180180
leaf = path->nodes[0];
181181
header = btrfs_item_ptr(leaf, path->slots[0],
182182
struct btrfs_free_space_header);
@@ -296,13 +296,15 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
296296
io_ctl->num_pages = num_pages;
297297
io_ctl->root = root;
298298
io_ctl->check_crcs = check_crcs;
299+
io_ctl->inode = inode;
299300

300301
return 0;
301302
}
302303

303304
static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
304305
{
305306
kfree(io_ctl->pages);
307+
io_ctl->pages = NULL;
306308
}
307309

308310
static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
@@ -1092,6 +1094,61 @@ cleanup_write_cache_enospc(struct inode *inode,
10921094
GFP_NOFS);
10931095
}
10941096

1097+
int btrfs_wait_cache_io(struct btrfs_root *root,
1098+
struct btrfs_trans_handle *trans,
1099+
struct btrfs_block_group_cache *block_group,
1100+
struct btrfs_io_ctl *io_ctl,
1101+
struct btrfs_path *path, u64 offset)
1102+
{
1103+
int ret;
1104+
struct inode *inode = io_ctl->inode;
1105+
1106+
root = root->fs_info->tree_root;
1107+
1108+
/* Flush the dirty pages in the cache file. */
1109+
ret = flush_dirty_cache(inode);
1110+
if (ret)
1111+
goto out;
1112+
1113+
/* Update the cache item to tell everyone this cache file is valid. */
1114+
ret = update_cache_item(trans, root, inode, path, offset,
1115+
io_ctl->entries, io_ctl->bitmaps);
1116+
out:
1117+
io_ctl_free(io_ctl);
1118+
if (ret) {
1119+
invalidate_inode_pages2(inode->i_mapping);
1120+
BTRFS_I(inode)->generation = 0;
1121+
if (block_group) {
1122+
#ifdef DEBUG
1123+
btrfs_err(root->fs_info,
1124+
"failed to write free space cache for block group %llu",
1125+
block_group->key.objectid);
1126+
#endif
1127+
}
1128+
}
1129+
btrfs_update_inode(trans, root, inode);
1130+
1131+
if (block_group) {
1132+
spin_lock(&block_group->lock);
1133+
1134+
/*
1135+
* only mark this as written if we didn't get put back on
1136+
* the dirty list while waiting for IO.
1137+
*/
1138+
if (!ret && list_empty(&block_group->dirty_list))
1139+
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1140+
else if (ret)
1141+
block_group->disk_cache_state = BTRFS_DC_ERROR;
1142+
1143+
spin_unlock(&block_group->lock);
1144+
io_ctl->inode = NULL;
1145+
iput(inode);
1146+
}
1147+
1148+
return ret;
1149+
1150+
}
1151+
10951152
/**
10961153
* __btrfs_write_out_cache - write out cached info to an inode
10971154
* @root - the root the inode belongs to
@@ -1108,20 +1165,22 @@ cleanup_write_cache_enospc(struct inode *inode,
11081165
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
11091166
struct btrfs_free_space_ctl *ctl,
11101167
struct btrfs_block_group_cache *block_group,
1168+
struct btrfs_io_ctl *io_ctl,
11111169
struct btrfs_trans_handle *trans,
11121170
struct btrfs_path *path, u64 offset)
11131171
{
11141172
struct extent_state *cached_state = NULL;
1115-
struct btrfs_io_ctl io_ctl;
11161173
LIST_HEAD(bitmap_list);
11171174
int entries = 0;
11181175
int bitmaps = 0;
11191176
int ret;
1177+
int must_iput = 0;
11201178

11211179
if (!i_size_read(inode))
11221180
return -1;
11231181

1124-
ret = io_ctl_init(&io_ctl, inode, root, 1);
1182+
WARN_ON(io_ctl->pages);
1183+
ret = io_ctl_init(io_ctl, inode, root, 1);
11251184
if (ret)
11261185
return -1;
11271186

@@ -1134,22 +1193,23 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
11341193
up_write(&block_group->data_rwsem);
11351194
BTRFS_I(inode)->generation = 0;
11361195
ret = 0;
1196+
must_iput = 1;
11371197
goto out;
11381198
}
11391199
spin_unlock(&block_group->lock);
11401200
}
11411201

11421202
/* Lock all pages first so we can lock the extent safely. */
1143-
io_ctl_prepare_pages(&io_ctl, inode, 0);
1203+
io_ctl_prepare_pages(io_ctl, inode, 0);
11441204

11451205
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
11461206
0, &cached_state);
11471207

1148-
io_ctl_set_generation(&io_ctl, trans->transid);
1208+
io_ctl_set_generation(io_ctl, trans->transid);
11491209

11501210
mutex_lock(&ctl->cache_writeout_mutex);
11511211
/* Write out the extent entries in the free space cache */
1152-
ret = write_cache_extent_entries(&io_ctl, ctl,
1212+
ret = write_cache_extent_entries(io_ctl, ctl,
11531213
block_group, &entries, &bitmaps,
11541214
&bitmap_list);
11551215
if (ret) {
@@ -1162,7 +1222,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
11621222
* they will be added into free space cache after the transaction is
11631223
* committed, we shouldn't lose them.
11641224
*/
1165-
ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1225+
ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
11661226
if (ret) {
11671227
mutex_unlock(&ctl->cache_writeout_mutex);
11681228
goto out_nospc;
@@ -1173,16 +1233,16 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
11731233
* locked while doing it because a concurrent trim can be manipulating
11741234
* or freeing the bitmap.
11751235
*/
1176-
ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1236+
ret = write_bitmap_entries(io_ctl, &bitmap_list);
11771237
mutex_unlock(&ctl->cache_writeout_mutex);
11781238
if (ret)
11791239
goto out_nospc;
11801240

11811241
/* Zero out the rest of the pages just to make sure */
1182-
io_ctl_zero_remaining_pages(&io_ctl);
1242+
io_ctl_zero_remaining_pages(io_ctl);
11831243

11841244
/* Everything is written out, now we dirty the pages in the file. */
1185-
ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
1245+
ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
11861246
0, i_size_read(inode), &cached_state);
11871247
if (ret)
11881248
goto out_nospc;
@@ -1193,30 +1253,39 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
11931253
* Release the pages and unlock the extent, we will flush
11941254
* them out later
11951255
*/
1196-
io_ctl_drop_pages(&io_ctl);
1256+
io_ctl_drop_pages(io_ctl);
11971257

11981258
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
11991259
i_size_read(inode) - 1, &cached_state, GFP_NOFS);
12001260

1201-
/* Flush the dirty pages in the cache file. */
1202-
ret = flush_dirty_cache(inode);
1261+
/*
1262+
* at this point the pages are under IO and we're happy.
1263+
* The caller is responsible for waiting on them and updating the
1264+
* cache and the inode
1265+
*/
1266+
io_ctl->entries = entries;
1267+
io_ctl->bitmaps = bitmaps;
1268+
1269+
ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
12031270
if (ret)
12041271
goto out;
12051272

1206-
/* Update the cache item to tell everyone this cache file is valid. */
1207-
ret = update_cache_item(trans, root, inode, path, offset,
1208-
entries, bitmaps);
1273+
return 0;
1274+
12091275
out:
1210-
io_ctl_free(&io_ctl);
1276+
io_ctl->inode = NULL;
1277+
io_ctl_free(io_ctl);
12111278
if (ret) {
12121279
invalidate_inode_pages2(inode->i_mapping);
12131280
BTRFS_I(inode)->generation = 0;
12141281
}
12151282
btrfs_update_inode(trans, root, inode);
1283+
if (must_iput)
1284+
iput(inode);
12161285
return ret;
12171286

12181287
out_nospc:
1219-
cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
1288+
cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
12201289

12211290
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
12221291
up_write(&block_group->data_rwsem);
@@ -1232,7 +1301,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
12321301
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
12331302
struct inode *inode;
12341303
int ret = 0;
1235-
enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
12361304

12371305
root = root->fs_info->tree_root;
12381306

@@ -1253,22 +1321,28 @@ int btrfs_write_out_cache(struct btrfs_root *root,
12531321
if (IS_ERR(inode))
12541322
return 0;
12551323

1256-
ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
1324+
ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
1325+
&block_group->io_ctl, trans,
12571326
path, block_group->key.objectid);
12581327
if (ret) {
1259-
dcs = BTRFS_DC_ERROR;
1260-
ret = 0;
12611328
#ifdef DEBUG
12621329
btrfs_err(root->fs_info,
12631330
"failed to write free space cache for block group %llu",
12641331
block_group->key.objectid);
12651332
#endif
1333+
spin_lock(&block_group->lock);
1334+
block_group->disk_cache_state = BTRFS_DC_ERROR;
1335+
spin_unlock(&block_group->lock);
1336+
1337+
block_group->io_ctl.inode = NULL;
1338+
iput(inode);
12661339
}
12671340

1268-
spin_lock(&block_group->lock);
1269-
block_group->disk_cache_state = dcs;
1270-
spin_unlock(&block_group->lock);
1271-
iput(inode);
1341+
/*
1342+
* if ret == 0 the caller is expected to call btrfs_wait_cache_io
1343+
* to wait for IO and put the inode
1344+
*/
1345+
12721346
return ret;
12731347
}
12741348

@@ -3331,11 +3405,14 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
33313405
{
33323406
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
33333407
int ret;
3408+
struct btrfs_io_ctl io_ctl;
33343409

33353410
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
33363411
return 0;
33373412

3338-
ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
3413+
ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
3414+
trans, path, 0) ||
3415+
btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
33393416
if (ret) {
33403417
btrfs_delalloc_release_metadata(inode, inode->i_size);
33413418
#ifdef DEBUG

0 commit comments

Comments
 (0)