Skip to content

Commit b8aa330

Browse files
fdmananakdave
authored andcommitted
Btrfs: improve performance on fsync of files with multiple hardlinks
Commit 41bd606 ("Btrfs: fix fsync of files with multiple hard links in new directories") introduced a path that makes fsync fallback to a full transaction commit in order to avoid losing hard links and new ancestors of the fsynced inode. That path is triggered only when the inode has more than one hard link and either has a new hard link created in the current transaction or the inode was evicted and reloaded in the current transaction. That path ends up getting triggered very often (hundreds of times) during the course of pgbench benchmarks, resulting in performance drops of about 20%. This change restores the performance by not triggering the full transaction commit in those cases, and instead iterate the fs/subvolume tree in search of all possible new ancestors, for all hard links, to log them. Reported-by: Zhao Yuhu <[email protected]> Tested-by: James Wang <[email protected]> Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 62d54f3 commit b8aa330

File tree

3 files changed

+188
-63
lines changed

3 files changed

+188
-63
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,6 @@ struct btrfs_inode {
147147
*/
148148
u64 last_unlink_trans;
149149

150-
/*
151-
* Track the transaction id of the last transaction used to create a
152-
* hard link for the inode. This is used by the log tree (fsync).
153-
*/
154-
u64 last_link_trans;
155-
156150
/*
157151
* Number of bytes outstanding that are going to need csums. This is
158152
* used in ENOSPC accounting.

fs/btrfs/inode.c

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3689,21 +3689,6 @@ static int btrfs_read_locked_inode(struct inode *inode,
36893689
* inode is not a directory, logging its parent unnecessarily.
36903690
*/
36913691
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3692-
/*
3693-
* Similar reasoning for last_link_trans, needs to be set otherwise
3694-
* for a case like the following:
3695-
*
3696-
* mkdir A
3697-
* touch foo
3698-
* ln foo A/bar
3699-
* echo 2 > /proc/sys/vm/drop_caches
3700-
* fsync foo
3701-
* <power failure>
3702-
*
3703-
* Would result in link bar and directory A not existing after the power
3704-
* failure.
3705-
*/
3706-
BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
37073692

37083693
path->slots[0]++;
37093694
if (inode->i_nlink != 1 ||
@@ -6655,7 +6640,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
66556640
if (err)
66566641
goto fail;
66576642
}
6658-
BTRFS_I(inode)->last_link_trans = trans->transid;
66596643
d_instantiate(dentry, inode);
66606644
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
66616645
true, NULL);
@@ -9192,7 +9176,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
91929176
ei->index_cnt = (u64)-1;
91939177
ei->dir_index = 0;
91949178
ei->last_unlink_trans = 0;
9195-
ei->last_link_trans = 0;
91969179
ei->last_log_commit = 0;
91979180

91989181
spin_lock_init(&ei->lock);

fs/btrfs/tree-log.c

Lines changed: 188 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5832,6 +5832,190 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
58325832
return ret;
58335833
}
58345834

5835+
static int log_new_ancestors(struct btrfs_trans_handle *trans,
5836+
struct btrfs_root *root,
5837+
struct btrfs_path *path,
5838+
struct btrfs_log_ctx *ctx)
5839+
{
5840+
struct btrfs_key found_key;
5841+
5842+
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5843+
5844+
while (true) {
5845+
struct btrfs_fs_info *fs_info = root->fs_info;
5846+
const u64 last_committed = fs_info->last_trans_committed;
5847+
struct extent_buffer *leaf = path->nodes[0];
5848+
int slot = path->slots[0];
5849+
struct btrfs_key search_key;
5850+
struct inode *inode;
5851+
int ret = 0;
5852+
5853+
btrfs_release_path(path);
5854+
5855+
search_key.objectid = found_key.offset;
5856+
search_key.type = BTRFS_INODE_ITEM_KEY;
5857+
search_key.offset = 0;
5858+
inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
5859+
if (IS_ERR(inode))
5860+
return PTR_ERR(inode);
5861+
5862+
if (BTRFS_I(inode)->generation > last_committed)
5863+
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5864+
LOG_INODE_EXISTS,
5865+
0, LLONG_MAX, ctx);
5866+
iput(inode);
5867+
if (ret)
5868+
return ret;
5869+
5870+
if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5871+
break;
5872+
5873+
search_key.type = BTRFS_INODE_REF_KEY;
5874+
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5875+
if (ret < 0)
5876+
return ret;
5877+
5878+
leaf = path->nodes[0];
5879+
slot = path->slots[0];
5880+
if (slot >= btrfs_header_nritems(leaf)) {
5881+
ret = btrfs_next_leaf(root, path);
5882+
if (ret < 0)
5883+
return ret;
5884+
else if (ret > 0)
5885+
return -ENOENT;
5886+
leaf = path->nodes[0];
5887+
slot = path->slots[0];
5888+
}
5889+
5890+
btrfs_item_key_to_cpu(leaf, &found_key, slot);
5891+
if (found_key.objectid != search_key.objectid ||
5892+
found_key.type != BTRFS_INODE_REF_KEY)
5893+
return -ENOENT;
5894+
}
5895+
return 0;
5896+
}
5897+
5898+
static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5899+
struct btrfs_inode *inode,
5900+
struct dentry *parent,
5901+
struct btrfs_log_ctx *ctx)
5902+
{
5903+
struct btrfs_root *root = inode->root;
5904+
struct btrfs_fs_info *fs_info = root->fs_info;
5905+
struct dentry *old_parent = NULL;
5906+
struct super_block *sb = inode->vfs_inode.i_sb;
5907+
int ret = 0;
5908+
5909+
while (true) {
5910+
if (!parent || d_really_is_negative(parent) ||
5911+
sb != parent->d_sb)
5912+
break;
5913+
5914+
inode = BTRFS_I(d_inode(parent));
5915+
if (root != inode->root)
5916+
break;
5917+
5918+
if (inode->generation > fs_info->last_trans_committed) {
5919+
ret = btrfs_log_inode(trans, root, inode,
5920+
LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5921+
if (ret)
5922+
break;
5923+
}
5924+
if (IS_ROOT(parent))
5925+
break;
5926+
5927+
parent = dget_parent(parent);
5928+
dput(old_parent);
5929+
old_parent = parent;
5930+
}
5931+
dput(old_parent);
5932+
5933+
return ret;
5934+
}
5935+
5936+
static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
5937+
struct btrfs_inode *inode,
5938+
struct dentry *parent,
5939+
struct btrfs_log_ctx *ctx)
5940+
{
5941+
struct btrfs_root *root = inode->root;
5942+
const u64 ino = btrfs_ino(inode);
5943+
struct btrfs_path *path;
5944+
struct btrfs_key search_key;
5945+
int ret;
5946+
5947+
/*
5948+
* For a single hard link case, go through a fast path that does not
5949+
* need to iterate the fs/subvolume tree.
5950+
*/
5951+
if (inode->vfs_inode.i_nlink < 2)
5952+
return log_new_ancestors_fast(trans, inode, parent, ctx);
5953+
5954+
path = btrfs_alloc_path();
5955+
if (!path)
5956+
return -ENOMEM;
5957+
5958+
search_key.objectid = ino;
5959+
search_key.type = BTRFS_INODE_REF_KEY;
5960+
search_key.offset = 0;
5961+
again:
5962+
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5963+
if (ret < 0)
5964+
goto out;
5965+
if (ret == 0)
5966+
path->slots[0]++;
5967+
5968+
while (true) {
5969+
struct extent_buffer *leaf = path->nodes[0];
5970+
int slot = path->slots[0];
5971+
struct btrfs_key found_key;
5972+
5973+
if (slot >= btrfs_header_nritems(leaf)) {
5974+
ret = btrfs_next_leaf(root, path);
5975+
if (ret < 0)
5976+
goto out;
5977+
else if (ret > 0)
5978+
break;
5979+
continue;
5980+
}
5981+
5982+
btrfs_item_key_to_cpu(leaf, &found_key, slot);
5983+
if (found_key.objectid != ino ||
5984+
found_key.type > BTRFS_INODE_EXTREF_KEY)
5985+
break;
5986+
5987+
/*
5988+
* Don't deal with extended references because they are rare
5989+
* cases and too complex to deal with (we would need to keep
5990+
* track of which subitem we are processing for each item in
5991+
* this loop, etc). So just return some error to fallback to
5992+
* a transaction commit.
5993+
*/
5994+
if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
5995+
ret = -EMLINK;
5996+
goto out;
5997+
}
5998+
5999+
/*
6000+
* Logging ancestors needs to do more searches on the fs/subvol
6001+
* tree, so it releases the path as needed to avoid deadlocks.
6002+
* Keep track of the last inode ref key and resume from that key
6003+
* after logging all new ancestors for the current hard link.
6004+
*/
6005+
memcpy(&search_key, &found_key, sizeof(search_key));
6006+
6007+
ret = log_new_ancestors(trans, root, path, ctx);
6008+
if (ret)
6009+
goto out;
6010+
btrfs_release_path(path);
6011+
goto again;
6012+
}
6013+
ret = 0;
6014+
out:
6015+
btrfs_free_path(path);
6016+
return ret;
6017+
}
6018+
58356019
/*
58366020
* helper function around btrfs_log_inode to make sure newly created
58376021
* parent directories also end up in the log. A minimal inode and backref
@@ -5849,11 +6033,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
58496033
struct btrfs_root *root = inode->root;
58506034
struct btrfs_fs_info *fs_info = root->fs_info;
58516035
struct super_block *sb;
5852-
struct dentry *old_parent = NULL;
58536036
int ret = 0;
58546037
u64 last_committed = fs_info->last_trans_committed;
58556038
bool log_dentries = false;
5856-
struct btrfs_inode *orig_inode = inode;
58576039

58586040
sb = inode->vfs_inode.i_sb;
58596041

@@ -5959,54 +6141,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
59596141
* and has a link count of 2.
59606142
*/
59616143
if (inode->last_unlink_trans > last_committed) {
5962-
ret = btrfs_log_all_parents(trans, orig_inode, ctx);
6144+
ret = btrfs_log_all_parents(trans, inode, ctx);
59636145
if (ret)
59646146
goto end_trans;
59656147
}
59666148

5967-
/*
5968-
* If a new hard link was added to the inode in the current transaction
5969-
* and its link count is now greater than 1, we need to fallback to a
5970-
* transaction commit, otherwise we can end up not logging all its new
5971-
* parents for all the hard links. Here just from the dentry used to
5972-
* fsync, we can not visit the ancestor inodes for all the other hard
5973-
* links to figure out if any is new, so we fallback to a transaction
5974-
* commit (instead of adding a lot of complexity of scanning a btree,
5975-
* since this scenario is not a common use case).
5976-
*/
5977-
if (inode->vfs_inode.i_nlink > 1 &&
5978-
inode->last_link_trans > last_committed) {
5979-
ret = -EMLINK;
6149+
ret = log_all_new_ancestors(trans, inode, parent, ctx);
6150+
if (ret)
59806151
goto end_trans;
5981-
}
5982-
5983-
while (1) {
5984-
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5985-
break;
5986-
5987-
inode = BTRFS_I(d_inode(parent));
5988-
if (root != inode->root)
5989-
break;
59906152

5991-
if (inode->generation > last_committed) {
5992-
ret = btrfs_log_inode(trans, root, inode,
5993-
LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5994-
if (ret)
5995-
goto end_trans;
5996-
}
5997-
if (IS_ROOT(parent))
5998-
break;
5999-
6000-
parent = dget_parent(parent);
6001-
dput(old_parent);
6002-
old_parent = parent;
6003-
}
60046153
if (log_dentries)
6005-
ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
6154+
ret = log_new_dir_dentries(trans, root, inode, ctx);
60066155
else
60076156
ret = 0;
60086157
end_trans:
6009-
dput(old_parent);
60106158
if (ret < 0) {
60116159
btrfs_set_log_full_commit(trans);
60126160
ret = 1;

0 commit comments

Comments
 (0)