Skip to content

Commit 6b5fc43

Browse files
fdmanana authored and kdave committed
Btrfs: fix fsync after succession of renames of different files
After a succession of rename operations on different files, followed by an fsync of one of them, such that each file gets a new name that corresponds to an old name of another file, we can end up with a log that causes a failure (an EEXIST error) when it is replayed at mount time.

We currently behave correctly when such a succession of renames involves only two files, but if more files are involved we end up not logging all the inodes that are needed, which results in a failure when attempting to replay the log.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkdir /mnt/testdir
  $ touch /mnt/testdir/fname1
  $ touch /mnt/testdir/fname2
  $ sync
  $ mv /mnt/testdir/fname1 /mnt/testdir/fname3
  $ mv /mnt/testdir/fname2 /mnt/testdir/fname4
  $ ln /mnt/testdir/fname3 /mnt/testdir/fname2
  $ touch /mnt/testdir/fname1
  $ xfs_io -c "fsync" /mnt/testdir/fname1
  <power failure>
  $ mount /dev/sdb /mnt
  mount: mount /dev/sdb on /mnt failed: File exists

So fix this by checking all inode dependencies when logging an inode. That is, if a logged inode A has a new name that matches the old name of some other inode B, check whether inode B has a new name that matches the old name of some other inode C, and so on. The fix does not use recursive function calls; instead it iterates over a linked list that is used in a first-in-first-out fashion.

A test case for fstests follows soon.

Signed-off-by: Filipe Manana <[email protected]>
Signed-off-by: David Sterba <[email protected]>
1 parent 38e3eeb commit 6b5fc43

File tree

1 file changed

+197
-44
lines changed

1 file changed

+197
-44
lines changed

fs/btrfs/tree-log.c

Lines changed: 197 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,67 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
13301330
return ret;
13311331
}
13321332

1333+
/*
 * Add the link (@dir, @name) -> @inode, first removing any existing dir entry
 * with the same name in @dir that points to a different inode.
 *
 * @trans:     transaction handle passed to the unlink/add-link helpers
 * @root:      root in which the directory entry is looked up
 * @dir:       directory that receives the new name
 * @inode:     inode the new name refers to
 * @name:      name to insert (@namelen bytes)
 * @namelen:   length of @name
 * @ref_index: index passed through to btrfs_add_link()
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * inode owning the colliding entry could not be read, or the error returned
 * by one of the helpers called below.
 */
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct inode *conflicting = NULL;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	struct btrfs_key location;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto cleanup;
	}
	if (!di) {
		/* The name is free in @dir, nothing to remove first. */
		btrfs_release_path(path);
		goto insert;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
	btrfs_release_path(path);

	conflicting = read_one_inode(root, location.objectid);
	if (!conflicting) {
		ret = -ENOENT;
		goto cleanup;
	}

	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
				 BTRFS_I(conflicting), name, namelen);
	if (ret)
		goto cleanup;

	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (conflicting->i_nlink == 0)
		inc_nlink(conflicting);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto cleanup;

insert:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
cleanup:
	/* @conflicting is still NULL on the paths that never read it. */
	iput(conflicting);
	btrfs_free_path(path);

	return ret;
}
1393+
13331394
/*
13341395
* replay one inode back reference item found in the log tree.
13351396
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1527,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
14661527
goto out;
14671528

14681529
/* insert our name */
1469-
ret = btrfs_add_link(trans, BTRFS_I(dir),
1470-
BTRFS_I(inode),
1471-
name, namelen, 0, ref_index);
1530+
ret = add_link(trans, root, dir, inode, name, namelen,
1531+
ref_index);
14721532
if (ret)
14731533
goto out;
14741534

@@ -4780,8 +4840,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
47804840
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
47814841
di, &di_key);
47824842
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4783-
ret = 1;
4784-
*other_ino = di_key.objectid;
4843+
if (di_key.objectid != key->objectid) {
4844+
ret = 1;
4845+
*other_ino = di_key.objectid;
4846+
} else {
4847+
ret = 0;
4848+
}
47854849
} else {
47864850
ret = -EAGAIN;
47874851
}
@@ -4801,6 +4865,126 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
48014865
return ret;
48024866
}
48034867

4868+
/*
 * Element of a list of inode numbers. log_conflicting_inodes() queues one of
 * these for every inode it still has to log.
 */
struct btrfs_ino_list {
4869+
u64 ino; /* inode number recorded by this element */
4870+
struct list_head list; /* linkage into the list of inode numbers */
4871+
};
4872+
4873+
static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4874+
struct btrfs_root *root,
4875+
struct btrfs_path *path,
4876+
struct btrfs_log_ctx *ctx,
4877+
u64 ino)
4878+
{
4879+
struct btrfs_ino_list *ino_elem;
4880+
LIST_HEAD(inode_list);
4881+
int ret = 0;
4882+
4883+
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4884+
if (!ino_elem)
4885+
return -ENOMEM;
4886+
ino_elem->ino = ino;
4887+
list_add_tail(&ino_elem->list, &inode_list);
4888+
4889+
while (!list_empty(&inode_list)) {
4890+
struct btrfs_fs_info *fs_info = root->fs_info;
4891+
struct btrfs_key key;
4892+
struct inode *inode;
4893+
4894+
ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4895+
list);
4896+
ino = ino_elem->ino;
4897+
list_del(&ino_elem->list);
4898+
kfree(ino_elem);
4899+
if (ret)
4900+
continue;
4901+
4902+
btrfs_release_path(path);
4903+
4904+
key.objectid = ino;
4905+
key.type = BTRFS_INODE_ITEM_KEY;
4906+
key.offset = 0;
4907+
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4908+
/*
4909+
* If the other inode that had a conflicting dir entry was
4910+
* deleted in the current transaction, we don't need to do more
4911+
* work nor fallback to a transaction commit.
4912+
*/
4913+
if (IS_ERR(inode)) {
4914+
ret = PTR_ERR(inode);
4915+
if (ret == -ENOENT)
4916+
ret = 0;
4917+
continue;
4918+
}
4919+
/*
4920+
* We are safe logging the other inode without acquiring its
4921+
* lock as long as we log with the LOG_INODE_EXISTS mode. We
4922+
* are safe against concurrent renames of the other inode as
4923+
* well because during a rename we pin the log and update the
4924+
* log with the new name before we unpin it.
4925+
*/
4926+
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
4927+
LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
4928+
if (ret) {
4929+
iput(inode);
4930+
continue;
4931+
}
4932+
4933+
key.objectid = ino;
4934+
key.type = BTRFS_INODE_REF_KEY;
4935+
key.offset = 0;
4936+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4937+
if (ret < 0) {
4938+
iput(inode);
4939+
continue;
4940+
}
4941+
4942+
while (true) {
4943+
struct extent_buffer *leaf = path->nodes[0];
4944+
int slot = path->slots[0];
4945+
u64 other_ino = 0;
4946+
4947+
if (slot >= btrfs_header_nritems(leaf)) {
4948+
ret = btrfs_next_leaf(root, path);
4949+
if (ret < 0) {
4950+
break;
4951+
} else if (ret > 0) {
4952+
ret = 0;
4953+
break;
4954+
}
4955+
continue;
4956+
}
4957+
4958+
btrfs_item_key_to_cpu(leaf, &key, slot);
4959+
if (key.objectid != ino ||
4960+
(key.type != BTRFS_INODE_REF_KEY &&
4961+
key.type != BTRFS_INODE_EXTREF_KEY)) {
4962+
ret = 0;
4963+
break;
4964+
}
4965+
4966+
ret = btrfs_check_ref_name_override(leaf, slot, &key,
4967+
BTRFS_I(inode), &other_ino);
4968+
if (ret < 0)
4969+
break;
4970+
if (ret > 0) {
4971+
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4972+
if (!ino_elem) {
4973+
ret = -ENOMEM;
4974+
break;
4975+
}
4976+
ino_elem->ino = other_ino;
4977+
list_add_tail(&ino_elem->list, &inode_list);
4978+
ret = 0;
4979+
}
4980+
path->slots[0]++;
4981+
}
4982+
iput(inode);
4983+
}
4984+
4985+
return ret;
4986+
}
4987+
48044988
/* log a single inode in the tree log.
48054989
* At least one parent directory for this inode must exist in the tree
48064990
* or be logged already.
@@ -4840,6 +5024,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
48405024
u64 logged_isize = 0;
48415025
bool need_log_inode_item = true;
48425026
bool xattrs_logged = false;
5027+
bool recursive_logging = (inode_only == LOG_OTHER_INODE);
48435028

48445029
path = btrfs_alloc_path();
48455030
if (!path)
@@ -4981,7 +5166,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
49815166

49825167
if ((min_key.type == BTRFS_INODE_REF_KEY ||
49835168
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4984-
inode->generation == trans->transid) {
5169+
inode->generation == trans->transid &&
5170+
!recursive_logging) {
49855171
u64 other_ino = 0;
49865172

49875173
ret = btrfs_check_ref_name_override(path->nodes[0],
@@ -4992,9 +5178,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
49925178
goto out_unlock;
49935179
} else if (ret > 0 && ctx &&
49945180
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
4995-
struct btrfs_key inode_key;
4996-
struct inode *other_inode;
4997-
49985181
if (ins_nr > 0) {
49995182
ins_nr++;
50005183
} else {
@@ -5010,43 +5193,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
50105193
goto out_unlock;
50115194
}
50125195
ins_nr = 0;
5013-
btrfs_release_path(path);
5014-
inode_key.objectid = other_ino;
5015-
inode_key.type = BTRFS_INODE_ITEM_KEY;
5016-
inode_key.offset = 0;
5017-
other_inode = btrfs_iget(fs_info->sb,
5018-
&inode_key, root,
5019-
NULL);
5020-
/*
5021-
* If the other inode that had a conflicting dir
5022-
* entry was deleted in the current transaction,
5023-
* we don't need to do more work nor fallback to
5024-
* a transaction commit.
5025-
*/
5026-
if (other_inode == ERR_PTR(-ENOENT)) {
5027-
goto next_key;
5028-
} else if (IS_ERR(other_inode)) {
5029-
err = PTR_ERR(other_inode);
5030-
goto out_unlock;
5031-
}
5032-
/*
5033-
* We are safe logging the other inode without
5034-
* acquiring its i_mutex as long as we log with
5035-
* the LOG_INODE_EXISTS mode. We're safe against
5036-
* concurrent renames of the other inode as well
5037-
* because during a rename we pin the log and
5038-
* update the log with the new name before we
5039-
* unpin it.
5040-
*/
5041-
err = btrfs_log_inode(trans, root,
5042-
BTRFS_I(other_inode),
5043-
LOG_OTHER_INODE, 0, LLONG_MAX,
5044-
ctx);
5045-
iput(other_inode);
5196+
5197+
err = log_conflicting_inodes(trans, root, path,
5198+
ctx, other_ino);
50465199
if (err)
50475200
goto out_unlock;
5048-
else
5049-
goto next_key;
5201+
btrfs_release_path(path);
5202+
goto next_key;
50505203
}
50515204
}
50525205

0 commit comments

Comments
 (0)