Skip to content

Commit 36283bf

Browse files
fdmananamasoncl
authored andcommitted
Btrfs: fix fsync xattr loss in the fast fsync path
After commit 4f764e5 ("Btrfs: remove deleted xattrs on fsync log replay"), we can end up in a situation where during log replay we end up deleting xattrs that were never deleted when their file was last fsynced. This happens in the fast fsync path (flag BTRFS_INODE_NEEDS_FULL_SYNC is not set in the inode) if the inode has the flag BTRFS_INODE_COPY_EVERYTHING set, the xattr was added in a past transaction and the leaf where the xattr is located was not updated (COWed or created) in the current transaction. In this scenario the xattr item never ends up in the log tree and therefore at log replay time, which makes the replay code delete the xattr from the fs/subvol tree as it thinks that xattr was deleted prior to the last fsync. Fix this by always logging all xattrs, which is the simplest and most reliable way to detect deleted xattrs and replay the deletes at log replay time. This issue is reproducible with the following test case for fstests: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" here=`pwd` tmp=/tmp/$$ status=1 # failure is the default! _cleanup() { _cleanup_flakey rm -f $tmp.* } trap "_cleanup; exit \$status" 0 1 2 3 15 # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey . ./common/attr # real QA test starts here # We create a lot of xattrs for a single file. Only btrfs and xfs are currently # able to store such a large mount of xattrs per file, other filesystems such # as ext3/4 and f2fs for example, fail with ENOSPC even if we attempt to add # less than 1000 xattrs with very small values. _supported_fs btrfs xfs _supported_os Linux _need_to_be_root _require_scratch _require_dm_flakey _require_attrs _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create the test file with some initial data and make sure everything is # durably persisted. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 32k" $SCRATCH_MNT/foo | _filter_xfs_io sync # Add many small xattrs to our file. # We create such a large amount because it's needed to trigger the issue found # in btrfs - we need to have an amount that causes the fs to have at least 3 # btree leafs with xattrs stored in them, and it must work on any leaf size # (maximum leaf/node size is 64Kb). num_xattrs=2000 for ((i = 1; i <= $num_xattrs; i++)); do name="user.attr_$(printf "%04d" $i)" $SETFATTR_PROG -n $name -v "val_$(printf "%04d" $i)" $SCRATCH_MNT/foo done # Sync the filesystem to force a commit of the current btrfs transaction, this # is a necessary condition to trigger the bug on btrfs. sync # Now update our file's data and fsync the file. # After a successful fsync, if the fsync log/journal is replayed we expect to # see all the xattrs we added before with the same values (and the updated file # data of course). Btrfs used to delete some of these xattrs when it replayed # its fsync log/journal. $XFS_IO_PROG -c "pwrite -S 0xbb 8K 16K" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again and mount. This makes the fs replay its fsync log. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey echo "File content after crash and log replay:" od -t x1 $SCRATCH_MNT/foo echo "File xattrs after crash and log replay:" for ((i = 1; i <= $num_xattrs; i++)); do name="user.attr_$(printf "%04d" $i)" echo -n "$name=" $GETFATTR_PROG --absolute-names -n $name --only-values $SCRATCH_MNT/foo echo done status=0 exit The golden output expects all xattrs to be available, and with the correct values, after the fsync log is replayed. Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: Chris Mason <[email protected]>
1 parent e4545de commit 36283bf

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

fs/btrfs/tree-log.c

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4117,6 +4117,86 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
41174117
return 0;
41184118
}
41194119

4120+
/*
4121+
* At the moment we always log all xattrs. This is to figure out at log replay
4122+
* time which xattrs must have their deletion replayed. If a xattr is missing
4123+
* in the log tree and exists in the fs/subvol tree, we delete it. This is
4124+
* because if a xattr is deleted, the inode is fsynced and a power failure
4125+
* happens, causing the log to be replayed the next time the fs is mounted,
4126+
* we want the xattr to not exist anymore (same behaviour as other filesystems
4127+
* with a journal, ext3/4, xfs, f2fs, etc).
4128+
*/
4129+
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4130+
struct btrfs_root *root,
4131+
struct inode *inode,
4132+
struct btrfs_path *path,
4133+
struct btrfs_path *dst_path)
4134+
{
4135+
int ret;
4136+
struct btrfs_key key;
4137+
const u64 ino = btrfs_ino(inode);
4138+
int ins_nr = 0;
4139+
int start_slot = 0;
4140+
4141+
key.objectid = ino;
4142+
key.type = BTRFS_XATTR_ITEM_KEY;
4143+
key.offset = 0;
4144+
4145+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4146+
if (ret < 0)
4147+
return ret;
4148+
4149+
while (true) {
4150+
int slot = path->slots[0];
4151+
struct extent_buffer *leaf = path->nodes[0];
4152+
int nritems = btrfs_header_nritems(leaf);
4153+
4154+
if (slot >= nritems) {
4155+
if (ins_nr > 0) {
4156+
u64 last_extent = 0;
4157+
4158+
ret = copy_items(trans, inode, dst_path, path,
4159+
&last_extent, start_slot,
4160+
ins_nr, 1, 0);
4161+
/* can't be 1, extent items aren't processed */
4162+
ASSERT(ret <= 0);
4163+
if (ret < 0)
4164+
return ret;
4165+
ins_nr = 0;
4166+
}
4167+
ret = btrfs_next_leaf(root, path);
4168+
if (ret < 0)
4169+
return ret;
4170+
else if (ret > 0)
4171+
break;
4172+
continue;
4173+
}
4174+
4175+
btrfs_item_key_to_cpu(leaf, &key, slot);
4176+
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4177+
break;
4178+
4179+
if (ins_nr == 0)
4180+
start_slot = slot;
4181+
ins_nr++;
4182+
path->slots[0]++;
4183+
cond_resched();
4184+
}
4185+
if (ins_nr > 0) {
4186+
u64 last_extent = 0;
4187+
4188+
ret = copy_items(trans, inode, dst_path, path,
4189+
&last_extent, start_slot,
4190+
ins_nr, 1, 0);
4191+
/* can't be 1, extent items aren't processed */
4192+
ASSERT(ret <= 0);
4193+
if (ret < 0)
4194+
return ret;
4195+
}
4196+
4197+
return 0;
4198+
}
4199+
41204200
/* log a single inode in the tree log.
41214201
* At least one parent directory for this inode must exist in the tree
41224202
* or be logged already.
@@ -4289,6 +4369,25 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
42894369
if (min_key.type == BTRFS_INODE_ITEM_KEY)
42904370
need_log_inode_item = false;
42914371

4372+
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
4373+
if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
4374+
if (ins_nr == 0)
4375+
goto next_slot;
4376+
ret = copy_items(trans, inode, dst_path, path,
4377+
&last_extent, ins_start_slot,
4378+
ins_nr, inode_only, logged_isize);
4379+
if (ret < 0) {
4380+
err = ret;
4381+
goto out_unlock;
4382+
}
4383+
ins_nr = 0;
4384+
if (ret) {
4385+
btrfs_release_path(path);
4386+
continue;
4387+
}
4388+
goto next_slot;
4389+
}
4390+
42924391
src = path->nodes[0];
42934392
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
42944393
ins_nr++;
@@ -4356,6 +4455,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
43564455
ins_nr = 0;
43574456
}
43584457

4458+
btrfs_release_path(path);
4459+
btrfs_release_path(dst_path);
4460+
err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
4461+
if (err)
4462+
goto out_unlock;
43594463
log_extents:
43604464
btrfs_release_path(path);
43614465
btrfs_release_path(dst_path);

0 commit comments

Comments
 (0)