Skip to content

Commit de0ee0e

Browse files
fdmananamasoncl
authored andcommitted
Btrfs: fix race between fsync and lockless direct IO writes
An fsync, using the fast path, can race with a concurrent lockless direct IO write and end up logging a file extent item that points to an extent that wasn't written to yet. This is because the fast fsync path collects ordered extents into a local list and then collects all the new extent maps to log file extent items based on them, while the direct IO write path creates the new extent map before it creates the corresponding ordered extent (and submitting the respective bio(s)). So fix this by making the direct IO write path create ordered extents before the extent maps and make the fast fsync path collect any new ordered extents after it collects the extent maps. Note that making the fsync handler call inode_dio_wait() (after acquiring the inode's i_mutex) would not work and lead to a deadlock when doing AIO, as through AIO we end up in a path where the fsync handler is called (through dio_aio_complete_work() -> dio_complete() -> vfs_fsync_range()) before the inode's dio counter is decremented (inode_dio_wait() waits for this counter to have a value of zero). Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: Chris Mason <[email protected]>
1 parent 6b5aa88 commit de0ee0e

File tree

2 files changed

+39
-11
lines changed

2 files changed

+39
-11
lines changed

fs/btrfs/inode.c

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7115,21 +7115,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
71157115
if (ret)
71167116
return ERR_PTR(ret);
71177117

7118-
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
7119-
ins.offset, ins.offset, ins.offset, 0);
7120-
if (IS_ERR(em)) {
7121-
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7122-
return em;
7123-
}
7124-
7118+
/*
7119+
* Create the ordered extent before the extent map. This is to avoid
7120+
* races with the fast fsync path that would lead to it logging file
7121+
* extent items that point to disk extents that were not yet written to.
7122+
* The fast fsync path collects ordered extents into a local list and
7123+
* then collects all the new extent maps, so we must create the ordered
7124+
* extent first and make sure the fast fsync path collects any new
7125+
* ordered extents after collecting new extent maps as well.
7126+
* The fsync path simply can not rely on inode_dio_wait() because it
7127+
* causes deadlock with AIO.
7128+
*/
71257129
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
71267130
ins.offset, ins.offset, 0);
71277131
if (ret) {
71287132
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7129-
free_extent_map(em);
71307133
return ERR_PTR(ret);
71317134
}
71327135

7136+
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
7137+
ins.offset, ins.offset, ins.offset, 0);
7138+
if (IS_ERR(em)) {
7139+
struct btrfs_ordered_extent *oe;
7140+
7141+
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7142+
oe = btrfs_lookup_ordered_extent(inode, start);
7143+
ASSERT(oe);
7144+
if (WARN_ON(!oe))
7145+
return em;
7146+
set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
7147+
set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
7148+
btrfs_remove_ordered_extent(inode, oe);
7149+
/* Once for our lookup and once for the ordered extents tree. */
7150+
btrfs_put_ordered_extent(oe);
7151+
btrfs_put_ordered_extent(oe);
7152+
}
71337153
return em;
71347154
}
71357155

fs/btrfs/tree-log.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
41274127
struct inode *inode,
41284128
struct btrfs_path *path,
41294129
struct list_head *logged_list,
4130-
struct btrfs_log_ctx *ctx)
4130+
struct btrfs_log_ctx *ctx,
4131+
const u64 start,
4132+
const u64 end)
41314133
{
41324134
struct extent_map *em, *n;
41334135
struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
41664168
}
41674169

41684170
list_sort(NULL, &extents, extent_cmp);
4169-
4171+
/*
4172+
* Collect any new ordered extents within the range. This is to
4173+
* prevent logging file extent items without waiting for the disk
4174+
* location they point to being written. We do this only to deal
4175+
* with races against concurrent lockless direct IO writes.
4176+
*/
4177+
btrfs_get_logged_extents(inode, logged_list, start, end);
41704178
process:
41714179
while (!list_empty(&extents)) {
41724180
em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
47014709
goto out_unlock;
47024710
}
47034711
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4704-
&logged_list, ctx);
4712+
&logged_list, ctx, start, end);
47054713
if (ret) {
47064714
err = ret;
47074715
goto out_unlock;

0 commit comments

Comments
 (0)