Skip to content

Commit 6070dcc

Browse files
committed
Merge tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fix from David Sterba: "Fix for a deadlock when direct/buffered IO is done on a mmaped file and a fault happens (details in the patch). There's a fstest generic/647 that triggers the problem and makes testing hard" * tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: fix deadlock due to page faults during direct IO reads and writes
2 parents 38764c7 + 51bd956 commit 6070dcc

File tree

1 file changed

+123
-16
lines changed

1 file changed

+123
-16
lines changed

fs/btrfs/file.c

Lines changed: 123 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1912,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
19121912

19131913
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
19141914
{
1915+
const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
19151916
struct file *file = iocb->ki_filp;
19161917
struct inode *inode = file_inode(file);
19171918
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
19181919
loff_t pos;
19191920
ssize_t written = 0;
19201921
ssize_t written_buffered;
1922+
size_t prev_left = 0;
19211923
loff_t endbyte;
19221924
ssize_t err;
19231925
unsigned int ilock_flags = 0;
1924-
struct iomap_dio *dio = NULL;
19251926

19261927
if (iocb->ki_flags & IOCB_NOWAIT)
19271928
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1964,23 +1965,80 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
19641965
goto buffered;
19651966
}
19661967

1967-
dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
1968-
0, 0);
1968+
/*
1969+
* We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
1970+
* calls generic_write_sync() (through iomap_dio_complete()), because
1971+
* that results in calling fsync (btrfs_sync_file()) which will try to
1972+
* lock the inode in exclusive/write mode.
1973+
*/
1974+
if (is_sync_write)
1975+
iocb->ki_flags &= ~IOCB_DSYNC;
19691976

1970-
btrfs_inode_unlock(inode, ilock_flags);
1977+
/*
1978+
* The iov_iter can be mapped to the same file range we are writing to.
1979+
* If that's the case, then we will deadlock in the iomap code, because
1980+
* it first calls our callback btrfs_dio_iomap_begin(), which will create
1981+
* an ordered extent, and after that it will fault in the pages that the
1982+
* iov_iter refers to. During the fault in we end up in the readahead
1983+
* pages code (starting at btrfs_readahead()), which will lock the range,
1984+
* find that ordered extent and then wait for it to complete (at
1985+
* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1986+
* obviously the ordered extent can never complete as we have not yet
1987+
* submitted the respective bio(s). This always happens when the buffer is
1988+
* memory mapped to the same file range, since the iomap DIO code always
1989+
* invalidates pages in the target file range (after starting and waiting
1990+
* for any writeback).
1991+
*
1992+
* So here we disable page faults in the iov_iter and then retry if we
1993+
* got -EFAULT, faulting in the pages before the retry.
1994+
*/
1995+
again:
1996+
from->nofault = true;
1997+
err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
1998+
IOMAP_DIO_PARTIAL, written);
1999+
from->nofault = false;
19712000

1972-
if (IS_ERR_OR_NULL(dio)) {
1973-
err = PTR_ERR_OR_ZERO(dio);
1974-
if (err < 0 && err != -ENOTBLK)
1975-
goto out;
1976-
} else {
1977-
written = iomap_dio_complete(dio);
2001+
/* No increment (+=) because iomap returns a cumulative value. */
2002+
if (err > 0)
2003+
written = err;
2004+
2005+
if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
2006+
const size_t left = iov_iter_count(from);
2007+
/*
2008+
* We have more data left to write. Try to fault in as many of the
2008+
* remaining pages as possible and retry. We do this without
2010+
* releasing and locking again the inode, to prevent races with
2011+
* truncate.
2012+
*
2013+
* Also, in case the iov refers to pages in the file range of the
2014+
* file we want to write to (due to a mmap), we could enter an
2015+
* infinite loop if we retry after faulting the pages in, since
2016+
* iomap will invalidate any pages in the range early on, before
2017+
* it tries to fault in the pages of the iov. So we keep track of
2018+
* how much was left of iov in the previous EFAULT and fall back
2019+
* to buffered IO in case we haven't made any progress.
2020+
*/
2021+
if (left == prev_left) {
2022+
err = -ENOTBLK;
2023+
} else {
2024+
fault_in_iov_iter_readable(from, left);
2025+
prev_left = left;
2026+
goto again;
2027+
}
19782028
}
19792029

1980-
if (written < 0 || !iov_iter_count(from)) {
1981-
err = written;
2030+
btrfs_inode_unlock(inode, ilock_flags);
2031+
2032+
/*
2033+
* Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
2034+
* the fsync (call generic_write_sync()).
2035+
*/
2036+
if (is_sync_write)
2037+
iocb->ki_flags |= IOCB_DSYNC;
2038+
2039+
/* If 'err' is -ENOTBLK then it means we must fall back to buffered IO. */
2040+
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
19822041
goto out;
1983-
}
19842042

19852043
buffered:
19862044
pos = iocb->ki_pos;
@@ -2005,7 +2063,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
20052063
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
20062064
endbyte >> PAGE_SHIFT);
20072065
out:
2008-
return written ? written : err;
2066+
return err < 0 ? err : written;
20092067
}
20102068

20112069
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
36593717
static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
36603718
{
36613719
struct inode *inode = file_inode(iocb->ki_filp);
3720+
size_t prev_left = 0;
3721+
ssize_t read = 0;
36623722
ssize_t ret;
36633723

36643724
if (fsverity_active(inode))
@@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
36683728
return 0;
36693729

36703730
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3731+
again:
3732+
/*
3733+
* This is similar to what we do for direct IO writes, see the comment
3734+
* at btrfs_direct_write(), but we also call pagefault_disable() in addition
3735+
* to disabling page faults at the iov_iter level. This is because when
3736+
* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3737+
* which can still trigger page fault ins despite having set ->nofault
3738+
* to true on our 'to' iov_iter.
3739+
*
3740+
* The difference to direct IO writes is that we deadlock when trying
3741+
* to lock the extent range in the inode's tree during the page reads
3742+
* triggered by the fault in (while for writes it is due to waiting for
3743+
* our own ordered extent). This is because for direct IO reads,
3744+
* btrfs_dio_iomap_begin() returns with the extent range locked, which
3745+
* is only unlocked in the endio callback (end_bio_extent_readpage()).
3746+
*/
3747+
pagefault_disable();
3748+
to->nofault = true;
36713749
ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
3672-
0, 0);
3750+
IOMAP_DIO_PARTIAL, read);
3751+
to->nofault = false;
3752+
pagefault_enable();
3753+
3754+
/* No increment (+=) because iomap returns a cumulative value. */
3755+
if (ret > 0)
3756+
read = ret;
3757+
3758+
if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
3759+
const size_t left = iov_iter_count(to);
3760+
3761+
if (left == prev_left) {
3762+
/*
3763+
* We didn't make any progress since the last attempt,
3764+
* fall back to a buffered read for the remainder of the
3765+
* range. This is just to avoid any possibility of looping
3766+
* for too long.
3767+
*/
3768+
ret = read;
3769+
} else {
3770+
/*
3771+
* We made some progress since the last retry or this is
3772+
* the first time we are retrying. Fault in as many pages
3773+
* as possible and retry.
3774+
*/
3775+
fault_in_iov_iter_writeable(to, left);
3776+
prev_left = left;
3777+
goto again;
3778+
}
3779+
}
36733780
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3674-
return ret;
3781+
return ret < 0 ? ret : read;
36753782
}
36763783

36773784
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)

0 commit comments

Comments
 (0)