@@ -1912,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
 
 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
+	const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	loff_t pos;
 	ssize_t written = 0;
 	ssize_t written_buffered;
+	size_t prev_left = 0;
 	loff_t endbyte;
 	ssize_t err;
 	unsigned int ilock_flags = 0;
-	struct iomap_dio *dio = NULL;
 
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1964,23 +1965,80 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 		goto buffered;
 	}
 
-	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			     0, 0);
+	/*
+	 * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
+	 * calls generic_write_sync() (through iomap_dio_complete()), because
+	 * that results in calling fsync (btrfs_sync_file()) which will try to
+	 * lock the inode in exclusive/write mode.
+	 */
+	if (is_sync_write)
+		iocb->ki_flags &= ~IOCB_DSYNC;
 
-	btrfs_inode_unlock(inode, ilock_flags);
+	/*
+	 * The iov_iter can be mapped to the same file range we are writing to.
+	 * If that's the case, then we will deadlock in the iomap code, because
+	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
+	 * an ordered extent, and after that it will fault in the pages that the
+	 * iov_iter refers to. During the fault in we end up in the readahead
+	 * pages code (starting at btrfs_readahead()), which will lock the range,
+	 * find that ordered extent and then wait for it to complete (at
+	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+	 * obviously the ordered extent can never complete as we didn't yet
+	 * submit the respective bio(s). This always happens when the buffer is
+	 * memory mapped to the same file range, since the iomap DIO code always
+	 * invalidates pages in the target file range (after starting and waiting
+	 * for any writeback).
+	 *
+	 * So here we disable page faults in the iov_iter and then retry if we
+	 * got -EFAULT, faulting in the pages before the retry.
+	 */
+again:
+	from->nofault = true;
+	err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+			   IOMAP_DIO_PARTIAL, written);
+	from->nofault = false;
 
-	if (IS_ERR_OR_NULL(dio)) {
-		err = PTR_ERR_OR_ZERO(dio);
-		if (err < 0 && err != -ENOTBLK)
-			goto out;
-	} else {
-		written = iomap_dio_complete(dio);
+	/* No increment (+=) because iomap returns a cumulative value. */
+	if (err > 0)
+		written = err;
+
+	if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+		const size_t left = iov_iter_count(from);
+		/*
+		 * We have more data left to write. Try to fault in as many
+		 * of the remaining pages as possible and retry. We do this
+		 * without releasing and re-locking the inode, to prevent
+		 * races with truncate.
+		 *
+		 * Also, in case the iov refers to pages in the file range of the
+		 * file we want to write to (due to a mmap), we could enter an
+		 * infinite loop if we retry after faulting the pages in, since
+		 * iomap will invalidate any pages in the range early on, before
+		 * it tries to fault in the pages of the iov. So we keep track of
+		 * how much of the iov was left in the previous EFAULT and fall
+		 * back to buffered IO in case we haven't made any progress.
+		 */
+		if (left == prev_left) {
+			err = -ENOTBLK;
+		} else {
+			fault_in_iov_iter_readable(from, left);
+			prev_left = left;
+			goto again;
+		}
 	}
 
-	if (written < 0 || !iov_iter_count(from)) {
-		err = written;
+	btrfs_inode_unlock(inode, ilock_flags);
+
+	/*
+	 * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
+	 * the fsync (call generic_write_sync()).
+	 */
+	if (is_sync_write)
+		iocb->ki_flags |= IOCB_DSYNC;
+
+	/* If 'err' is -ENOTBLK it means we must fall back to buffered IO. */
+	if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
 		goto out;
-	}
 
 buffered:
 	pos = iocb->ki_pos;
@@ -2005,7 +2063,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
 				 endbyte >> PAGE_SHIFT);
 out:
-	return written ? written : err;
+	return err < 0 ? err : written;
 }
 
 static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
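The write-path change above boils down to a fault-in-and-retry loop around iomap_dio_rw(). Below is a minimal sketch of that pattern in isolation, assuming a v5.16-era kernel where iomap_dio_rw() takes the dio_flags and done_before arguments; my_iomap_ops and my_dio_ops are hypothetical stand-ins for a filesystem's iomap callbacks, not btrfs's actual ops:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uio.h>

/* Hypothetical callbacks a filesystem would provide. */
extern const struct iomap_ops my_iomap_ops;
extern const struct iomap_dio_ops my_dio_ops;

/*
 * Hypothetical helper, not btrfs code: retry direct IO after faulting in
 * the source pages, and report -ENOTBLK when no forward progress is made
 * so the caller can fall back to buffered IO.
 */
static ssize_t dio_write_fault_retry(struct kiocb *iocb, struct iov_iter *from)
{
	size_t prev_left = 0;	/* bytes still pending after last attempt */
	ssize_t written = 0;	/* cumulative bytes written so far */
	ssize_t ret;

again:
	/* No page faults while iomap may hold locks or ordered extents. */
	from->nofault = true;
	ret = iomap_dio_rw(iocb, from, &my_iomap_ops, &my_dio_ops,
			   IOMAP_DIO_PARTIAL, written);
	from->nofault = false;

	/* With IOMAP_DIO_PARTIAL the return value is cumulative. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		size_t left = iov_iter_count(from);

		if (left == prev_left)
			return -ENOTBLK;	/* no progress: fall back */
		fault_in_iov_iter_readable(from, left);
		prev_left = left;
		goto again;
	}
	return ret < 0 ? ret : written;
}

Passing the cumulative 'written' as done_before is what lets generic_write_sync() later sync everything written across all attempts, not just the last one.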
@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t prev_left = 0;
+	ssize_t read = 0;
 	ssize_t ret;
 
 	if (fsverity_active(inode))
@@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 		return 0;
 
 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+again:
+	/*
+	 * This is similar to what we do for direct IO writes, see the comment
+	 * at btrfs_direct_write(), but we also disable page faults in addition
+	 * to disabling them only at the iov_iter level. This is because when
+	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+	 * which can still trigger page faults despite having set ->nofault to
+	 * true on our 'to' iov_iter.
+	 *
+	 * The difference to direct IO writes is that we deadlock when trying
+	 * to lock the extent range in the inode's tree during the page reads
+	 * triggered by the fault in (while for writes it is due to waiting for
+	 * our own ordered extent). This is because for direct IO reads,
+	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
+	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
+	 */
+	pagefault_disable();
+	to->nofault = true;
 	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			   0, 0);
+			   IOMAP_DIO_PARTIAL, read);
+	to->nofault = false;
+	pagefault_enable();
+
+	/* No increment (+=) because iomap returns a cumulative value. */
+	if (ret > 0)
+		read = ret;
+
+	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+		const size_t left = iov_iter_count(to);
+
+		if (left == prev_left) {
+			/*
+			 * We didn't make any progress since the last attempt,
+			 * fall back to a buffered read for the remainder of
+			 * the range. This is just to avoid any possibility of
+			 * looping for too long.
+			 */
+			ret = read;
+		} else {
+			/*
+			 * We made some progress since the last retry or this
+			 * is the first time we are retrying. Fault in as many
+			 * pages as possible and retry.
+			 */
+			fault_in_iov_iter_writeable(to, left);
+			prev_left = left;
+			goto again;
+		}
+	}
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-	return ret;
+	return ret < 0 ? ret : read;
 }
 
 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
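The deadlock described in both comment blocks is triggered when the O_DIRECT user buffer is a memory mapping of the same file range being read or written. The following is a hedged userspace sketch of such a reproducer, not a verified test case; the mount point /mnt/test, the file name and the 1 MiB size are illustrative assumptions. On kernels with this change the IO completes, falling back to buffered IO when no progress can be made:

#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 1U << 20;	/* 1 MiB, satisfies O_DIRECT alignment */
	int fd = open("/mnt/test/foo", O_CREAT | O_RDWR | O_DIRECT, 0600);
	char *map;

	if (fd < 0)
		err(1, "open");
	if (ftruncate(fd, len) < 0)
		err(1, "ftruncate");
	/* Map the same file range we are about to use as the IO buffer. */
	map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err(1, "mmap");
	memset(map, 0xab, len);	/* dirty the pages */

	/* Write: the source buffer is the destination range. Faulting in
	 * 'map' ends up waiting on the ordered extent the DIO write itself
	 * created, which is the write-side deadlock described above. */
	if (pwrite(fd, map, len, 0) < 0)
		err(1, "pwrite");

	/* Read: the destination buffer is the source range. Faulting in
	 * 'map' triggers readahead on the extent range the DIO read holds
	 * locked, which is the read-side deadlock described above. */
	if (pread(fd, map, len, 0) < 0)
		err(1, "pread");

	close(fd);
	return 0;
}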