Skip to content

Commit 914f82a

Browse files
jankara authored and tytso committed
ext4: refactor direct IO code
Currently ext4 direct IO handling is split between ext4_ext_direct_IO() and ext4_ind_direct_IO(). However, the extent-based function calls into the indirect-based one for some cases; for example, it is not able to handle file extending. Previously it also did not properly handle retries in case of ENOSPC errors. With DAX things would get even more contrived, so just refactor the direct IO code and, instead of the indirect / extent split, split it into reads vs. writes. Signed-off-by: Jan Kara <[email protected]> Signed-off-by: Theodore Ts'o <[email protected]>
1 parent dbc427c commit 914f82a

File tree

3 files changed

+114
-146
lines changed

3 files changed

+114
-146
lines changed

fs/ext4/ext4.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2587,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
25872587
/* indirect.c */
25882588
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
25892589
struct ext4_map_blocks *map, int flags);
2590-
extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2591-
loff_t offset);
25922590
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
25932591
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
25942592
extern void ext4_ind_truncate(handle_t *, struct inode *inode);

fs/ext4/indirect.c

Lines changed: 0 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -648,133 +648,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
648648
return err;
649649
}
650650

651-
/*
652-
* O_DIRECT for ext3 (or indirect map) based files
653-
*
654-
* If the O_DIRECT write will extend the file then add this inode to the
655-
* orphan list. So recovery will truncate it back to the original size
656-
* if the machine crashes during the write.
657-
*
658-
* If the O_DIRECT write is intantiating holes inside i_size and the machine
659-
* crashes then stale disk data _may_ be exposed inside the file. But current
660-
* VFS code falls back into buffered path in that case so we are safe.
661-
*/
662-
ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
663-
loff_t offset)
664-
{
665-
struct file *file = iocb->ki_filp;
666-
struct inode *inode = file->f_mapping->host;
667-
struct ext4_inode_info *ei = EXT4_I(inode);
668-
handle_t *handle;
669-
ssize_t ret;
670-
int orphan = 0;
671-
size_t count = iov_iter_count(iter);
672-
int retries = 0;
673-
674-
if (iov_iter_rw(iter) == WRITE) {
675-
loff_t final_size = offset + count;
676-
677-
if (final_size > inode->i_size) {
678-
/* Credits for sb + inode write */
679-
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
680-
if (IS_ERR(handle)) {
681-
ret = PTR_ERR(handle);
682-
goto out;
683-
}
684-
ret = ext4_orphan_add(handle, inode);
685-
if (ret) {
686-
ext4_journal_stop(handle);
687-
goto out;
688-
}
689-
orphan = 1;
690-
ei->i_disksize = inode->i_size;
691-
ext4_journal_stop(handle);
692-
}
693-
}
694-
695-
retry:
696-
if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
697-
/*
698-
* Nolock dioread optimization may be dynamically disabled
699-
* via ext4_inode_block_unlocked_dio(). Check inode's state
700-
* while holding extra i_dio_count ref.
701-
*/
702-
inode_dio_begin(inode);
703-
smp_mb();
704-
if (unlikely(ext4_test_inode_state(inode,
705-
EXT4_STATE_DIOREAD_LOCK))) {
706-
inode_dio_end(inode);
707-
goto locked;
708-
}
709-
if (IS_DAX(inode))
710-
ret = dax_do_io(iocb, inode, iter, offset,
711-
ext4_dio_get_block, NULL, 0);
712-
else
713-
ret = __blockdev_direct_IO(iocb, inode,
714-
inode->i_sb->s_bdev, iter,
715-
offset, ext4_dio_get_block,
716-
NULL, NULL, 0);
717-
inode_dio_end(inode);
718-
} else {
719-
locked:
720-
if (IS_DAX(inode))
721-
ret = dax_do_io(iocb, inode, iter, offset,
722-
ext4_dio_get_block, NULL, DIO_LOCKING);
723-
else
724-
ret = blockdev_direct_IO(iocb, inode, iter, offset,
725-
ext4_dio_get_block);
726-
727-
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
728-
loff_t isize = i_size_read(inode);
729-
loff_t end = offset + count;
730-
731-
if (end > isize)
732-
ext4_truncate_failed_write(inode);
733-
}
734-
}
735-
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
736-
goto retry;
737-
738-
if (orphan) {
739-
int err;
740-
741-
/* Credits for sb + inode write */
742-
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
743-
if (IS_ERR(handle)) {
744-
/* This is really bad luck. We've written the data
745-
* but cannot extend i_size. Bail out and pretend
746-
* the write failed... */
747-
ret = PTR_ERR(handle);
748-
if (inode->i_nlink)
749-
ext4_orphan_del(NULL, inode);
750-
751-
goto out;
752-
}
753-
if (inode->i_nlink)
754-
ext4_orphan_del(handle, inode);
755-
if (ret > 0) {
756-
loff_t end = offset + ret;
757-
if (end > inode->i_size) {
758-
ei->i_disksize = end;
759-
i_size_write(inode, end);
760-
/*
761-
* We're going to return a positive `ret'
762-
* here due to non-zero-length I/O, so there's
763-
* no way of reporting error returns from
764-
* ext4_mark_inode_dirty() to userspace. So
765-
* ignore it.
766-
*/
767-
ext4_mark_inode_dirty(handle, inode);
768-
}
769-
}
770-
err = ext4_journal_stop(handle);
771-
if (ret == 0)
772-
ret = err;
773-
}
774-
out:
775-
return ret;
776-
}
777-
778651
/*
779652
* Calculate the number of metadata blocks need to reserve
780653
* to allocate a new block at @lblocks for non extent file based file

fs/ext4/inode.c

Lines changed: 114 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3295,7 +3295,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
32953295
}
32963296

32973297
/*
3298-
* For ext4 extent files, ext4 will do direct-io write to holes,
3298+
* Handling of direct IO writes.
3299+
*
3300+
* For ext4 extent files, ext4 will do direct-io write even to holes,
32993301
* preallocated extents, and those write extend the file, no need to
33003302
* fall back to buffered IO.
33013303
*
@@ -3313,21 +3315,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
33133315
* if the machine crashes during the write.
33143316
*
33153317
*/
3316-
static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3317-
loff_t offset)
3318+
static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
3319+
loff_t offset)
33183320
{
33193321
struct file *file = iocb->ki_filp;
33203322
struct inode *inode = file->f_mapping->host;
3323+
struct ext4_inode_info *ei = EXT4_I(inode);
33213324
ssize_t ret;
33223325
size_t count = iov_iter_count(iter);
33233326
int overwrite = 0;
33243327
get_block_t *get_block_func = NULL;
33253328
int dio_flags = 0;
33263329
loff_t final_size = offset + count;
3330+
int orphan = 0;
3331+
handle_t *handle;
33273332

3328-
/* Use the old path for reads and writes beyond i_size. */
3329-
if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
3330-
return ext4_ind_direct_IO(iocb, iter, offset);
3333+
if (final_size > inode->i_size) {
3334+
/* Credits for sb + inode write */
3335+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3336+
if (IS_ERR(handle)) {
3337+
ret = PTR_ERR(handle);
3338+
goto out;
3339+
}
3340+
ret = ext4_orphan_add(handle, inode);
3341+
if (ret) {
3342+
ext4_journal_stop(handle);
3343+
goto out;
3344+
}
3345+
orphan = 1;
3346+
ei->i_disksize = inode->i_size;
3347+
ext4_journal_stop(handle);
3348+
}
33313349

33323350
BUG_ON(iocb->private == NULL);
33333351

@@ -3336,8 +3354,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
33363354
* conversion. This also disallows race between truncate() and
33373355
* overwrite DIO as i_dio_count needs to be incremented under i_mutex.
33383356
*/
3339-
if (iov_iter_rw(iter) == WRITE)
3340-
inode_dio_begin(inode);
3357+
inode_dio_begin(inode);
33413358

33423359
/* If we do a overwrite dio, i_mutex locking can be released */
33433360
overwrite = *((int *)iocb->private);
@@ -3346,7 +3363,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
33463363
inode_unlock(inode);
33473364

33483365
/*
3349-
* We could direct write to holes and fallocate.
3366+
* For extent mapped files we could direct write to holes and fallocate.
33503367
*
33513368
* Allocated blocks to fill the hole are marked as unwritten to prevent
33523369
* parallel buffered read to expose the stale data before DIO complete
@@ -3368,7 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
33683385
iocb->private = NULL;
33693386
if (overwrite)
33703387
get_block_func = ext4_dio_get_block_overwrite;
3371-
else if (is_sync_kiocb(iocb)) {
3388+
else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
3389+
round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
3390+
get_block_func = ext4_dio_get_block;
3391+
dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
3392+
} else if (is_sync_kiocb(iocb)) {
33723393
get_block_func = ext4_dio_get_block_unwritten_sync;
33733394
dio_flags = DIO_LOCKING;
33743395
} else {
@@ -3378,10 +3399,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
33783399
#ifdef CONFIG_EXT4_FS_ENCRYPTION
33793400
BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
33803401
#endif
3381-
if (IS_DAX(inode))
3402+
if (IS_DAX(inode)) {
3403+
dio_flags &= ~DIO_SKIP_HOLES;
33823404
ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
33833405
ext4_end_io_dio, dio_flags);
3384-
else
3406+
} else
33853407
ret = __blockdev_direct_IO(iocb, inode,
33863408
inode->i_sb->s_bdev, iter, offset,
33873409
get_block_func,
@@ -3401,12 +3423,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
34013423
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
34023424
}
34033425

3404-
if (iov_iter_rw(iter) == WRITE)
3405-
inode_dio_end(inode);
3426+
inode_dio_end(inode);
34063427
/* take i_mutex locking again if we do a ovewrite dio */
34073428
if (overwrite)
34083429
inode_lock(inode);
34093430

3431+
if (ret < 0 && final_size > inode->i_size)
3432+
ext4_truncate_failed_write(inode);
3433+
3434+
/* Handle extending of i_size after direct IO write */
3435+
if (orphan) {
3436+
int err;
3437+
3438+
/* Credits for sb + inode write */
3439+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3440+
if (IS_ERR(handle)) {
3441+
/* This is really bad luck. We've written the data
3442+
* but cannot extend i_size. Bail out and pretend
3443+
* the write failed... */
3444+
ret = PTR_ERR(handle);
3445+
if (inode->i_nlink)
3446+
ext4_orphan_del(NULL, inode);
3447+
3448+
goto out;
3449+
}
3450+
if (inode->i_nlink)
3451+
ext4_orphan_del(handle, inode);
3452+
if (ret > 0) {
3453+
loff_t end = offset + ret;
3454+
if (end > inode->i_size) {
3455+
ei->i_disksize = end;
3456+
i_size_write(inode, end);
3457+
/*
3458+
* We're going to return a positive `ret'
3459+
* here due to non-zero-length I/O, so there's
3460+
* no way of reporting error returns from
3461+
* ext4_mark_inode_dirty() to userspace. So
3462+
* ignore it.
3463+
*/
3464+
ext4_mark_inode_dirty(handle, inode);
3465+
}
3466+
}
3467+
err = ext4_journal_stop(handle);
3468+
if (ret == 0)
3469+
ret = err;
3470+
}
3471+
out:
3472+
return ret;
3473+
}
3474+
3475+
static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
3476+
loff_t offset)
3477+
{
3478+
int unlocked = 0;
3479+
struct inode *inode = iocb->ki_filp->f_mapping->host;
3480+
ssize_t ret;
3481+
3482+
if (ext4_should_dioread_nolock(inode)) {
3483+
/*
3484+
* Nolock dioread optimization may be dynamically disabled
3485+
* via ext4_inode_block_unlocked_dio(). Check inode's state
3486+
* while holding extra i_dio_count ref.
3487+
*/
3488+
inode_dio_begin(inode);
3489+
smp_mb();
3490+
if (unlikely(ext4_test_inode_state(inode,
3491+
EXT4_STATE_DIOREAD_LOCK)))
3492+
inode_dio_end(inode);
3493+
else
3494+
unlocked = 1;
3495+
}
3496+
if (IS_DAX(inode)) {
3497+
ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
3498+
NULL, unlocked ? 0 : DIO_LOCKING);
3499+
} else {
3500+
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
3501+
iter, offset, ext4_dio_get_block,
3502+
NULL, NULL,
3503+
unlocked ? 0 : DIO_LOCKING);
3504+
}
3505+
if (unlocked)
3506+
inode_dio_end(inode);
34103507
return ret;
34113508
}
34123509

@@ -3434,10 +3531,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
34343531
return 0;
34353532

34363533
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
3437-
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3438-
ret = ext4_ext_direct_IO(iocb, iter, offset);
3534+
if (iov_iter_rw(iter) == READ)
3535+
ret = ext4_direct_IO_read(iocb, iter, offset);
34393536
else
3440-
ret = ext4_ind_direct_IO(iocb, iter, offset);
3537+
ret = ext4_direct_IO_write(iocb, iter, offset);
34413538
trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
34423539
return ret;
34433540
}

0 commit comments

Comments
 (0)