Skip to content

Commit aa9714d

Browse files
riteshharjanitytso
authored andcommitted
ext4: Start with shared i_rwsem in case of DIO instead of exclusive
Earlier there was no shared lock in DIO read path. But this patch (16c5468: ext4: Allow parallel DIO reads) simplified some of the locking mechanism while still allowing for parallel DIO reads by adding shared lock in inode DIO read path. But this created problem with mixed read/write workload. It is due to the fact that in DIO path, we first start with exclusive lock and only when we determine that it is a ovewrite IO, we downgrade the lock. This causes the problem, since we still have shared locking in DIO reads. So, this patch tries to fix this issue by starting with shared lock and then switching to exclusive lock only when required based on ext4_dio_write_checks(). Other than that, it also simplifies below cases:- 1. Simplified ext4_unaligned_aio API to ext4_unaligned_io. Previous API was abused in the sense that it was not really checking for AIO anywhere also it used to check for extending writes. So this API was renamed and simplified to ext4_unaligned_io() which actully only checks if the IO is really unaligned. Now, in case of unaligned direct IO, iomap_dio_rw needs to do zeroing of partial block and that will require serialization against other direct IOs in the same block. So we take a exclusive inode lock for any unaligned DIO. In case of AIO we also need to wait for any outstanding IOs to complete so that conversion from unwritten to written is completed before anyone try to map the overlapping block. Hence we take exclusive inode lock and also wait for inode_dio_wait() for unaligned DIO case. Please note since we are anyway taking an exclusive lock in unaligned IO, inode_dio_wait() becomes a no-op in case of non-AIO DIO. 2. Added ext4_extending_io(). This checks if the IO is extending the file. 3. Added ext4_dio_write_checks(). In this we start with shared inode lock and only switch to exclusive lock if required. So in most cases with aligned, non-extending, dioread_nolock & overwrites, it tries to write with a shared lock. If not, then we restart the operation in ext4_dio_write_checks(), after acquiring exclusive lock. Reviewed-by: Jan Kara <[email protected]> Tested-by: Joseph Qi <[email protected]> Signed-off-by: Ritesh Harjani <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Theodore Ts'o <[email protected]>
1 parent f629afe commit aa9714d

File tree

1 file changed

+142
-49
lines changed

1 file changed

+142
-49
lines changed

fs/ext4/file.c

Lines changed: 142 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -166,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
166166
* threads are at work on the same unwritten block, they must be synchronized
167167
* or one thread will zero the other's data, causing corruption.
168168
*/
169-
static int
170-
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
169+
static bool
170+
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
171171
{
172172
struct super_block *sb = inode->i_sb;
173-
int blockmask = sb->s_blocksize - 1;
174-
175-
if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
176-
return 0;
173+
unsigned long blockmask = sb->s_blocksize - 1;
177174

178175
if ((pos | iov_iter_alignment(from)) & blockmask)
179-
return 1;
176+
return true;
180177

181-
return 0;
178+
return false;
179+
}
180+
181+
static bool
182+
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
183+
{
184+
if (offset + len > i_size_read(inode) ||
185+
offset + len > EXT4_I(inode)->i_disksize)
186+
return true;
187+
return false;
182188
}
183189

184190
/* Is IO overwriting allocated and initialized blocks? */
@@ -204,7 +210,8 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
204210
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
205211
}
206212

207-
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
213+
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
214+
struct iov_iter *from)
208215
{
209216
struct inode *inode = file_inode(iocb->ki_filp);
210217
ssize_t ret;
@@ -228,11 +235,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
228235
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
229236
}
230237

238+
return iov_iter_count(from);
239+
}
240+
241+
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
242+
{
243+
ssize_t ret, count;
244+
245+
count = ext4_generic_write_checks(iocb, from);
246+
if (count <= 0)
247+
return count;
248+
231249
ret = file_modified(iocb->ki_filp);
232250
if (ret)
233251
return ret;
234-
235-
return iov_iter_count(from);
252+
return count;
236253
}
237254

238255
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
@@ -364,62 +381,139 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
364381
.end_io = ext4_dio_write_end_io,
365382
};
366383

384+
/*
385+
* The intention here is to start with shared lock acquired then see if any
386+
* condition requires an exclusive inode lock. If yes, then we restart the
387+
* whole operation by releasing the shared lock and acquiring exclusive lock.
388+
*
389+
* - For unaligned_io we never take shared lock as it may cause data corruption
390+
* when two unaligned IO tries to modify the same block e.g. while zeroing.
391+
*
392+
* - For extending writes case we don't take the shared lock, since it requires
393+
* updating inode i_disksize and/or orphan handling with exclusive lock.
394+
*
395+
* - shared locking will only be true mostly with overwrites in dioread_nolock
396+
* mode. Otherwise we will switch to exclusive i_rwsem lock.
397+
*/
398+
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
399+
bool *ilock_shared, bool *extend)
400+
{
401+
struct file *file = iocb->ki_filp;
402+
struct inode *inode = file_inode(file);
403+
loff_t offset;
404+
size_t count;
405+
ssize_t ret;
406+
407+
restart:
408+
ret = ext4_generic_write_checks(iocb, from);
409+
if (ret <= 0)
410+
goto out;
411+
412+
offset = iocb->ki_pos;
413+
count = ret;
414+
if (ext4_extending_io(inode, offset, count))
415+
*extend = true;
416+
/*
417+
* Determine whether the IO operation will overwrite allocated
418+
* and initialized blocks. If so, check to see whether it is
419+
* possible to take the dioread_nolock path.
420+
*
421+
* We need exclusive i_rwsem for changing security info
422+
* in file_modified().
423+
*/
424+
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
425+
!ext4_should_dioread_nolock(inode) ||
426+
!ext4_overwrite_io(inode, offset, count))) {
427+
inode_unlock_shared(inode);
428+
*ilock_shared = false;
429+
inode_lock(inode);
430+
goto restart;
431+
}
432+
433+
ret = file_modified(file);
434+
if (ret < 0)
435+
goto out;
436+
437+
return count;
438+
out:
439+
if (*ilock_shared)
440+
inode_unlock_shared(inode);
441+
else
442+
inode_unlock(inode);
443+
return ret;
444+
}
445+
367446
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
368447
{
369448
ssize_t ret;
370-
size_t count;
371-
loff_t offset;
372449
handle_t *handle;
373450
struct inode *inode = file_inode(iocb->ki_filp);
374-
bool extend = false, overwrite = false, unaligned_aio = false;
451+
loff_t offset = iocb->ki_pos;
452+
size_t count = iov_iter_count(from);
453+
bool extend = false, unaligned_io = false;
454+
bool ilock_shared = true;
455+
456+
/*
457+
* We initially start with shared inode lock unless it is
458+
* unaligned IO which needs exclusive lock anyways.
459+
*/
460+
if (ext4_unaligned_io(inode, from, offset)) {
461+
unaligned_io = true;
462+
ilock_shared = false;
463+
}
464+
/*
465+
* Quick check here without any i_rwsem lock to see if it is extending
466+
* IO. A more reliable check is done in ext4_dio_write_checks() with
467+
* proper locking in place.
468+
*/
469+
if (offset + count > i_size_read(inode))
470+
ilock_shared = false;
375471

376472
if (iocb->ki_flags & IOCB_NOWAIT) {
377-
if (!inode_trylock(inode))
378-
return -EAGAIN;
473+
if (ilock_shared) {
474+
if (!inode_trylock_shared(inode))
475+
return -EAGAIN;
476+
} else {
477+
if (!inode_trylock(inode))
478+
return -EAGAIN;
479+
}
379480
} else {
380-
inode_lock(inode);
481+
if (ilock_shared)
482+
inode_lock_shared(inode);
483+
else
484+
inode_lock(inode);
381485
}
382486

487+
/* Fallback to buffered I/O if the inode does not support direct I/O. */
383488
if (!ext4_dio_supported(inode)) {
384-
inode_unlock(inode);
385-
/*
386-
* Fallback to buffered I/O if the inode does not support
387-
* direct I/O.
388-
*/
489+
if (ilock_shared)
490+
inode_unlock_shared(inode);
491+
else
492+
inode_unlock(inode);
389493
return ext4_buffered_write_iter(iocb, from);
390494
}
391495

392-
ret = ext4_write_checks(iocb, from);
393-
if (ret <= 0) {
394-
inode_unlock(inode);
496+
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
497+
if (ret <= 0)
395498
return ret;
396-
}
397499

398-
/*
399-
* Unaligned asynchronous direct I/O must be serialized among each
400-
* other as the zeroing of partial blocks of two competing unaligned
401-
* asynchronous direct I/O writes can result in data corruption.
402-
*/
403500
offset = iocb->ki_pos;
404-
count = iov_iter_count(from);
405-
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
406-
!is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
407-
unaligned_aio = true;
408-
inode_dio_wait(inode);
409-
}
501+
count = ret;
410502

411503
/*
412-
* Determine whether the I/O will overwrite allocated and initialized
413-
* blocks. If so, check to see whether it is possible to take the
414-
* dioread_nolock path.
504+
* Unaligned direct IO must be serialized among each other as zeroing
505+
* of partial blocks of two competing unaligned IOs can result in data
506+
* corruption.
507+
*
508+
* So we make sure we don't allow any unaligned IO in flight.
509+
* For IOs where we need not wait (like unaligned non-AIO DIO),
510+
* below inode_dio_wait() may anyway become a no-op, since we start
511+
* with exclusive lock.
415512
*/
416-
if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
417-
ext4_should_dioread_nolock(inode)) {
418-
overwrite = true;
419-
downgrade_write(&inode->i_rwsem);
420-
}
513+
if (unaligned_io)
514+
inode_dio_wait(inode);
421515

422-
if (offset + count > EXT4_I(inode)->i_disksize) {
516+
if (extend) {
423517
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
424518
if (IS_ERR(handle)) {
425519
ret = PTR_ERR(handle);
@@ -432,18 +526,17 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
432526
goto out;
433527
}
434528

435-
extend = true;
436529
ext4_journal_stop(handle);
437530
}
438531

439532
ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
440-
is_sync_kiocb(iocb) || unaligned_aio || extend);
533+
is_sync_kiocb(iocb) || unaligned_io || extend);
441534

442535
if (extend)
443536
ret = ext4_handle_inode_extension(inode, offset, ret, count);
444537

445538
out:
446-
if (overwrite)
539+
if (ilock_shared)
447540
inode_unlock_shared(inode);
448541
else
449542
inode_unlock(inode);

0 commit comments

Comments
 (0)