Skip to content

Commit af006cb

Browse files
riteshharjaniBrian Maly
authored andcommitted
ext4: Start with shared i_rwsem in case of DIO instead of exclusive
Earlier there was no shared lock in DIO read path. But this patch (16c5468: ext4: Allow parallel DIO reads) simplified some of the locking mechanism while still allowing for parallel DIO reads by adding shared lock in inode DIO read path. But this created problem with mixed read/write workload. It is due to the fact that in DIO path, we first start with exclusive lock and only when we determine that it is a ovewrite IO, we downgrade the lock. This causes the problem, since we still have shared locking in DIO reads. So, this patch tries to fix this issue by starting with shared lock and then switching to exclusive lock only when required based on ext4_dio_write_checks(). Other than that, it also simplifies below cases:- 1. Simplified ext4_unaligned_aio API to ext4_unaligned_io. Previous API was abused in the sense that it was not really checking for AIO anywhere also it used to check for extending writes. So this API was renamed and simplified to ext4_unaligned_io() which actully only checks if the IO is really unaligned. Now, in case of unaligned direct IO, iomap_dio_rw needs to do zeroing of partial block and that will require serialization against other direct IOs in the same block. So we take a exclusive inode lock for any unaligned DIO. In case of AIO we also need to wait for any outstanding IOs to complete so that conversion from unwritten to written is completed before anyone try to map the overlapping block. Hence we take exclusive inode lock and also wait for inode_dio_wait() for unaligned DIO case. Please note since we are anyway taking an exclusive lock in unaligned IO, inode_dio_wait() becomes a no-op in case of non-AIO DIO. 2. Added ext4_extending_io(). This checks if the IO is extending the file. 3. Added ext4_dio_write_checks(). In this we start with shared inode lock and only switch to exclusive lock if required. So in most cases with aligned, non-extending, dioread_nolock & overwrites, it tries to write with a shared lock. If not, then we restart the operation in ext4_dio_write_checks(), after acquiring exclusive lock. Reviewed-by: Jan Kara <[email protected]> Tested-by: Joseph Qi <[email protected]> Signed-off-by: Ritesh Harjani <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Theodore Ts'o <[email protected]> Orabug: 34405736 (cherry picked from commit aa9714d) Signed-off-by: Junxiao Bi <[email protected]> Reviewed-by: Darrick J. Wong <[email protected]> Conflicts: fs/ext4/file.c Signed-off-by: Brian Maly <[email protected]>
1 parent 518e38a commit af006cb

File tree

1 file changed

+137
-49
lines changed

1 file changed

+137
-49
lines changed

fs/ext4/file.c

Lines changed: 137 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -112,28 +112,25 @@ static void ext4_unwritten_wait(struct inode *inode)
112112
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
113113
}
114114

115-
/*
116-
* This tests whether the IO in question is block-aligned or not.
117-
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
118-
* are converted to written only after the IO is complete. Until they are
119-
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
120-
* it needs to zero out portions of the start and/or end block. If 2 AIO
121-
* threads are at work on the same unwritten block, they must be synchronized
122-
* or one thread will zero the other's data, causing corruption.
123-
*/
124-
static int
125-
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
115+
static bool
116+
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
126117
{
127118
struct super_block *sb = inode->i_sb;
128-
int blockmask = sb->s_blocksize - 1;
129-
130-
if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
131-
return 0;
119+
unsigned long blockmask = sb->s_blocksize - 1;
132120

133121
if ((pos | iov_iter_alignment(from)) & blockmask)
134-
return 1;
122+
return true;
135123

136-
return 0;
124+
return false;
125+
}
126+
127+
static bool
128+
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
129+
{
130+
if (offset + len > i_size_read(inode) ||
131+
offset + len > EXT4_I(inode)->i_disksize)
132+
return true;
133+
return false;
137134
}
138135

139136
/* Is IO overwriting allocated and initialized blocks? */
@@ -258,51 +255,131 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
258255
return ret;
259256
}
260257

258+
/*
259+
* The intention here is to start with shared lock acquired then see if any
260+
* condition requires an exclusive inode lock. If yes, then we restart the
261+
* whole operation by releasing the shared lock and acquiring exclusive lock.
262+
*
263+
* - For unaligned_io we never take shared lock as it may cause data corruption
264+
* when two unaligned IO tries to modify the same block e.g. while zeroing.
265+
*
266+
* - For extending writes case we don't take the shared lock, since it requires
267+
* updating inode i_disksize and/or orphan handling with exclusive lock.
268+
*
269+
* - shared locking will only be true mostly with overwrites in dioread_nolock
270+
* mode. Otherwise we will switch to exclusive i_rwsem lock.
271+
*/
272+
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
273+
bool *ilock_shared, bool *extend)
274+
{
275+
struct file *file = iocb->ki_filp;
276+
struct inode *inode = file_inode(file);
277+
loff_t offset;
278+
size_t count;
279+
ssize_t ret;
280+
bool overwrite;
281+
282+
restart:
283+
ret = ext4_write_checks(iocb, from);
284+
if (ret <= 0)
285+
goto out;
286+
287+
offset = iocb->ki_pos;
288+
count = ret;
289+
if (ext4_extending_io(inode, offset, count))
290+
*extend = true;
291+
292+
overwrite = ext4_overwrite_io(inode, offset, count);
293+
if ((iocb->ki_flags & IOCB_NOWAIT) && (extend || !overwrite)) {
294+
ret = -EAGAIN;
295+
goto out;
296+
}
297+
/*
298+
* Determine whether the IO operation will overwrite allocated
299+
* and initialized blocks. If so, check to see whether it is
300+
* possible to take the dioread_nolock path.
301+
*
302+
* We need exclusive i_rwsem for changing security info
303+
* in file_modified().
304+
*/
305+
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
306+
!ext4_should_dioread_nolock(inode) ||
307+
!overwrite)) {
308+
inode_unlock_shared(inode);
309+
*ilock_shared = false;
310+
inode_lock(inode);
311+
goto restart;
312+
}
313+
314+
return count;
315+
out:
316+
if (*ilock_shared)
317+
inode_unlock_shared(inode);
318+
else
319+
inode_unlock(inode);
320+
return ret;
321+
}
322+
261323
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
262324
{
263325
struct file *file = iocb->ki_filp;
264326
struct inode *inode = file_inode(file);
265-
int unaligned_aio = 0;
266-
int overwrite = 0;
267327
ssize_t ret;
268328
struct address_space *mapping = file->f_mapping;
269329
ssize_t status;
270330
loff_t pos, endbyte;
271331
ssize_t err;
332+
loff_t offset = iocb->ki_pos;
333+
size_t count = iov_iter_count(from);
334+
bool extend = false, unaligned_io = false;
335+
bool ilock_shared = true;
336+
int dio_unlocking = 0;
272337

273-
if (!inode_trylock(inode)) {
274-
if (iocb->ki_flags & IOCB_NOWAIT)
275-
return -EAGAIN;
276-
inode_lock(inode);
338+
/*
339+
* We initially start with shared inode lock unless it is
340+
* unaligned IO which needs exclusive lock anyways.
341+
*/
342+
if (ext4_unaligned_io(inode, from, offset)) {
343+
unaligned_io = true;
344+
ilock_shared = false;
345+
}
346+
/*
347+
* Quick check here without any i_rwsem lock to see if it is extending
348+
* IO. A more reliable check is done in ext4_dio_write_checks() with
349+
* proper locking in place.
350+
*/
351+
if (offset + count > i_size_read(inode))
352+
ilock_shared = false;
353+
354+
if (ilock_shared) {
355+
if (!inode_trylock_shared(inode)) {
356+
if (iocb->ki_flags & IOCB_NOWAIT)
357+
return -EAGAIN;
358+
inode_lock_shared(inode);
359+
}
360+
} else {
361+
if (!inode_trylock(inode)) {
362+
if (iocb->ki_flags & IOCB_NOWAIT)
363+
return -EAGAIN;
364+
inode_lock(inode);
365+
}
277366
}
278367

279-
ret = ext4_write_checks(iocb, from);
368+
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
280369
if (ret <= 0)
281-
goto err_out;
282-
370+
return ret;
283371
/*
284-
* Unaligned direct AIO must be serialized among each other as zeroing
285-
* of partial blocks of two competing unaligned AIOs can result in data
372+
* Unaligned direct IO must be serialized among each other as zeroing
373+
* of partial blocks of two competing unaligned IOs can result in data
286374
* corruption.
375+
*
376+
* So we make sure we don't allow any unaligned IO in flight.
377+
* For IOs where we need not wait (like unaligned non-AIO DIO),
378+
* below ext4_unwritten_wait() may anyway become a no-op, since we start
379+
* with exclusive lock.
287380
*/
288-
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
289-
!is_sync_kiocb(iocb) &&
290-
ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
291-
unaligned_aio = 1;
381+
if (unaligned_io)
292382
ext4_unwritten_wait(inode);
293-
}
294-
295-
iocb->private = &overwrite;
296-
/* Check whether we do a DIO overwrite or not */
297-
if (!unaligned_aio) {
298-
if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
299-
if (ext4_should_dioread_nolock(inode))
300-
overwrite = 1;
301-
} else if (iocb->ki_flags & IOCB_NOWAIT) {
302-
ret = -EAGAIN;
303-
goto err_out;
304-
}
305-
}
306383

307384
ret = file_remove_privs(file);
308385
if (ret)
@@ -312,6 +389,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
312389
if (ret)
313390
goto err_out;
314391

392+
/* dio_unlock is always zero which will skip all unlocking in dio overwrite case. */
393+
iocb->private = &dio_unlocking;
315394
ret = generic_file_direct_write(iocb, from);
316395
/*
317396
* If the write stopped short of completing, fall back to
@@ -323,7 +402,10 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
323402
if (ret < 0 || !iov_iter_count(from))
324403
goto out;
325404

326-
inode_unlock(inode);
405+
if (ilock_shared)
406+
inode_unlock_shared(inode);
407+
else
408+
inode_unlock(inode);
327409

328410
/*direct io partial done, fallen into buffer io. */
329411
pos = iocb->ki_pos;
@@ -358,17 +440,23 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
358440
* overlapping aligned IO after unaligned might result in data
359441
* corruption.
360442
*/
361-
if (ret == -EIOCBQUEUED && unaligned_aio)
443+
if (ret == -EIOCBQUEUED && unaligned_io)
362444
ext4_unwritten_wait(inode);
363-
inode_unlock(inode);
445+
if (ilock_shared)
446+
inode_unlock_shared(inode);
447+
else
448+
inode_unlock(inode);
364449

365450
if (ret > 0)
366451
ret = generic_write_sync(iocb, ret);
367452

368453
return ret;
369454

370455
err_out:
371-
inode_unlock(inode);
456+
if (ilock_shared)
457+
inode_unlock_shared(inode);
458+
else
459+
inode_unlock(inode);
372460
return ret;
373461
}
374462

0 commit comments

Comments
 (0)