Skip to content

Commit fe0f07d

Browse files
axboeAl Viro
authored and committed
direct-io: only inc/dec inode->i_dio_count for file systems
do_blockdev_direct_IO() increments and decrements the inode ->i_dio_count for each IO operation. It does this to protect against truncate of a file. Block devices don't need this sort of protection. For a capable multiqueue setup, this atomic int is the only shared state between applications accessing the device for O_DIRECT, and it presents a scaling wall for that. In my testing, as much as 30% of system time is spent incrementing and decrementing this value. A mixed read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with better latencies too. Before: clat percentiles (usec): | 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34], | 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35], | 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80], | 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155], | 99.99th=[ 165] After: clat percentiles (usec): | 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149], | 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171], | 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270], | 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422], | 99.99th=[ 438] In other setups, Robert Elliott reported seeing good performance improvements: https://lkml.org/lkml/2015/4/3/557 The more applications accessing the device, the worse it gets. Add a new direct-io flag, DIO_SKIP_DIO_COUNT, which tells do_blockdev_direct_IO() that it need not worry about incrementing or decrementing the inode i_dio_count for this caller. Cc: Andrew Morton <[email protected]> Cc: Christoph Hellwig <[email protected]> Cc: Theodore Ts'o <[email protected]> Cc: Elliott, Robert (Server Storage) <[email protected]> Cc: Al Viro <[email protected]> Signed-off-by: Jens Axboe <[email protected]> Signed-off-by: Al Viro <[email protected]>
1 parent 8e3c500 commit fe0f07d

File tree

9 files changed

+50
-33
lines changed

9 files changed

+50
-33
lines changed

fs/block_dev.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
152152
struct inode *inode = file->f_mapping->host;
153153

154154
return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
155-
blkdev_get_block, NULL, NULL, 0);
155+
blkdev_get_block, NULL, NULL,
156+
DIO_SKIP_DIO_COUNT);
156157
}
157158

158159
int __sync_blockdev(struct block_device *bdev, int wait)

fs/btrfs/inode.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8129,7 +8129,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
81298129
if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
81308130
return 0;
81318131

8132-
atomic_inc(&inode->i_dio_count);
8132+
inode_dio_begin(inode);
81338133
smp_mb__after_atomic();
81348134

81358135
/*
@@ -8169,7 +8169,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
81698169
current->journal_info = &outstanding_extents;
81708170
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
81718171
&BTRFS_I(inode)->runtime_flags)) {
8172-
inode_dio_done(inode);
8172+
inode_dio_end(inode);
81738173
flags = DIO_LOCKING | DIO_SKIP_HOLES;
81748174
wakeup = false;
81758175
}
@@ -8188,7 +8188,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
81888188
}
81898189
out:
81908190
if (wakeup)
8191-
inode_dio_done(inode);
8191+
inode_dio_end(inode);
81928192
if (relock)
81938193
mutex_lock(&inode->i_mutex);
81948194

fs/dax.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
209209
}
210210

211211
/* Protects against truncate */
212-
atomic_inc(&inode->i_dio_count);
212+
inode_dio_begin(inode);
213213

214214
retval = dax_io(inode, iter, pos, end, get_block, &bh);
215215

@@ -219,7 +219,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
219219
if ((retval > 0) && end_io)
220220
end_io(iocb, pos, retval, bh.b_private);
221221

222-
inode_dio_done(inode);
222+
inode_dio_end(inode);
223223
out:
224224
return retval;
225225
}

fs/direct-io.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
253253
if (dio->end_io && dio->result)
254254
dio->end_io(dio->iocb, offset, transferred, dio->private);
255255

256-
inode_dio_done(dio->inode);
256+
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
257+
inode_dio_end(dio->inode);
258+
257259
if (is_async) {
258260
if (dio->rw & WRITE) {
259261
int err;
@@ -1195,7 +1197,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
11951197
/*
11961198
* Will be decremented at I/O completion time.
11971199
*/
1198-
atomic_inc(&inode->i_dio_count);
1200+
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
1201+
inode_dio_begin(inode);
11991202

12001203
retval = 0;
12011204
sdio.blkbits = blkbits;

fs/ext4/indirect.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -682,11 +682,11 @@ ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
682682
* via ext4_inode_block_unlocked_dio(). Check inode's state
683683
* while holding extra i_dio_count ref.
684684
*/
685-
atomic_inc(&inode->i_dio_count);
685+
inode_dio_begin(inode);
686686
smp_mb();
687687
if (unlikely(ext4_test_inode_state(inode,
688688
EXT4_STATE_DIOREAD_LOCK))) {
689-
inode_dio_done(inode);
689+
inode_dio_end(inode);
690690
goto locked;
691691
}
692692
if (IS_DAX(inode))
@@ -697,7 +697,7 @@ ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
697697
inode->i_sb->s_bdev, iter,
698698
offset, ext4_get_block, NULL,
699699
NULL, 0);
700-
inode_dio_done(inode);
700+
inode_dio_end(inode);
701701
} else {
702702
locked:
703703
if (IS_DAX(inode))

fs/ext4/inode.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2977,7 +2977,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
29772977
* overwrite DIO as i_dio_count needs to be incremented under i_mutex.
29782978
*/
29792979
if (iov_iter_rw(iter) == WRITE)
2980-
atomic_inc(&inode->i_dio_count);
2980+
inode_dio_begin(inode);
29812981

29822982
/* If we do a overwrite dio, i_mutex locking can be released */
29832983
overwrite = *((int *)iocb->private);
@@ -3079,7 +3079,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
30793079

30803080
retake_lock:
30813081
if (iov_iter_rw(iter) == WRITE)
3082-
inode_dio_done(inode);
3082+
inode_dio_end(inode);
30833083
/* take i_mutex locking again if we do an overwrite dio */
30843084
if (overwrite) {
30853085
up_read(&EXT4_I(inode)->i_data_sem);

fs/inode.c

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1945,20 +1945,6 @@ void inode_dio_wait(struct inode *inode)
19451945
}
19461946
EXPORT_SYMBOL(inode_dio_wait);
19471947

1948-
/*
1949-
* inode_dio_done - signal finish of a direct I/O requests
1950-
* @inode: inode the direct I/O happens on
1951-
*
1952-
* This is called once we've finished processing a direct I/O request,
1953-
* and is used to wake up callers waiting for direct I/O to be quiesced.
1954-
*/
1955-
void inode_dio_done(struct inode *inode)
1956-
{
1957-
if (atomic_dec_and_test(&inode->i_dio_count))
1958-
wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
1959-
}
1960-
EXPORT_SYMBOL(inode_dio_done);
1961-
19621948
/*
19631949
* inode_set_flags - atomically set some inode flags
19641950
*

fs/nfs/direct.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
386386
if (write)
387387
nfs_zap_mapping(inode, inode->i_mapping);
388388

389-
inode_dio_done(inode);
389+
inode_dio_end(inode);
390390

391391
if (dreq->iocb) {
392392
long res = (long) dreq->error;
@@ -486,7 +486,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
486486
&nfs_direct_read_completion_ops);
487487
get_dreq(dreq);
488488
desc.pg_dreq = dreq;
489-
atomic_inc(&inode->i_dio_count);
489+
inode_dio_begin(inode);
490490

491491
while (iov_iter_count(iter)) {
492492
struct page **pagevec;
@@ -538,7 +538,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
538538
* generic layer handle the completion.
539539
*/
540540
if (requested_bytes == 0) {
541-
inode_dio_done(inode);
541+
inode_dio_end(inode);
542542
nfs_direct_req_release(dreq);
543543
return result < 0 ? result : -EIO;
544544
}
@@ -872,7 +872,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
872872
&nfs_direct_write_completion_ops);
873873
desc.pg_dreq = dreq;
874874
get_dreq(dreq);
875-
atomic_inc(&inode->i_dio_count);
875+
inode_dio_begin(inode);
876876

877877
NFS_I(inode)->write_io += iov_iter_count(iter);
878878
while (iov_iter_count(iter)) {
@@ -928,7 +928,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
928928
* generic layer handle the completion.
929929
*/
930930
if (requested_bytes == 0) {
931-
inode_dio_done(inode);
931+
inode_dio_end(inode);
932932
nfs_direct_req_release(dreq);
933933
return result < 0 ? result : -EIO;
934934
}

include/linux/fs.h

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2635,6 +2635,9 @@ enum {
26352635

26362636
/* filesystem can handle aio writes beyond i_size */
26372637
DIO_ASYNC_EXTEND = 0x04,
2638+
2639+
/* inode/fs/bdev does not need truncate protection */
2640+
DIO_SKIP_DIO_COUNT = 0x08,
26382641
};
26392642

26402643
void dio_end_io(struct bio *bio, int error);
@@ -2657,7 +2660,31 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
26572660
#endif
26582661

26592662
void inode_dio_wait(struct inode *inode);
2660-
void inode_dio_done(struct inode *inode);
2663+
2664+
/*
2665+
* inode_dio_begin - signal start of a direct I/O request
2666+
* @inode: inode the direct I/O happens on
2667+
*
2668+
* This is called before we start processing a direct I/O request, and
2669+
* takes the i_dio_count reference that inode_dio_end() drops on completion.
2670+
*/
2671+
static inline void inode_dio_begin(struct inode *inode)
2672+
{
2673+
atomic_inc(&inode->i_dio_count);
2674+
}
2675+
2676+
/*
2677+
* inode_dio_end - signal finish of a direct I/O request
2678+
* @inode: inode the direct I/O happens on
2679+
*
2680+
* This is called once we've finished processing a direct I/O request,
2681+
* and is used to wake up callers waiting for direct I/O to be quiesced.
2682+
*/
2683+
static inline void inode_dio_end(struct inode *inode)
2684+
{
2685+
if (atomic_dec_and_test(&inode->i_dio_count))
2686+
wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
2687+
}
26612688

26622689
extern void inode_set_flags(struct inode *inode, unsigned int flags,
26632690
unsigned int mask);

0 commit comments

Comments
 (0)