Skip to content

Commit e9e3bce

Browse files
Eric Sandeentytso
authored andcommitted
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned asynchronous direct IO into a sparse file, as demonstrated by xfstest 240. The root cause is that while ext4 preallocates space in the hole, mappings of that space still look "new" and dio_zero_block() will zero out the unwritten portions. When more than one AIO thread is going, they both find this "new" block and race to zero out their portion; this is uncoordinated and causes data corruption. Dave Chinner fixed this for xfs by simply serializing all unaligned asynchronous direct IO. I've done the same here. The difference is that we only wait on conversions, not all IO. This is a very big hammer, and I'm not very pleased with stuffing this into ext4_file_write(). But since ext4 is DIO_LOCKING, we need to serialize it at this high level. I tried to move this into ext4_ext_direct_IO, but by then we have the i_mutex already, and we will wait on the work queue to do conversions - which must also take the i_mutex. So that won't work. This was originally exposed by qemu-kvm installing to a raw disk image with a normal sector-63 alignment. I've tested a backport of this patch with qemu, and it does avoid the corruption. It is also quite a lot slower (14 min for package installs, vs. 8 min for well-aligned) but I'll take slow correctness over fast corruption any day. Mingming suggested that we can track outstanding conversions, and wait on those so that non-sparse files won't be affected, and I've implemented that here; unaligned AIO to nonsparse files won't take a perf hit. [[email protected]: Keep the mutex as a hashed array instead of bloating the ext4 inode] [[email protected]: Fix up namespace issues so that global variables are protected with an "ext4_" prefix.] Signed-off-by: Eric Sandeen <[email protected]> Signed-off-by: "Theodore Ts'o" <[email protected]>
1 parent 2892c15 commit e9e3bce

File tree

5 files changed

+100
-18
lines changed

5 files changed

+100
-18
lines changed

fs/ext4/ext4.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -848,6 +848,7 @@ struct ext4_inode_info {
848848
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
849849
/* current io_end structure for async DIO write*/
850850
ext4_io_end_t *cur_aio_dio;
851+
atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
851852

852853
spinlock_t i_block_reservation_lock;
853854

@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
21192120

21202121
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
21212122

2123+
/* For ioend & aio unwritten conversion wait queues */
2124+
#define EXT4_WQ_HASH_SZ 37
2125+
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2126+
EXT4_WQ_HASH_SZ])
2127+
#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2128+
EXT4_WQ_HASH_SZ])
2129+
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2130+
extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2131+
21222132
#endif /* __KERNEL__ */
21232133

21242134
#endif /* _EXT4_H */

fs/ext4/extents.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
31743174
* that this IO needs to convertion to written when IO is
31753175
* completed
31763176
*/
3177-
if (io)
3177+
if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
31783178
io->flag = EXT4_IO_END_UNWRITTEN;
3179-
else
3179+
atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3180+
} else
31803181
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
31813182
if (ext4_should_dioread_nolock(inode))
31823183
map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
34633464
* that we need to perform convertion when IO is done.
34643465
*/
34653466
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3466-
if (io)
3467+
if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
34673468
io->flag = EXT4_IO_END_UNWRITTEN;
3468-
else
3469+
atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3470+
} else
34693471
ext4_set_inode_state(inode,
34703472
EXT4_STATE_DIO_UNWRITTEN);
34713473
}

fs/ext4/file.c

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
5555
return 0;
5656
}
5757

58+
static void ext4_aiodio_wait(struct inode *inode)
59+
{
60+
wait_queue_head_t *wq = ext4_ioend_wq(inode);
61+
62+
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63+
}
64+
65+
/*
66+
* This tests whether the IO in question is block-aligned or not.
67+
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68+
* are converted to written only after the IO is complete. Until they are
69+
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
70+
* it needs to zero out portions of the start and/or end block. If 2 AIO
71+
* threads are at work on the same unwritten block, they must be synchronized
72+
* or one thread will zero the other's data, causing corruption.
73+
*/
74+
static int
75+
ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76+
unsigned long nr_segs, loff_t pos)
77+
{
78+
struct super_block *sb = inode->i_sb;
79+
int blockmask = sb->s_blocksize - 1;
80+
size_t count = iov_length(iov, nr_segs);
81+
loff_t final_size = pos + count;
82+
83+
if (pos >= inode->i_size)
84+
return 0;
85+
86+
if ((pos & blockmask) || (final_size & blockmask))
87+
return 1;
88+
89+
return 0;
90+
}
91+
5892
static ssize_t
5993
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
6094
unsigned long nr_segs, loff_t pos)
6195
{
6296
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97+
int unaligned_aio = 0;
98+
int ret;
6399

64100
/*
65101
* If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78114
nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79115
sbi->s_bitmap_maxbytes - pos);
80116
}
117+
} else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118+
!is_sync_kiocb(iocb))) {
119+
unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
81120
}
82121

83-
return generic_file_aio_write(iocb, iov, nr_segs, pos);
122+
/* Unaligned direct AIO must be serialized; see comment above */
123+
if (unaligned_aio) {
124+
static unsigned long unaligned_warn_time;
125+
126+
/* Warn about this once per day */
127+
if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128+
ext4_msg(inode->i_sb, KERN_WARNING,
129+
"Unaligned AIO/DIO on inode %ld by %s; "
130+
"performance will be poor.",
131+
inode->i_ino, current->comm);
132+
mutex_lock(ext4_aio_mutex(inode));
133+
ext4_aiodio_wait(inode);
134+
}
135+
136+
ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137+
138+
if (unaligned_aio)
139+
mutex_unlock(ext4_aio_mutex(inode));
140+
141+
return ret;
84142
}
85143

86144
static const struct vm_operations_struct ext4_file_vm_ops = {

fs/ext4/page-io.c

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,8 @@
3232

3333
static struct kmem_cache *io_page_cachep, *io_end_cachep;
3434

35-
#define WQ_HASH_SZ 37
36-
#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37-
static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38-
3935
int __init ext4_init_pageio(void)
4036
{
41-
int i;
42-
4337
io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
4438
if (io_page_cachep == NULL)
4539
return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
4842
kmem_cache_destroy(io_page_cachep);
4943
return -ENOMEM;
5044
}
51-
for (i = 0; i < WQ_HASH_SZ; i++)
52-
init_waitqueue_head(&ioend_wq[i]);
53-
5445
return 0;
5546
}
5647

@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
6253

6354
void ext4_ioend_wait(struct inode *inode)
6455
{
65-
wait_queue_head_t *wq = to_ioend_wq(inode);
56+
wait_queue_head_t *wq = ext4_ioend_wq(inode);
6657

6758
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
6859
}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
8778
for (i = 0; i < io->num_io_pages; i++)
8879
put_io_page(io->pages[i]);
8980
io->num_io_pages = 0;
90-
wq = to_ioend_wq(io->inode);
81+
wq = ext4_ioend_wq(io->inode);
9182
if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
9283
waitqueue_active(wq))
9384
wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
10293
struct inode *inode = io->inode;
10394
loff_t offset = io->offset;
10495
ssize_t size = io->size;
96+
wait_queue_head_t *wq;
10597
int ret = 0;
10698

10799
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
126118
if (io->iocb)
127119
aio_complete(io->iocb, io->result, 0);
128120
/* clear the DIO AIO unwritten flag */
129-
io->flag &= ~EXT4_IO_END_UNWRITTEN;
121+
if (io->flag & EXT4_IO_END_UNWRITTEN) {
122+
io->flag &= ~EXT4_IO_END_UNWRITTEN;
123+
/* Wake up anyone waiting on unwritten extent conversion */
124+
wq = ext4_ioend_wq(io->inode);
125+
if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126+
waitqueue_active(wq)) {
127+
wake_up_all(wq);
128+
}
129+
}
130+
130131
return ret;
131132
}
132133

fs/ext4/super.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -833,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
833833
ei->i_sync_tid = 0;
834834
ei->i_datasync_tid = 0;
835835
atomic_set(&ei->i_ioend_count, 0);
836+
atomic_set(&ei->i_aiodio_unwritten, 0);
836837

837838
return &ei->vfs_inode;
838839
}
@@ -4800,11 +4801,21 @@ static void ext4_exit_feat_adverts(void)
48004801
kfree(ext4_feat);
48014802
}
48024803

4804+
/* Shared across all ext4 file systems */
4805+
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4806+
struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4807+
48034808
static int __init ext4_init_fs(void)
48044809
{
4805-
int err;
4810+
int i, err;
48064811

48074812
ext4_check_flag_values();
4813+
4814+
for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4815+
mutex_init(&ext4__aio_mutex[i]);
4816+
init_waitqueue_head(&ext4__ioend_wq[i]);
4817+
}
4818+
48084819
err = ext4_init_pageio();
48094820
if (err)
48104821
return err;

0 commit comments

Comments
 (0)