Skip to content

Commit bd5fe6c

Browse files
Christoph Hellwig authored and Al Viro committed
fs: kill i_alloc_sem
i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and its write side is always mirrored by real exclusion. Its intended use is to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall away - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches zero we wake up a pending truncate using wake_up_bit on a new bit in i_state - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig <[email protected]> Signed-off-by: Al Viro <[email protected]>
1 parent f9b5570 commit bd5fe6c

File tree

13 files changed

+78
-53
lines changed

13 files changed

+78
-53
lines changed

fs/attr.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
233233
return error;
234234

235235
if (ia_valid & ATTR_SIZE)
236-
down_write(&dentry->d_inode->i_alloc_sem);
236+
inode_dio_wait(inode);
237237

238238
if (inode->i_op->setattr)
239239
error = inode->i_op->setattr(dentry, attr);
240240
else
241241
error = simple_setattr(dentry, attr);
242242

243-
if (ia_valid & ATTR_SIZE)
244-
up_write(&dentry->d_inode->i_alloc_sem);
245-
246243
if (!error)
247244
fsnotify_change(dentry, ia_valid);
248245

fs/direct-io.c

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,50 @@ struct dio {
135135
struct page *pages[DIO_PAGES]; /* page buffer */
136136
};
137137

138+
/*
 * Slow path of inode_dio_wait(): sleep in TASK_UNINTERRUPTIBLE until
 * inode->i_dio_count reads zero.  The waiter parks on the bit waitqueue
 * keyed by (&inode->i_state, __I_DIO_WAKEUP), which is the same key
 * inode_dio_done() passes to wake_up_bit().
 */
static void __inode_dio_wait(struct inode *inode)
139+
{
140+
wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
141+
DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
142+
143+
do {
144+
/* Queue ourselves before re-checking the count, then sleep only if I/O is still pending. */
prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
145+
if (atomic_read(&inode->i_dio_count))
146+
schedule();
147+
} while (atomic_read(&inode->i_dio_count));
148+
/* Count has been observed as zero: take ourselves off the waitqueue. */
finish_wait(wq, &q.wait);
149+
}
150+
151+
/**
152+
* inode_dio_wait - wait for outstanding DIO requests to finish
153+
* @inode: inode to wait for
154+
*
155+
* Waits for all pending direct I/O requests to finish so that we can
156+
* proceed with a truncate or equivalent operation.
* Returns immediately, without sleeping, when i_dio_count already reads zero.
157+
*
158+
* Must be called under a lock that serializes taking new references
159+
* to i_dio_count, usually by inode->i_mutex.
160+
*/
161+
void inode_dio_wait(struct inode *inode)
162+
{
163+
/* Only enter the sleeping slow path when direct I/O is actually in flight. */
if (atomic_read(&inode->i_dio_count))
164+
__inode_dio_wait(inode);
165+
}
166+
EXPORT_SYMBOL_GPL(inode_dio_wait);
167+
168+
/**
169+
* inode_dio_done - signal finish of a direct I/O request
170+
* @inode: inode the direct I/O happens on
171+
*
172+
* This is called once we've finished processing a direct I/O request,
173+
* and is used to wake up callers waiting for direct I/O to be quiesced.
* Drops one reference on i_dio_count; only the call that takes the count
* to zero issues the wakeup.
174+
*/
175+
void inode_dio_done(struct inode *inode)
176+
{
177+
/* Wake waiters keyed on (&inode->i_state, __I_DIO_WAKEUP) once the last request is done. */
if (atomic_dec_and_test(&inode->i_dio_count))
178+
wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
179+
}
180+
EXPORT_SYMBOL_GPL(inode_dio_done);
181+
138182
/*
139183
* How many pages are in the queue?
140184
*/
@@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
254298
}
255299

256300
if (dio->flags & DIO_LOCKING)
257-
/* lockdep: non-owner release */
258-
up_read_non_owner(&dio->inode->i_alloc_sem);
259-
301+
inode_dio_done(dio->inode);
260302
return ret;
261303
}
262304

@@ -980,9 +1022,6 @@ static int do_direct_IO(struct dio *dio)
9801022
return ret;
9811023
}
9821024

983-
/*
984-
* Releases both i_mutex and i_alloc_sem
985-
*/
9861025
static ssize_t
9871026
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
9881027
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
@@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
11461185
* For writes this function is called under i_mutex and returns with
11471186
* i_mutex held, for reads, i_mutex is not held on entry, but it is
11481187
* taken and dropped again before returning.
1149-
* For reads and writes i_alloc_sem is taken in shared mode and released
1150-
* on I/O completion (which may happen asynchronously after returning to
1151-
* the caller).
1188+
* The i_dio_count counter keeps track of the number of outstanding
1189+
* direct I/O requests, and truncate waits for it to reach zero.
1190+
* New references to i_dio_count must only be grabbed with i_mutex
1191+
* held.
11521192
*
11531193
* - if the flags value does NOT contain DIO_LOCKING we don't use any
11541194
* internal locking but rather rely on the filesystem to synchronize
11551195
* direct I/O reads/writes versus each other and truncate.
1156-
* For reads and writes both i_mutex and i_alloc_sem are not held on
1157-
* entry and are never taken.
11581196
*/
11591197
ssize_t
11601198
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
12341272
}
12351273

12361274
/*
1237-
* Will be released at I/O completion, possibly in a
1238-
* different thread.
1275+
* Will be decremented at I/O completion time.
12391276
*/
1240-
down_read_non_owner(&inode->i_alloc_sem);
1277+
atomic_inc(&inode->i_dio_count);
12411278
}
12421279

12431280
/*

fs/inode.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
168168
mutex_init(&inode->i_mutex);
169169
lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
170170

171-
init_rwsem(&inode->i_alloc_sem);
172-
lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
171+
atomic_set(&inode->i_dio_count, 0);
173172

174173
mapping->a_ops = &empty_aops;
175174
mapping->host = inode;

fs/ntfs/file.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
18321832
* fails again.
18331833
*/
18341834
if (unlikely(NInoTruncateFailed(ni))) {
1835-
down_write(&vi->i_alloc_sem);
1835+
inode_dio_wait(vi);
18361836
err = ntfs_truncate(vi);
1837-
up_write(&vi->i_alloc_sem);
18381837
if (err || NInoTruncateFailed(ni)) {
18391838
if (!err)
18401839
err = -EIO;

fs/ntfs/inode.c

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2357,12 +2357,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run "
23572357
*
23582358
* Returns 0 on success or -errno on error.
23592359
*
2360-
* Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for
2361-
* writing. The only case in the kernel where ->i_alloc_sem is not held is
2362-
* mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
2363-
* with the current i_size as the offset. The analogous place in NTFS is in
2364-
* fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
2365-
* without holding ->i_alloc_sem.
2360+
* Called with ->i_mutex held.
23662361
*/
23672362
int ntfs_truncate(struct inode *vi)
23682363
{
@@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
28872882
* We also abort all changes of user, group, and mode as we do not implement
28882883
* the NTFS ACLs yet.
28892884
*
2890-
* Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also
2891-
* called with ->i_alloc_sem held for writing.
2885+
* Called with ->i_mutex held.
28922886
*/
28932887
int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
28942888
{

fs/ocfs2/aops.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -551,9 +551,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
551551

552552
/*
553553
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
554-
* particularly interested in the aio/dio case. Like the core uses
555-
* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
556-
* truncation on another.
554+
* particularly interested in the aio/dio case. We use the rw_lock DLM lock
555+
* to protect io on one node from truncation on another.
557556
*/
558557
static void ocfs2_dio_end_io(struct kiocb *iocb,
559558
loff_t offset,
@@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
569568
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
570569

571570
if (ocfs2_iocb_is_sem_locked(iocb)) {
572-
up_read(&inode->i_alloc_sem);
571+
inode_dio_done(inode);
573572
ocfs2_iocb_clear_sem_locked(iocb);
574573
}
575574

fs/ocfs2/file.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
22362236
ocfs2_iocb_clear_sem_locked(iocb);
22372237

22382238
relock:
2239-
/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
2239+
/* to match setattr's i_mutex -> rw_lock ordering */
22402240
if (direct_io) {
2241-
down_read(&inode->i_alloc_sem);
2241+
atomic_inc(&inode->i_dio_count);
22422242
have_alloc_sem = 1;
22432243
/* communicate with ocfs2_dio_end_io */
22442244
ocfs2_iocb_set_sem_locked(iocb);
@@ -2290,7 +2290,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
22902290
*/
22912291
if (direct_io && !can_do_direct) {
22922292
ocfs2_rw_unlock(inode, rw_level);
2293-
up_read(&inode->i_alloc_sem);
2293+
inode_dio_done(inode);
22942294

22952295
have_alloc_sem = 0;
22962296
rw_level = -1;
@@ -2361,8 +2361,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
23612361
/*
23622362
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
23632363
* function pointer which is called when o_direct io completes so that
2364-
* it can unlock our rw lock. (it's the clustered equivalent of
2365-
* i_alloc_sem; protects truncate from racing with pending ios).
2364+
* it can unlock our rw lock.
23662365
* Unfortunately there are error cases which call end_io and others
23672366
* that don't. so we don't have to unlock the rw_lock if either an
23682367
* async dio is going to do it in the future or an end_io after an
@@ -2379,7 +2378,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
23792378

23802379
out_sems:
23812380
if (have_alloc_sem) {
2382-
up_read(&inode->i_alloc_sem);
2381+
inode_dio_done(inode);
23832382
ocfs2_iocb_clear_sem_locked(iocb);
23842383
}
23852384

@@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
25312530
* need locks to protect pending reads from racing with truncate.
25322531
*/
25332532
if (filp->f_flags & O_DIRECT) {
2534-
down_read(&inode->i_alloc_sem);
25352533
have_alloc_sem = 1;
2534+
atomic_inc(&inode->i_dio_count);
25362535
ocfs2_iocb_set_sem_locked(iocb);
25372536

25382537
ret = ocfs2_rw_lock(inode, 0);
@@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
25752574

25762575
bail:
25772576
if (have_alloc_sem) {
2578-
up_read(&inode->i_alloc_sem);
2577+
inode_dio_done(inode);
25792578
ocfs2_iocb_clear_sem_locked(iocb);
25802579
}
25812580
if (rw_level != -1)

fs/reiserfs/xattr.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
555555

556556
reiserfs_write_unlock(inode->i_sb);
557557
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
558-
down_write(&dentry->d_inode->i_alloc_sem);
558+
inode_dio_wait(dentry->d_inode);
559559
reiserfs_write_lock(inode->i_sb);
560560

561561
err = reiserfs_setattr(dentry, &newattrs);
562-
up_write(&dentry->d_inode->i_alloc_sem);
563562
mutex_unlock(&dentry->d_inode->i_mutex);
564563
} else
565564
update_ctime(inode);

include/linux/fs.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -779,7 +779,7 @@ struct inode {
779779
struct timespec i_ctime;
780780
blkcnt_t i_blocks;
781781
unsigned short i_bytes;
782-
struct rw_semaphore i_alloc_sem;
782+
atomic_t i_dio_count;
783783
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
784784
struct file_lock *i_flock;
785785
struct address_space *i_mapping;
@@ -1705,6 +1705,10 @@ struct super_operations {
17051705
* set during data writeback, and cleared with a wakeup
17061706
* on the bit address once it is done.
17071707
*
1708+
* I_REFERENCED Marks the inode as recently referenced on the LRU list.
1709+
*
1710+
* I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
1711+
*
17081712
* Q: What is the difference between I_WILL_FREE and I_FREEING?
17091713
*/
17101714
#define I_DIRTY_SYNC (1 << 0)
@@ -1718,6 +1722,8 @@ struct super_operations {
17181722
#define __I_SYNC 7
17191723
#define I_SYNC (1 << __I_SYNC)
17201724
#define I_REFERENCED (1 << 8)
1725+
#define __I_DIO_WAKEUP 9
1726+
/*
 * Mask form of bit number __I_DIO_WAKEUP.  The original definition shifted
 * by I_DIO_WAKEUP itself, a self-referential macro that cannot compile
 * when used; it must shift by the bit index instead.
 */
#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP)
17211727

17221728
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
17231729

@@ -1828,7 +1834,6 @@ struct file_system_type {
18281834
struct lock_class_key i_lock_key;
18291835
struct lock_class_key i_mutex_key;
18301836
struct lock_class_key i_mutex_dir_key;
1831-
struct lock_class_key i_alloc_sem_key;
18321837
};
18331838

18341839
extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
@@ -2404,6 +2409,8 @@ enum {
24042409
};
24052410

24062411
void dio_end_io(struct bio *bio, int error);
2412+
void inode_dio_wait(struct inode *inode);
2413+
void inode_dio_done(struct inode *inode);
24072414

24082415
ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
24092416
struct block_device *bdev, const struct iovec *iov, loff_t offset,

mm/filemap.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,6 @@
7878
* ->i_mutex (generic_file_buffered_write)
7979
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
8080
*
81-
* ->i_mutex
82-
* ->i_alloc_sem (various)
83-
*
8481
* inode_wb_list_lock
8582
* sb_lock (fs/fs-writeback.c)
8683
* ->mapping->tree_lock (__sync_single_inode)

mm/madvise.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
218218
endoff = (loff_t)(end - vma->vm_start - 1)
219219
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220220

221-
/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
221+
/* vmtruncate_range needs to take i_mutex */
222222
up_read(&current->mm->mmap_sem);
223223
error = vmtruncate_range(mapping->host, offset, endoff);
224224
down_read(&current->mm->mmap_sem);

mm/rmap.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
* Lock ordering in mm:
2222
*
2323
* inode->i_mutex (while writing or truncating, not reading or faulting)
24-
* inode->i_alloc_sem (vmtruncate_range)
2524
* mm->mmap_sem
2625
* page->flags PG_locked (lock_page)
2726
* mapping->i_mmap_mutex

mm/truncate.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
622622
return -ENOSYS;
623623

624624
mutex_lock(&inode->i_mutex);
625-
down_write(&inode->i_alloc_sem);
625+
inode_dio_wait(inode);
626626
unmap_mapping_range(mapping, offset, (end - offset), 1);
627627
inode->i_op->truncate_range(inode, offset, end);
628628
/* unmap again to remove racily COWed private pages */
629629
unmap_mapping_range(mapping, offset, (end - offset), 1);
630-
up_write(&inode->i_alloc_sem);
631630
mutex_unlock(&inode->i_mutex);
632631

633632
return 0;

0 commit comments

Comments
 (0)