Skip to content

Commit 14a5379

Browse files
catherinehoangChandan Babu R
authored andcommitted
xfs: allow read IO and FICLONE to run concurrently
One of our VM cluster management products needs to snapshot KVM image files so that they can be restored in case of failure. Snapshotting is done by redirecting VM disk writes to a sidecar file and using reflink on the disk image, specifically the FICLONE ioctl as used by "cp --reflink". Reflink locks the source and destination files while it operates, which means that reads from the main vm disk image are blocked, causing the vm to stall. When an image file is heavily fragmented, the copy process could take several minutes. Some of the vm image files have 50-100 million extent records, and duplicating that much metadata locks the file for 30 minutes or more. Having activities suspended for such a long time in a cluster node could result in node eviction. Clone operations and read IO do not change any data in the source file, so they should be able to run concurrently. Demote the exclusive locks taken by FICLONE to shared locks to allow reads while cloning. While a clone is in progress, writes will take the IOLOCK_EXCL, so they block until the clone completes. Link: https://lore.kernel.org/linux-xfs/[email protected]/ Signed-off-by: Catherine Hoang <[email protected]> Reviewed-by: "Darrick J. Wong" <[email protected]> Reviewed-by: Dave Chinner <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]> Signed-off-by: Chandan Babu R <[email protected]>
1 parent 35dc55b commit 14a5379

File tree

4 files changed

+80
-13
lines changed

4 files changed

+80
-13
lines changed

fs/xfs/xfs_file.c

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,43 @@ xfs_ilock_iocb(
214214
return 0;
215215
}
216216

217+
static int
218+
xfs_ilock_iocb_for_write(
219+
struct kiocb *iocb,
220+
unsigned int *lock_mode)
221+
{
222+
ssize_t ret;
223+
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
224+
225+
ret = xfs_ilock_iocb(iocb, *lock_mode);
226+
if (ret)
227+
return ret;
228+
229+
if (*lock_mode == XFS_IOLOCK_EXCL)
230+
return 0;
231+
if (!xfs_iflags_test(ip, XFS_IREMAPPING))
232+
return 0;
233+
234+
xfs_iunlock(ip, *lock_mode);
235+
*lock_mode = XFS_IOLOCK_EXCL;
236+
return xfs_ilock_iocb(iocb, *lock_mode);
237+
}
238+
239+
static unsigned int
240+
xfs_ilock_for_write_fault(
241+
struct xfs_inode *ip)
242+
{
243+
/* get a shared lock if no remapping in progress */
244+
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
245+
if (!xfs_iflags_test(ip, XFS_IREMAPPING))
246+
return XFS_MMAPLOCK_SHARED;
247+
248+
/* wait for remapping to complete */
249+
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
250+
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
251+
return XFS_MMAPLOCK_EXCL;
252+
}
253+
217254
STATIC ssize_t
218255
xfs_file_dio_read(
219256
struct kiocb *iocb,
@@ -551,7 +588,7 @@ xfs_file_dio_write_aligned(
551588
unsigned int iolock = XFS_IOLOCK_SHARED;
552589
ssize_t ret;
553590

554-
ret = xfs_ilock_iocb(iocb, iolock);
591+
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
555592
if (ret)
556593
return ret;
557594
ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +655,7 @@ xfs_file_dio_write_unaligned(
618655
flags = IOMAP_DIO_FORCE_WAIT;
619656
}
620657

621-
ret = xfs_ilock_iocb(iocb, iolock);
658+
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
622659
if (ret)
623660
return ret;
624661

@@ -1180,7 +1217,7 @@ xfs_file_remap_range(
11801217
if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
11811218
xfs_log_force_inode(dest);
11821219
out_unlock:
1183-
xfs_iunlock2_io_mmap(src, dest);
1220+
xfs_iunlock2_remapping(src, dest);
11841221
if (ret)
11851222
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
11861223
return remapped > 0 ? remapped : ret;
@@ -1328,6 +1365,7 @@ __xfs_filemap_fault(
13281365
struct inode *inode = file_inode(vmf->vma->vm_file);
13291366
struct xfs_inode *ip = XFS_I(inode);
13301367
vm_fault_t ret;
1368+
unsigned int lock_mode = 0;
13311369

13321370
trace_xfs_filemap_fault(ip, order, write_fault);
13331371

@@ -1336,25 +1374,24 @@ __xfs_filemap_fault(
13361374
file_update_time(vmf->vma->vm_file);
13371375
}
13381376

1377+
if (IS_DAX(inode) || write_fault)
1378+
lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
1379+
13391380
if (IS_DAX(inode)) {
13401381
pfn_t pfn;
13411382

1342-
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
13431383
ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
13441384
if (ret & VM_FAULT_NEEDDSYNC)
13451385
ret = dax_finish_sync_fault(vmf, order, pfn);
1346-
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1386+
} else if (write_fault) {
1387+
ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
13471388
} else {
1348-
if (write_fault) {
1349-
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1350-
ret = iomap_page_mkwrite(vmf,
1351-
&xfs_page_mkwrite_iomap_ops);
1352-
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1353-
} else {
1354-
ret = filemap_fault(vmf);
1355-
}
1389+
ret = filemap_fault(vmf);
13561390
}
13571391

1392+
if (lock_mode)
1393+
xfs_iunlock(XFS_I(inode), lock_mode);
1394+
13581395
if (write_fault)
13591396
sb_end_pagefault(inode->i_sb);
13601397
return ret;

fs/xfs/xfs_inode.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3628,6 +3628,23 @@ xfs_iunlock2_io_mmap(
36283628
inode_unlock(VFS_I(ip1));
36293629
}
36303630

3631+
/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
3632+
void
3633+
xfs_iunlock2_remapping(
3634+
struct xfs_inode *ip1,
3635+
struct xfs_inode *ip2)
3636+
{
3637+
xfs_iflags_clear(ip1, XFS_IREMAPPING);
3638+
3639+
if (ip1 != ip2)
3640+
xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
3641+
xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3642+
3643+
if (ip1 != ip2)
3644+
inode_unlock_shared(VFS_I(ip1));
3645+
inode_unlock(VFS_I(ip2));
3646+
}
3647+
36313648
/*
36323649
* Reload the incore inode list for this inode. Caller should ensure that
36333650
* the link count cannot change, either by taking ILOCK_SHARED or otherwise

fs/xfs/xfs_inode.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
347347
/* Quotacheck is running but inode has not been added to quota counts. */
348348
#define XFS_IQUOTAUNCHECKED (1 << 14)
349349

350+
/*
351+
* Remap in progress. Callers that wish to update file data while
352+
* holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
353+
* the lock in exclusive mode. Relocking the file will block until
354+
* IREMAPPING is cleared.
355+
*/
356+
#define XFS_IREMAPPING (1U << 15)
357+
350358
/* All inode state flags related to inode reclaim. */
351359
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
352360
XFS_IRECLAIM | \
@@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work);
595603

596604
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
597605
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
606+
void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
598607

599608
static inline bool
600609
xfs_inode_unlinked_incomplete(

fs/xfs/xfs_reflink.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep(
15401540
if (ret)
15411541
goto out_unlock;
15421542

1543+
xfs_iflags_set(src, XFS_IREMAPPING);
1544+
if (inode_in != inode_out)
1545+
xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
1546+
15431547
return 0;
15441548
out_unlock:
15451549
xfs_iunlock2_io_mmap(src, dest);

0 commit comments

Comments
 (0)