Skip to content

Commit be4f1ac

Browse files
Christoph HellwigBen Myers
authored andcommitted
xfs: log all dirty inodes in xfs_fs_sync_fs
Since Linux 2.6.36 the writeback code has introduces various measures for live lock prevention during sync(). Unfortunately some of these are actively harmful for the XFS model, where the inode gets marked dirty for metadata from the data I/O handler. The older_than_this checks that are now more strictly enforced since writeback: avoid livelocking WB_SYNC_ALL writeback by only calling into __writeback_inodes_sb and thus only sampling the current cut off time once. But on a slow enough devices the previous asynchronous sync pass might not have fully completed yet, and thus XFS might mark metadata dirty only after that sampling of the cut off time for the blocking pass already happened. I have not myself reproduced this myself on a real system, but by introducing artificial delay into the XFS I/O completion workqueues it can be reproduced easily. Fix this by iterating over all XFS inodes in ->sync_fs and log all that are dirty. This might log inode that only got redirtied after the previous pass, but given how cheap delayed logging of inodes is it isn't a major concern for performance. Signed-off-by: Christoph Hellwig <[email protected]> Reviewed-by: Dave Chinner <[email protected]> Tested-by: Mark Tinguely <[email protected]> Reviewed-by: Mark Tinguely <[email protected]> Signed-off-by: Ben Myers <[email protected]>
1 parent 0b8fd30 commit be4f1ac

File tree

3 files changed

+42
-24
lines changed

3 files changed

+42
-24
lines changed

fs/xfs/xfs_super.c

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -868,27 +868,6 @@ xfs_fs_dirty_inode(
868868
XFS_I(inode)->i_update_core = 1;
869869
}
870870

871-
STATIC int
872-
xfs_log_inode(
873-
struct xfs_inode *ip)
874-
{
875-
struct xfs_mount *mp = ip->i_mount;
876-
struct xfs_trans *tp;
877-
int error;
878-
879-
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
880-
error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
881-
if (error) {
882-
xfs_trans_cancel(tp, 0);
883-
return error;
884-
}
885-
886-
xfs_ilock(ip, XFS_ILOCK_EXCL);
887-
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
888-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
889-
return xfs_trans_commit(tp, 0);
890-
}
891-
892871
STATIC int
893872
xfs_fs_write_inode(
894873
struct inode *inode,
@@ -902,8 +881,6 @@ xfs_fs_write_inode(
902881

903882
if (XFS_FORCED_SHUTDOWN(mp))
904883
return -XFS_ERROR(EIO);
905-
if (!ip->i_update_core)
906-
return 0;
907884

908885
if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
909886
/*
@@ -913,11 +890,14 @@ xfs_fs_write_inode(
913890
* ->sync_fs call do that for thus, which reduces the number
914891
* of synchronous log forces dramatically.
915892
*/
916-
error = xfs_log_inode(ip);
893+
error = xfs_log_dirty_inode(ip, NULL, 0);
917894
if (error)
918895
goto out;
919896
return 0;
920897
} else {
898+
if (!ip->i_update_core)
899+
return 0;
900+
921901
/*
922902
* We make this non-blocking if the inode is contended, return
923903
* EAGAIN to indicate to the caller that they did not succeed.

fs/xfs/xfs_sync.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,32 @@ xfs_sync_fsdata(
336336
return error;
337337
}
338338

339+
int
340+
xfs_log_dirty_inode(
341+
struct xfs_inode *ip,
342+
struct xfs_perag *pag,
343+
int flags)
344+
{
345+
struct xfs_mount *mp = ip->i_mount;
346+
struct xfs_trans *tp;
347+
int error;
348+
349+
if (!ip->i_update_core)
350+
return 0;
351+
352+
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
353+
error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
354+
if (error) {
355+
xfs_trans_cancel(tp, 0);
356+
return error;
357+
}
358+
359+
xfs_ilock(ip, XFS_ILOCK_EXCL);
360+
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
361+
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
362+
return xfs_trans_commit(tp, 0);
363+
}
364+
339365
/*
340366
* When remounting a filesystem read-only or freezing the filesystem, we have
341367
* two phases to execute. This first phase is syncing the data before we
@@ -359,6 +385,16 @@ xfs_quiesce_data(
359385
{
360386
int error, error2 = 0;
361387

388+
/*
389+
* Log all pending size and timestamp updates. The vfs writeback
390+
* code is supposed to do this, but due to its overagressive
391+
* livelock detection it will skip inodes where appending writes
392+
* were written out in the first non-blocking sync phase if their
393+
* completion took long enough that it happened after taking the
394+
* timestamp for the cut-off in the blocking phase.
395+
*/
396+
xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
397+
362398
xfs_qm_sync(mp, SYNC_TRYLOCK);
363399
xfs_qm_sync(mp, SYNC_WAIT);
364400

fs/xfs/xfs_sync.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
3434

3535
void xfs_flush_inodes(struct xfs_inode *ip);
3636

37+
int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
38+
3739
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
3840
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
3941
void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);

0 commit comments

Comments
 (0)