Skip to content

Commit ccf7c23

Browse files
Dave ChinnerAlex Elder
authored andcommitted
xfs: Ensure inode allocation buffers are fully replayed
With delayed logging, we can get inode allocation buffers in the same transaction inode unlink buffers. We don't currently mark inode allocation buffers in the log, so inode unlink buffers take precedence over allocation buffers. The result is that when they are combined into the same checkpoint, only the unlinked inode chain fields are replayed, resulting in uninitialised inode buffers being detected when the next inode modification is replayed. To fix this, we need to ensure that we do not set the inode buffer flag in the buffer log item format flags if the inode allocation has not already hit the log. To avoid requiring a change to log recovery, we really need to make this a modification that relies only on in-memory sate. We can do this by checking during buffer log formatting (while the CIL cannot be flushed) if we are still in the same sequence when we commit the unlink transaction as the inode allocation transaction. If we are, then we do not add the inode buffer flag to the buffer log format item flags. This means the entire buffer will be replayed, not just the unlinked fields. We do this while CIL flusheѕ are locked out to ensure that we don't race with the sequence numbers changing and hence fail to put the inode buffer flag in the buffer format flags when we really need to. Signed-off-by: Dave Chinner <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]> Signed-off-by: Alex Elder <[email protected]>
1 parent df80615 commit ccf7c23

File tree

6 files changed

+74
-11
lines changed

6 files changed

+74
-11
lines changed

fs/xfs/xfs_buf_item.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254254
vecp++;
255255
nvecs = 1;
256256

257+
/*
258+
* If it is an inode buffer, transfer the in-memory state to the
259+
* format flags and clear the in-memory state. We do not transfer
260+
* this state if the inode buffer allocation has not yet been committed
261+
* to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262+
* correct replay of the inode allocation.
263+
*/
264+
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265+
if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266+
xfs_log_item_in_current_chkpt(&bip->bli_item)))
267+
bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268+
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269+
}
270+
257271
if (bip->bli_flags & XFS_BLI_STALE) {
258272
/*
259273
* The buffer is stale, so all we need to log

fs/xfs/xfs_buf_item.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,16 @@ typedef struct xfs_buf_log_format {
6969
#define XFS_BLI_LOGGED 0x08
7070
#define XFS_BLI_INODE_ALLOC_BUF 0x10
7171
#define XFS_BLI_STALE_INODE 0x20
72+
#define XFS_BLI_INODE_BUF 0x40
7273

7374
#define XFS_BLI_FLAGS \
7475
{ XFS_BLI_HOLD, "HOLD" }, \
7576
{ XFS_BLI_DIRTY, "DIRTY" }, \
7677
{ XFS_BLI_STALE, "STALE" }, \
7778
{ XFS_BLI_LOGGED, "LOGGED" }, \
7879
{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79-
{ XFS_BLI_STALE_INODE, "STALE_INODE" }
80+
{ XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81+
{ XFS_BLI_INODE_BUF, "INODE_BUF" }
8082

8183

8284
#ifdef __KERNEL__

fs/xfs/xfs_log.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
198198
int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199199
struct xfs_log_vec *log_vector,
200200
xfs_lsn_t *commit_lsn, int flags);
201+
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
201202

202203
#endif
203204

fs/xfs/xfs_log_cil.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,15 @@ xlog_cil_insert(
199199
list_move_tail(&item->li_cil, &cil->xc_cil);
200200
ctx->nvecs += diff_iovecs;
201201

202+
/*
203+
* If this is the first time the item is being committed to the CIL,
204+
* store the sequence number on the log item so we can tell
205+
* in future commits whether this is the first checkpoint the item is
206+
* being committed into.
207+
*/
208+
if (!item->li_seq)
209+
item->li_seq = ctx->sequence;
210+
202211
/*
203212
* Now transfer enough transaction reservation to the context ticket
204213
* for the checkpoint. The context ticket is special - the unit
@@ -325,6 +334,10 @@ xlog_cil_free_logvec(
325334
* For more specific information about the order of operations in
326335
* xfs_log_commit_cil() please refer to the comments in
327336
* xfs_trans_commit_iclog().
337+
*
338+
* Called with the context lock already held in read mode to lock out
339+
* background commit, returns without it held once background commits are
340+
* allowed again.
328341
*/
329342
int
330343
xfs_log_commit_cil(
@@ -678,3 +691,35 @@ xlog_cil_push_lsn(
678691
spin_unlock(&cil->xc_cil_lock);
679692
return commit_lsn;
680693
}
694+
695+
/*
696+
* Check if the current log item was first committed in this sequence.
697+
* We can't rely on just the log item being in the CIL, we have to check
698+
* the recorded commit sequence number.
699+
*
700+
* Note: for this to be used in a non-racy manner, it has to be called with
701+
* CIL flushing locked out. As a result, it should only be used during the
702+
* transaction commit process when deciding what to format into the item.
703+
*/
704+
bool
705+
xfs_log_item_in_current_chkpt(
706+
struct xfs_log_item *lip)
707+
{
708+
struct xfs_cil_ctx *ctx;
709+
710+
if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711+
return false;
712+
if (list_empty(&lip->li_cil))
713+
return false;
714+
715+
ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716+
717+
/*
718+
* li_seq is written on the first commit of a log item to record the
719+
* first checkpoint it is written to. Hence if it is different to the
720+
* current sequence, we're in a new checkpoint.
721+
*/
722+
if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723+
return false;
724+
return true;
725+
}

fs/xfs/xfs_trans.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -835,6 +835,7 @@ typedef struct xfs_log_item {
835835
/* delayed logging */
836836
struct list_head li_cil; /* CIL pointers */
837837
struct xfs_log_vec *li_lv; /* active log vector */
838+
xfs_lsn_t li_seq; /* CIL commit seq */
838839
} xfs_log_item_t;
839840

840841
#define XFS_LI_IN_AIL 0x1

fs/xfs/xfs_trans_buf.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,7 @@ xfs_trans_binval(
792792
XFS_BUF_UNDELAYWRITE(bp);
793793
XFS_BUF_STALE(bp);
794794
bip->bli_flags |= XFS_BLI_STALE;
795-
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
795+
bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796796
bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797797
bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
798798
memset((char *)(bip->bli_format.blf_data_map), 0,
@@ -802,16 +802,16 @@ xfs_trans_binval(
802802
}
803803

804804
/*
805-
* This call is used to indicate that the buffer contains on-disk
806-
* inodes which must be handled specially during recovery. They
807-
* require special handling because only the di_next_unlinked from
808-
* the inodes in the buffer should be recovered. The rest of the
809-
* data in the buffer is logged via the inodes themselves.
805+
* This call is used to indicate that the buffer contains on-disk inodes which
806+
* must be handled specially during recovery. They require special handling
807+
* because only the di_next_unlinked from the inodes in the buffer should be
808+
* recovered. The rest of the data in the buffer is logged via the inodes
809+
* themselves.
810810
*
811-
* All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
812-
* format structure so that we'll know what to do at recovery time.
811+
* All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
812+
* transferred to the buffer's log format structure so that we'll know what to
813+
* do at recovery time.
813814
*/
814-
/* ARGSUSED */
815815
void
816816
xfs_trans_inode_buf(
817817
xfs_trans_t *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
826826
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
827827
ASSERT(atomic_read(&bip->bli_refcount) > 0);
828828

829-
bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
829+
bip->bli_flags |= XFS_BLI_INODE_BUF;
830830
}
831831

832832
/*

0 commit comments

Comments
 (0)