Skip to content

Commit 102363a

Browse files
committed
Merge tag 'xfs-6.6-fixes-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs fixes from Chandan Babu: - Prevent filesystem hang when executing fstrim operations on large and slow storage * tag 'xfs-6.6-fixes-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: xfs: abort fstrim if kernel is suspending xfs: reduce AGF hold times during fstrim operations xfs: move log discard work to xfs_discard.c
2 parents 4aef108 + 4e69f49 commit 102363a

File tree

6 files changed

+311
-117
lines changed

6 files changed

+311
-117
lines changed

fs/xfs/xfs_discard.c

Lines changed: 242 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// SPDX-License-Identifier: GPL-2.0
22
/*
3-
* Copyright (C) 2010 Red Hat, Inc.
3+
* Copyright (C) 2010, 2023 Red Hat, Inc.
44
* All Rights Reserved.
55
*/
66
#include "xfs.h"
@@ -19,21 +19,147 @@
1919
#include "xfs_log.h"
2020
#include "xfs_ag.h"
2121

22-
STATIC int
23-
xfs_trim_extents(
22+
/*
23+
* Notes on an efficient, low latency fstrim algorithm
24+
*
25+
* We need to walk the filesystem free space and issue discards on the free
26+
* space that meet the search criteria (size and location). We cannot issue
27+
* discards on extents that might be in use, or are so recently in use they are
28+
* still marked as busy. To serialise against extent state changes whilst we are
29+
* gathering extents to trim, we must hold the AGF lock to lock out other
30+
* allocations and extent free operations that might change extent state.
31+
*
32+
* However, we cannot just hold the AGF for the entire AG free space walk whilst
33+
* we issue discards on each free space that is found. Storage devices can have
34+
* extremely slow discard implementations (e.g. ceph RBD) and so walking a
35+
* couple of million free extents and issuing synchronous discards on each
36+
* extent can take a *long* time. Whilst we are doing this walk, nothing else
37+
* can access the AGF, and we can stall transactions and hence the log whilst
38+
* modifications wait for the AGF lock to be released. This can lead hung tasks
39+
* kicking the hung task timer and rebooting the system. This is bad.
40+
*
41+
* Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
42+
* lock, gathers a range of inode cluster buffers that are allocated, drops the
43+
* AGI lock and then reads all the inode cluster buffers and processes them. It
44+
* loops doing this, using a cursor to keep track of where it is up to in the AG
45+
* for each iteration to restart the INOBT lookup from.
46+
*
47+
* We can't do this exactly with free space - once we drop the AGF lock, the
48+
* state of the free extent is out of our control and we cannot run a discard
49+
* safely on it in this situation. Unless, of course, we've marked the free
50+
* extent as busy and undergoing a discard operation whilst we held the AGF
51+
* locked.
52+
*
53+
* This is exactly how online discard works - free extents are marked busy when
54+
* they are freed, and once the extent free has been committed to the journal,
55+
* the busy extent record is marked as "undergoing discard" and the discard is
56+
* then issued on the free extent. Once the discard completes, the busy extent
57+
* record is removed and the extent is able to be allocated again.
58+
*
59+
* In the context of fstrim, if we find a free extent we need to discard, we
60+
* don't have to discard it immediately. All we need to do it record that free
61+
* extent as being busy and under discard, and all the allocation routines will
62+
* now avoid trying to allocate it. Hence if we mark the extent as busy under
63+
* the AGF lock, we can safely discard it without holding the AGF lock because
64+
* nothing will attempt to allocate that free space until the discard completes.
65+
*
66+
* This also allows us to issue discards asynchronously like we do with online
67+
* discard, and so for fast devices fstrim will run much faster as we can have
68+
* multiple discard operations in flight at once, as well as pipeline the free
69+
* extent search so that it overlaps in flight discard IO.
70+
*/
71+
72+
struct workqueue_struct *xfs_discard_wq;
73+
74+
static void
75+
xfs_discard_endio_work(
76+
struct work_struct *work)
77+
{
78+
struct xfs_busy_extents *extents =
79+
container_of(work, struct xfs_busy_extents, endio_work);
80+
81+
xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
82+
kmem_free(extents->owner);
83+
}
84+
85+
/*
86+
* Queue up the actual completion to a thread to avoid IRQ-safe locking for
87+
* pagb_lock.
88+
*/
89+
static void
90+
xfs_discard_endio(
91+
struct bio *bio)
92+
{
93+
struct xfs_busy_extents *extents = bio->bi_private;
94+
95+
INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
96+
queue_work(xfs_discard_wq, &extents->endio_work);
97+
bio_put(bio);
98+
}
99+
100+
/*
101+
* Walk the discard list and issue discards on all the busy extents in the
102+
* list. We plug and chain the bios so that we only need a single completion
103+
* call to clear all the busy extents once the discards are complete.
104+
*/
105+
int
106+
xfs_discard_extents(
107+
struct xfs_mount *mp,
108+
struct xfs_busy_extents *extents)
109+
{
110+
struct xfs_extent_busy *busyp;
111+
struct bio *bio = NULL;
112+
struct blk_plug plug;
113+
int error = 0;
114+
115+
blk_start_plug(&plug);
116+
list_for_each_entry(busyp, &extents->extent_list, list) {
117+
trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
118+
busyp->length);
119+
120+
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
121+
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
122+
XFS_FSB_TO_BB(mp, busyp->length),
123+
GFP_NOFS, &bio);
124+
if (error && error != -EOPNOTSUPP) {
125+
xfs_info(mp,
126+
"discard failed for extent [0x%llx,%u], error %d",
127+
(unsigned long long)busyp->bno,
128+
busyp->length,
129+
error);
130+
break;
131+
}
132+
}
133+
134+
if (bio) {
135+
bio->bi_private = extents;
136+
bio->bi_end_io = xfs_discard_endio;
137+
submit_bio(bio);
138+
} else {
139+
xfs_discard_endio_work(&extents->endio_work);
140+
}
141+
blk_finish_plug(&plug);
142+
143+
return error;
144+
}
145+
146+
147+
static int
148+
xfs_trim_gather_extents(
24149
struct xfs_perag *pag,
25150
xfs_daddr_t start,
26151
xfs_daddr_t end,
27152
xfs_daddr_t minlen,
153+
struct xfs_alloc_rec_incore *tcur,
154+
struct xfs_busy_extents *extents,
28155
uint64_t *blocks_trimmed)
29156
{
30157
struct xfs_mount *mp = pag->pag_mount;
31-
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
32158
struct xfs_btree_cur *cur;
33159
struct xfs_buf *agbp;
34-
struct xfs_agf *agf;
35160
int error;
36161
int i;
162+
int batch = 100;
37163

38164
/*
39165
* Force out the log. This means any transactions that might have freed
@@ -45,20 +171,28 @@ xfs_trim_extents(
45171
error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
46172
if (error)
47173
return error;
48-
agf = agbp->b_addr;
49174

50175
cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
51176

52177
/*
53-
* Look up the longest btree in the AGF and start with it.
178+
* Look up the extent length requested in the AGF and start with it.
54179
*/
55-
error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
180+
if (tcur->ar_startblock == NULLAGBLOCK)
181+
error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
182+
else
183+
error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
184+
tcur->ar_blockcount, &i);
56185
if (error)
57186
goto out_del_cursor;
187+
if (i == 0) {
188+
/* nothing of that length left in the AG, we are done */
189+
tcur->ar_blockcount = 0;
190+
goto out_del_cursor;
191+
}
58192

59193
/*
60194
* Loop until we are done with all extents that are large
61-
* enough to be worth discarding.
195+
* enough to be worth discarding or we hit batch limits.
62196
*/
63197
while (i) {
64198
xfs_agblock_t fbno;
@@ -73,7 +207,16 @@ xfs_trim_extents(
73207
error = -EFSCORRUPTED;
74208
break;
75209
}
76-
ASSERT(flen <= be32_to_cpu(agf->agf_longest));
210+
211+
if (--batch <= 0) {
212+
/*
213+
* Update the cursor to point at this extent so we
214+
* restart the next batch from this extent.
215+
*/
216+
tcur->ar_startblock = fbno;
217+
tcur->ar_blockcount = flen;
218+
break;
219+
}
77220

78221
/*
79222
* use daddr format for all range/len calculations as that is
@@ -88,6 +231,7 @@ xfs_trim_extents(
88231
*/
89232
if (dlen < minlen) {
90233
trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
234+
tcur->ar_blockcount = 0;
91235
break;
92236
}
93237

@@ -110,29 +254,103 @@ xfs_trim_extents(
110254
goto next_extent;
111255
}
112256

113-
trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen);
114-
error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
115-
if (error)
116-
break;
257+
xfs_extent_busy_insert_discard(pag, fbno, flen,
258+
&extents->extent_list);
117259
*blocks_trimmed += flen;
118-
119260
next_extent:
120261
error = xfs_btree_decrement(cur, 0, &i);
121262
if (error)
122263
break;
123264

124-
if (fatal_signal_pending(current)) {
125-
error = -ERESTARTSYS;
126-
break;
127-
}
265+
/*
266+
* If there's no more records in the tree, we are done. Set the
267+
* cursor block count to 0 to indicate to the caller that there
268+
* is no more extents to search.
269+
*/
270+
if (i == 0)
271+
tcur->ar_blockcount = 0;
128272
}
129273

274+
/*
275+
* If there was an error, release all the gathered busy extents because
276+
* we aren't going to issue a discard on them any more.
277+
*/
278+
if (error)
279+
xfs_extent_busy_clear(mp, &extents->extent_list, false);
130280
out_del_cursor:
131281
xfs_btree_del_cursor(cur, error);
132282
xfs_buf_relse(agbp);
133283
return error;
134284
}
135285

286+
static bool
287+
xfs_trim_should_stop(void)
288+
{
289+
return fatal_signal_pending(current) || freezing(current);
290+
}
291+
292+
/*
293+
* Iterate the free list gathering extents and discarding them. We need a cursor
294+
* for the repeated iteration of gather/discard loop, so use the longest extent
295+
* we found in the last batch as the key to start the next.
296+
*/
297+
static int
298+
xfs_trim_extents(
299+
struct xfs_perag *pag,
300+
xfs_daddr_t start,
301+
xfs_daddr_t end,
302+
xfs_daddr_t minlen,
303+
uint64_t *blocks_trimmed)
304+
{
305+
struct xfs_alloc_rec_incore tcur = {
306+
.ar_blockcount = pag->pagf_longest,
307+
.ar_startblock = NULLAGBLOCK,
308+
};
309+
int error = 0;
310+
311+
do {
312+
struct xfs_busy_extents *extents;
313+
314+
extents = kzalloc(sizeof(*extents), GFP_KERNEL);
315+
if (!extents) {
316+
error = -ENOMEM;
317+
break;
318+
}
319+
320+
extents->mount = pag->pag_mount;
321+
extents->owner = extents;
322+
INIT_LIST_HEAD(&extents->extent_list);
323+
324+
error = xfs_trim_gather_extents(pag, start, end, minlen,
325+
&tcur, extents, blocks_trimmed);
326+
if (error) {
327+
kfree(extents);
328+
break;
329+
}
330+
331+
/*
332+
* We hand the extent list to the discard function here so the
333+
* discarded extents can be removed from the busy extent list.
334+
* This allows the discards to run asynchronously with gathering
335+
* the next round of extents to discard.
336+
*
337+
* However, we must ensure that we do not reference the extent
338+
* list after this function call, as it may have been freed by
339+
* the time control returns to us.
340+
*/
341+
error = xfs_discard_extents(pag->pag_mount, extents);
342+
if (error)
343+
break;
344+
345+
if (xfs_trim_should_stop())
346+
break;
347+
348+
} while (tcur.ar_blockcount != 0);
349+
350+
return error;
351+
352+
}
353+
136354
/*
137355
* trim a range of the filesystem.
138356
*
@@ -195,12 +413,12 @@ xfs_ioc_trim(
195413
for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
196414
error = xfs_trim_extents(pag, start, end, minlen,
197415
&blocks_trimmed);
198-
if (error) {
416+
if (error)
199417
last_error = error;
200-
if (error == -ERESTARTSYS) {
201-
xfs_perag_rele(pag);
202-
break;
203-
}
418+
419+
if (xfs_trim_should_stop()) {
420+
xfs_perag_rele(pag);
421+
break;
204422
}
205423
}
206424

fs/xfs/xfs_discard.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
#define XFS_DISCARD_H 1
44

55
struct fstrim_range;
6-
struct list_head;
6+
struct xfs_mount;
7+
struct xfs_busy_extents;
78

8-
extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
9+
int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
10+
int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim);
911

1012
#endif /* XFS_DISCARD_H */

0 commit comments

Comments
 (0)