1
1
// SPDX-License-Identifier: GPL-2.0
2
2
/*
3
- * Copyright (C) 2010 Red Hat, Inc.
3
+ * Copyright (C) 2010, 2023 Red Hat, Inc.
4
4
* All Rights Reserved.
5
5
*/
6
6
#include "xfs.h"
19
19
#include "xfs_log.h"
20
20
#include "xfs_ag.h"
21
21
22
- STATIC int
23
- xfs_trim_extents (
22
+ /*
23
+ * Notes on an efficient, low latency fstrim algorithm
24
+ *
25
+ * We need to walk the filesystem free space and issue discards on the free
26
+ * space that meet the search criteria (size and location). We cannot issue
27
+ * discards on extents that might be in use, or are so recently in use they are
28
+ * still marked as busy. To serialise against extent state changes whilst we are
29
+ * gathering extents to trim, we must hold the AGF lock to lock out other
30
+ * allocations and extent free operations that might change extent state.
31
+ *
32
+ * However, we cannot just hold the AGF for the entire AG free space walk whilst
33
+ * we issue discards on each free space that is found. Storage devices can have
34
+ * extremely slow discard implementations (e.g. ceph RBD) and so walking a
35
+ * couple of million free extents and issuing synchronous discards on each
36
+ * extent can take a *long* time. Whilst we are doing this walk, nothing else
37
+ * can access the AGF, and we can stall transactions and hence the log whilst
38
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks kicking the hung task timer and rebooting the system. This is bad.
40
+ *
41
+ * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
42
+ * lock, gathers a range of inode cluster buffers that are allocated, drops the
43
+ * AGI lock and then reads all the inode cluster buffers and processes them. It
44
+ * loops doing this, using a cursor to keep track of where it is up to in the AG
45
+ * for each iteration to restart the INOBT lookup from.
46
+ *
47
+ * We can't do this exactly with free space - once we drop the AGF lock, the
48
+ * state of the free extent is out of our control and we cannot run a discard
49
+ * safely on it in this situation. Unless, of course, we've marked the free
50
+ * extent as busy and undergoing a discard operation whilst we held the AGF
51
+ * locked.
52
+ *
53
+ * This is exactly how online discard works - free extents are marked busy when
54
+ * they are freed, and once the extent free has been committed to the journal,
55
+ * the busy extent record is marked as "undergoing discard" and the discard is
56
+ * then issued on the free extent. Once the discard completes, the busy extent
57
+ * record is removed and the extent is able to be allocated again.
58
+ *
59
+ * In the context of fstrim, if we find a free extent we need to discard, we
60
 * don't have to discard it immediately. All we need to do is record that free
61
+ * extent as being busy and under discard, and all the allocation routines will
62
+ * now avoid trying to allocate it. Hence if we mark the extent as busy under
63
+ * the AGF lock, we can safely discard it without holding the AGF lock because
64
+ * nothing will attempt to allocate that free space until the discard completes.
65
+ *
66
+ * This also allows us to issue discards asynchronously like we do with online
67
+ * discard, and so for fast devices fstrim will run much faster as we can have
68
+ * multiple discard operations in flight at once, as well as pipeline the free
69
+ * extent search so that it overlaps in flight discard IO.
70
+ */
71
+
72
+ struct workqueue_struct * xfs_discard_wq ;
73
+
74
+ static void
75
+ xfs_discard_endio_work (
76
+ struct work_struct * work )
77
+ {
78
+ struct xfs_busy_extents * extents =
79
+ container_of (work , struct xfs_busy_extents , endio_work );
80
+
81
+ xfs_extent_busy_clear (extents -> mount , & extents -> extent_list , false);
82
+ kmem_free (extents -> owner );
83
+ }
84
+
85
+ /*
86
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
87
+ * pagb_lock.
88
+ */
89
+ static void
90
+ xfs_discard_endio (
91
+ struct bio * bio )
92
+ {
93
+ struct xfs_busy_extents * extents = bio -> bi_private ;
94
+
95
+ INIT_WORK (& extents -> endio_work , xfs_discard_endio_work );
96
+ queue_work (xfs_discard_wq , & extents -> endio_work );
97
+ bio_put (bio );
98
+ }
99
+
100
/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 *
 * Returns 0 or a negative errno.  On success the extent list ownership is
 * handed to the bio completion (xfs_discard_endio); the caller must not
 * touch @extents afterwards.  NOTE(review): on error, already-chained bios
 * appear to be left to the submitted chain/caller — confirm against the
 * busy-extent cleanup in the caller.
 */
int
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	/* Plug so the chained discard bios go down as a single stream. */
	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
				busyp->length);

		/* Each call chains another discard bio onto *bio. */
		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, &bio);
		/* -EOPNOTSUPP (device can't discard) is not fatal here. */
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
 "discard failed for extent [0x%llx,%u], error %d",
				(unsigned long long)busyp->bno,
				busyp->length,
				error);
			break;
		}
	}

	if (bio) {
		/* One completion clears all the busy extents at once. */
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		/* Nothing was issued: clear the busy extents synchronously. */
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);

	return error;
}
145
+
146
+
147
+ static int
148
+ xfs_trim_gather_extents (
24
149
struct xfs_perag * pag ,
25
150
xfs_daddr_t start ,
26
151
xfs_daddr_t end ,
27
152
xfs_daddr_t minlen ,
153
+ struct xfs_alloc_rec_incore * tcur ,
154
+ struct xfs_busy_extents * extents ,
28
155
uint64_t * blocks_trimmed )
29
156
{
30
157
struct xfs_mount * mp = pag -> pag_mount ;
31
- struct block_device * bdev = mp -> m_ddev_targp -> bt_bdev ;
32
158
struct xfs_btree_cur * cur ;
33
159
struct xfs_buf * agbp ;
34
- struct xfs_agf * agf ;
35
160
int error ;
36
161
int i ;
162
+ int batch = 100 ;
37
163
38
164
/*
39
165
* Force out the log. This means any transactions that might have freed
@@ -45,20 +171,28 @@ xfs_trim_extents(
45
171
error = xfs_alloc_read_agf (pag , NULL , 0 , & agbp );
46
172
if (error )
47
173
return error ;
48
- agf = agbp -> b_addr ;
49
174
50
175
cur = xfs_allocbt_init_cursor (mp , NULL , agbp , pag , XFS_BTNUM_CNT );
51
176
52
177
/*
53
- * Look up the longest btree in the AGF and start with it.
178
+ * Look up the extent length requested in the AGF and start with it.
54
179
*/
55
- error = xfs_alloc_lookup_ge (cur , 0 , be32_to_cpu (agf -> agf_longest ), & i );
180
+ if (tcur -> ar_startblock == NULLAGBLOCK )
181
+ error = xfs_alloc_lookup_ge (cur , 0 , tcur -> ar_blockcount , & i );
182
+ else
183
+ error = xfs_alloc_lookup_le (cur , tcur -> ar_startblock ,
184
+ tcur -> ar_blockcount , & i );
56
185
if (error )
57
186
goto out_del_cursor ;
187
+ if (i == 0 ) {
188
+ /* nothing of that length left in the AG, we are done */
189
+ tcur -> ar_blockcount = 0 ;
190
+ goto out_del_cursor ;
191
+ }
58
192
59
193
/*
60
194
* Loop until we are done with all extents that are large
61
- * enough to be worth discarding.
195
 * enough to be worth discarding or we hit batch limits.
62
196
*/
63
197
while (i ) {
64
198
xfs_agblock_t fbno ;
@@ -73,7 +207,16 @@ xfs_trim_extents(
73
207
error = - EFSCORRUPTED ;
74
208
break ;
75
209
}
76
- ASSERT (flen <= be32_to_cpu (agf -> agf_longest ));
210
+
211
+ if (-- batch <= 0 ) {
212
+ /*
213
+ * Update the cursor to point at this extent so we
214
+ * restart the next batch from this extent.
215
+ */
216
+ tcur -> ar_startblock = fbno ;
217
+ tcur -> ar_blockcount = flen ;
218
+ break ;
219
+ }
77
220
78
221
/*
79
222
* use daddr format for all range/len calculations as that is
@@ -88,6 +231,7 @@ xfs_trim_extents(
88
231
*/
89
232
if (dlen < minlen ) {
90
233
trace_xfs_discard_toosmall (mp , pag -> pag_agno , fbno , flen );
234
+ tcur -> ar_blockcount = 0 ;
91
235
break ;
92
236
}
93
237
@@ -110,29 +254,103 @@ xfs_trim_extents(
110
254
goto next_extent ;
111
255
}
112
256
113
- trace_xfs_discard_extent (mp , pag -> pag_agno , fbno , flen );
114
- error = blkdev_issue_discard (bdev , dbno , dlen , GFP_NOFS );
115
- if (error )
116
- break ;
257
+ xfs_extent_busy_insert_discard (pag , fbno , flen ,
258
+ & extents -> extent_list );
117
259
* blocks_trimmed += flen ;
118
-
119
260
next_extent :
120
261
error = xfs_btree_decrement (cur , 0 , & i );
121
262
if (error )
122
263
break ;
123
264
124
- if (fatal_signal_pending (current )) {
125
- error = - ERESTARTSYS ;
126
- break ;
127
- }
265
+ /*
266
+ * If there's no more records in the tree, we are done. Set the
267
+ * cursor block count to 0 to indicate to the caller that there
268
+ * is no more extents to search.
269
+ */
270
+ if (i == 0 )
271
+ tcur -> ar_blockcount = 0 ;
128
272
}
129
273
274
+ /*
275
+ * If there was an error, release all the gathered busy extents because
276
+ * we aren't going to issue a discard on them any more.
277
+ */
278
+ if (error )
279
+ xfs_extent_busy_clear (mp , & extents -> extent_list , false);
130
280
out_del_cursor :
131
281
xfs_btree_del_cursor (cur , error );
132
282
xfs_buf_relse (agbp );
133
283
return error ;
134
284
}
135
285
286
+ static bool
287
+ xfs_trim_should_stop (void )
288
+ {
289
+ return fatal_signal_pending (current ) || freezing (current );
290
+ }
291
+
292
/*
 * Iterate the free list gathering extents and discarding them. We need a cursor
 * for the repeated iteration of gather/discard loop, so use the longest extent
 * we found in the last batch as the key to start the next.
 *
 * @start/@end/@minlen are in daddr units; @blocks_trimmed accumulates the
 * number of filesystem blocks handed to discard.  Returns 0 or a negative
 * errno from the gather or discard stages.
 */
static int
xfs_trim_extents(
	struct xfs_perag	*pag,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen,
	uint64_t		*blocks_trimmed)
{
	/*
	 * Seed the cursor with the longest free extent in the AG;
	 * ar_startblock == NULLAGBLOCK tells the gather pass this is the
	 * first iteration.
	 */
	struct xfs_alloc_rec_incore tcur = {
		.ar_blockcount = pag->pagf_longest,
		.ar_startblock = NULLAGBLOCK,
	};
	int			error = 0;

	do {
		struct xfs_busy_extents	*extents;

		/* One extent list per batch; freed by discard completion. */
		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->mount = pag->pag_mount;
		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, start, end, minlen,
				&tcur, extents, blocks_trimmed);
		if (error) {
			/* Gather already cleared its busy extents on error. */
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with gathering
		 * the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(pag->pag_mount, extents);
		if (error)
			break;

		if (xfs_trim_should_stop())
			break;

	/* ar_blockcount == 0 means the gather pass exhausted this AG. */
	} while (tcur.ar_blockcount != 0);

	return error;

}
353
+
136
354
/*
137
355
* trim a range of the filesystem.
138
356
*
@@ -195,12 +413,12 @@ xfs_ioc_trim(
195
413
for_each_perag_range (mp , agno , xfs_daddr_to_agno (mp , end ), pag ) {
196
414
error = xfs_trim_extents (pag , start , end , minlen ,
197
415
& blocks_trimmed );
198
- if (error ) {
416
+ if (error )
199
417
last_error = error ;
200
- if ( error == - ERESTARTSYS ) {
201
- xfs_perag_rele ( pag );
202
- break ;
203
- }
418
+
419
+ if ( xfs_trim_should_stop ()) {
420
+ xfs_perag_rele ( pag ) ;
421
+ break ;
204
422
}
205
423
}
206
424
0 commit comments