Skip to content

Commit c6c9c4d

Browse files
fdmanana authored and kdave committed
btrfs: skip inodes without loaded extent maps when shrinking extent maps
If there are inodes that don't have any loaded extent maps, we end up grabbing a reference on them and later adding a delayed iput, which wakes up the cleaner and makes it do unnecessary work. This is common when, for example, the inodes were opened only to run stat(2), or all their extent maps were already released through the folio release callback (btrfs_release_folio()) or by a previous run of the shrinker, or they are directories, which never have extent maps.

Reported-by: Ivan Shapovalov <[email protected]>
Tested-by: Ivan Shapovalov <[email protected]>
Link: https://lore.kernel.org/linux-btrfs/[email protected]/
CC: [email protected] # 6.13+
Reviewed-by: Johannes Thumshirn <[email protected]>
Reviewed-by: Qu Wenruo <[email protected]>
Signed-off-by: Filipe Manana <[email protected]>
Signed-off-by: David Sterba <[email protected]>
1 parent 59f3703 commit c6c9c4d

File tree

1 file changed: 57 additions (+57) and 21 deletions (-21)

fs/btrfs/extent_map.c

Lines changed: 57 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11281128
long nr_dropped = 0;
11291129
struct rb_node *node;
11301130

1131+
lockdep_assert_held_write(&tree->lock);
1132+
11311133
/*
11321134
* Take the mmap lock so that we serialize with the inode logging phase
11331135
* of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11391141
* to find new extents, which may not be there yet because ordered
11401142
* extents haven't completed yet.
11411143
*
1142-
* We also do a try lock because otherwise we could deadlock. This is
1143-
* because the shrinker for this filesystem may be invoked while we are
1144-
* in a path that is holding the mmap lock in write mode. For example in
1145-
* a reflink operation while COWing an extent buffer, when allocating
1146-
* pages for a new extent buffer and under memory pressure, the shrinker
1147-
* may be invoked, and therefore we would deadlock by attempting to read
1148-
* lock the mmap lock while we are holding already a write lock on it.
1144+
* We also do a try lock because we don't want to block for too long and
1145+
* we are holding the extent map tree's lock in write mode.
11491146
*/
11501147
if (!down_read_trylock(&inode->i_mmap_lock))
11511148
return 0;
11521149

1153-
/*
1154-
* We want to be fast so if the lock is busy we don't want to spend time
1155-
* waiting for it - either some task is about to do IO for the inode or
1156-
* we may have another task shrinking extent maps, here in this code, so
1157-
* skip this inode.
1158-
*/
1159-
if (!write_trylock(&tree->lock)) {
1160-
up_read(&inode->i_mmap_lock);
1161-
return 0;
1162-
}
1163-
11641150
node = rb_first(&tree->root);
11651151
while (node) {
11661152
struct rb_node *next = rb_next(node);
@@ -1201,22 +1187,72 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
12011187
break;
12021188
node = next;
12031189
}
1204-
write_unlock(&tree->lock);
12051190
up_read(&inode->i_mmap_lock);
12061191

12071192
return nr_dropped;
12081193
}
12091194

1195+
/*
 * Find the next inode of @root that is worth scanning for extent maps to
 * drop, starting the search at inode number @min_ino.
 *
 * The root's inodes xarray is walked, under its xa_lock, in increasing index
 * order. An inode is skipped when:
 *
 *   - the write lock of its extent map tree can not be taken without
 *     blocking (write_trylock() fails);
 *   - its extent map tree is empty;
 *   - igrab() fails to get a reference on its VFS inode.
 *
 * Returns NULL if no suitable inode was found. Otherwise returns the inode
 * with a reference obtained through igrab() and with its extent map tree's
 * lock still held in write mode, so the caller is responsible for calling
 * write_unlock() on inode->extent_tree.lock (and presumably for dropping the
 * inode reference once done with it - verify against the caller).
 *
 * NOTE(review): @min_ino is a u64 but the xarray index ('from') is an
 * unsigned long, so the value would be truncated where unsigned long is
 * narrower than 64 bits - confirm this is acceptable.
 */
static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
1196+
u64 min_ino)
1197+
{
1198+
struct btrfs_inode *inode;
1199+
unsigned long from = min_ino;
1200+
1201+
xa_lock(&root->inodes);
1202+
while (true) {
1203+
struct extent_map_tree *tree;
1204+
1205+
inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
1206+
if (!inode)
1207+
break;
1208+
1209+
tree = &inode->extent_tree;
1210+
1211+
/*
1212+
* We want to be fast so if the lock is busy we don't want to
1213+
* spend time waiting for it (some task is about to do IO for
1214+
* the inode).
1215+
*/
1216+
if (!write_trylock(&tree->lock))
1217+
goto next;
1218+
1219+
/*
1220+
* Skip inode if it doesn't have loaded extent maps, so we avoid
1221+
* getting a reference and doing an iput later. This includes
1222+
* cases like files that were opened for things like stat(2), or
1223+
* files with all extent maps previously released through the
1224+
* release folio callback (btrfs_release_folio()) or released in
1225+
* a previous run, or directories which never have extent maps.
1226+
*/
1227+
if (RB_EMPTY_ROOT(&tree->root)) {
1228+
write_unlock(&tree->lock);
1229+
goto next;
1230+
}
1231+
1232+
if (igrab(&inode->vfs_inode))
1233+
break;
1234+
1235+
write_unlock(&tree->lock);
1236+
next:
1237+
from = btrfs_ino(inode) + 1;
1238+
cond_resched_lock(&root->inodes.xa_lock);
1239+
}
1240+
xa_unlock(&root->inodes);
1241+
1242+
return inode;
1243+
}
1244+
12101245
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
12111246
{
12121247
struct btrfs_fs_info *fs_info = root->fs_info;
12131248
struct btrfs_inode *inode;
12141249
long nr_dropped = 0;
12151250
u64 min_ino = fs_info->em_shrinker_last_ino + 1;
12161251

1217-
inode = btrfs_find_first_inode(root, min_ino);
1252+
inode = find_first_inode_to_shrink(root, min_ino);
12181253
while (inode) {
12191254
nr_dropped += btrfs_scan_inode(inode, ctx);
1255+
write_unlock(&inode->extent_tree.lock);
12201256

12211257
min_ino = btrfs_ino(inode) + 1;
12221258
fs_info->em_shrinker_last_ino = btrfs_ino(inode);
@@ -1227,7 +1263,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
12271263

12281264
cond_resched();
12291265

1230-
inode = btrfs_find_first_inode(root, min_ino);
1266+
inode = find_first_inode_to_shrink(root, min_ino);
12311267
}
12321268

12331269
if (inode) {

0 commit comments

Comments
 (0)