@@ -8,6 +8,7 @@
 #include "extent_map.h"
 #include "compression.h"
 #include "btrfs_inode.h"
+#include "disk-io.h"
 
 
 static struct kmem_cache *extent_map_cache;
@@ -1026,3 +1027,162 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
 	free_extent_map(split_pre);
 	return ret;
 }
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+{
+	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+	struct extent_map_tree *tree = &inode->extent_tree;
+	long nr_dropped = 0;
+	struct rb_node *node;
+
+	/*
+	 * Take the mmap lock so that we serialize with the inode logging phase
+	 * of fsync, because we may need to set the full sync flag on the inode
+	 * in case we have to remove extent maps from the tree's list of
+	 * modified extents. If we set the full sync flag on the inode while an
+	 * fsync is in progress, we may risk missing new extents: before the
+	 * flag is set, fsync decides to only wait for writeback to complete,
+	 * and then during inode logging it sees the flag set and uses the
+	 * subvolume tree to find new extents, which may not be there yet
+	 * because the ordered extents haven't completed yet.
+	 *
+	 * We also use a try lock because otherwise we could deadlock: the
+	 * shrinker for this filesystem may be invoked while we are in a path
+	 * that is already holding the mmap lock in write mode. For example,
+	 * in a reflink operation, while COWing an extent buffer and allocating
+	 * pages for a new extent buffer under memory pressure, the shrinker
+	 * may be invoked, and we would then deadlock by attempting to read
+	 * lock the mmap lock while already holding a write lock on it.
+	 */
+	if (!down_read_trylock(&inode->i_mmap_lock))
+		return 0;
+
+	write_lock(&tree->lock);
+	node = rb_first_cached(&tree->map);
+	while (node) {
+		struct extent_map *em;
+
+		em = rb_entry(node, struct extent_map, rb_node);
+		node = rb_next(node);
+		(*scanned)++;
+
+		if (em->flags & EXTENT_FLAG_PINNED)
+			goto next;
+
+		/*
+		 * If the extent map is in the list of modified extents (new)
+		 * and its generation is the same as (or greater than) the
+		 * current fs generation, it was not yet persisted, so we have
+		 * to set the full sync flag so that the next fsync will not
+		 * miss it.
+		 */
+		if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
+			btrfs_set_inode_full_sync(inode);
+
+		remove_extent_mapping(inode, em);
+		/* Drop the reference for the tree. */
+		free_extent_map(em);
+		nr_dropped++;
+next:
+		if (*scanned >= nr_to_scan)
+			break;
+
+		/*
+		 * Restart from the first node if we had to reschedule, since
+		 * extent maps that were pinned before may have become unpinned
+		 * while we released the lock and took it again.
+		 */
+		if (cond_resched_rwlock_write(&tree->lock))
+			node = rb_first_cached(&tree->map);
+	}
+	write_unlock(&tree->lock);
+	up_read(&inode->i_mmap_lock);
+
+	return nr_dropped;
+}
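The try-lock reasoning in the comment above generalizes beyond btrfs: a reclaim-style path that can be entered while the caller already owns the lock must give up rather than block. Below is a minimal userspace sketch of that pattern (not btrfs code; `map_lock` and `scan_under_trylock` are hypothetical names), using a POSIX rwlock in place of the kernel's `i_mmap_lock`:

```c
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Hypothetical reclaim hook: skip the scan instead of blocking on map_lock. */
static long scan_under_trylock(void)
{
	long dropped = 0;

	if (pthread_rwlock_tryrdlock(&map_lock) != 0)
		return 0;	/* Lock busy: report no progress rather than deadlock. */

	/* ... walk the structure and drop unused entries here ... */

	pthread_rwlock_unlock(&map_lock);
	return dropped;
}

int main(void)
{
	printf("dropped %ld entries\n", scan_under_trylock());
	return 0;
}
```

Returning 0 simply tells the caller that no progress was made on this round; a later invocation can try again once the lock is free.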
+
+static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_inode *inode;
+	long nr_dropped = 0;
+	u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+
+	inode = btrfs_find_first_inode(root, min_ino);
+	while (inode) {
+		nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+
+		min_ino = btrfs_ino(inode) + 1;
+		fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
+		iput(&inode->vfs_inode);
+
+		if (*scanned >= nr_to_scan)
+			break;
+
+		cond_resched();
+		inode = btrfs_find_first_inode(root, min_ino);
+	}
+
+	if (inode) {
+		/*
+		 * There are still inodes in this root or we happened to process
+		 * the last one and reached the scan limit. In either case set
+		 * the current root to this one, so we'll resume from the next
+		 * inode if there is one or we will find out this was the last
+		 * one and move to the next root.
+		 */
+		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+	} else {
+		/*
+		 * No more inodes in this root, so set extent_map_shrinker_last_ino
+		 * to 0 so that when processing the next root we start from its
+		 * first inode.
+		 */
+		fs_info->extent_map_shrinker_last_ino = 0;
+		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+	}
+
+	return nr_dropped;
+}
+
+long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+	const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
+	u64 next_root_id = start_root_id;
+	bool cycled = false;
+	long nr_dropped = 0;
+	long scanned = 0;
+
+	while (scanned < nr_to_scan) {
+		struct btrfs_root *root;
+		unsigned long count;
+
+		spin_lock(&fs_info->fs_roots_radix_lock);
+		count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					       (void **)&root,
+					       (unsigned long)next_root_id, 1);
+		if (count == 0) {
+			spin_unlock(&fs_info->fs_roots_radix_lock);
+			if (start_root_id > 0 && !cycled) {
+				next_root_id = 0;
+				fs_info->extent_map_shrinker_last_root = 0;
+				fs_info->extent_map_shrinker_last_ino = 0;
+				cycled = true;
+				continue;
+			}
+			break;
+		}
+		next_root_id = btrfs_root_id(root) + 1;
+		root = btrfs_grab_root(root);
+		spin_unlock(&fs_info->fs_roots_radix_lock);
+
+		if (!root)
+			continue;
+
+		if (is_fstree(btrfs_root_id(root)))
+			nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+
+		btrfs_put_root(root);
+	}
+
+	return nr_dropped;
+}
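For readers unfamiliar with this style of shrinker, the resume-cursor logic shared by btrfs_scan_root() and btrfs_free_extent_maps() (remember the last processed id, resume at id + 1, wrap around to 0 at most once per invocation) can be modeled with a small self-contained sketch. This is an illustration only, with hypothetical names (`last_id`, `lookup_from`, `scan`) over a plain sorted array, not btrfs code:

```c
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical persistent cursor, like extent_map_shrinker_last_root. */
static unsigned long last_id;

/* Pretend "roots": non-zero ids in ascending order. */
static const unsigned long ids[] = { 5, 256, 257, 260 };
#define NR_IDS (sizeof(ids) / sizeof(ids[0]))

/* Return the first id >= min_id, or 0 if there is none. */
static unsigned long lookup_from(unsigned long min_id)
{
	for (size_t i = 0; i < NR_IDS; i++)
		if (ids[i] >= min_id)
			return ids[i];
	return 0;
}

/* Visit up to nr_to_scan ids, resuming after last_id and wrapping once. */
static long scan(long nr_to_scan)
{
	const unsigned long start_id = last_id;
	unsigned long next_id = start_id;
	bool cycled = false;
	long scanned = 0;

	while (scanned < nr_to_scan) {
		unsigned long id = lookup_from(next_id);

		if (id == 0) {
			/* Reached the end: wrap to the start once, then stop. */
			if (start_id > 0 && !cycled) {
				next_id = 0;
				cycled = true;
				continue;
			}
			break;
		}
		next_id = id + 1;
		last_id = next_id;	/* Remember where to resume next call. */
		printf("visited id %lu\n", id);
		scanned++;
	}
	return scanned;
}

int main(void)
{
	scan(3);	/* Visits 5, 256, 257. */
	scan(3);	/* Visits 260, wraps around, then visits 5 and 256. */
	return 0;
}
```

Persisting the cursor across calls spreads the scanning cost over successive shrinker invocations instead of rescanning the same entries from the beginning each time, which is the design choice the fields extent_map_shrinker_last_root and extent_map_shrinker_last_ino implement in the patch above.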