Skip to content

Commit 02f310f

Browse files
jankara authored and tytso committed
ext4: Speedup ext4 orphan inode handling
Ext4 orphan inode handling is a bottleneck for workloads which heavily truncate / unlink small files, since it contends on the global s_orphan_lock mutex (and generally it is difficult to improve the scalability of the on-disk linked list of orphaned inodes).

This patch implements a new way of handling orphan inodes. Instead of linking an orphaned inode into a linked list, we store its inode number in a new special file which we call the "orphan file". Only if there is no more space in the orphan file (too many inodes are currently orphaned) do we fall back to using the old-style linked list. Currently we protect operations in the orphan file with a spinlock for simplicity, but even in this setting we can substantially reduce the length of the critical section and thus speed up some workloads. In the next patch we improve this by making orphan handling lockless.

Note that the change is backwards compatible when the filesystem is clean: the existence of the orphan file is a compat feature, and we set another ro-compat feature indicating that the orphan file needs scanning for orphaned inodes when mounting the filesystem read-write. This ro-compat feature gets cleared on unmount / remount read-only.

Some performance data from an 80 CPU Xeon server with 512 GB of RAM, filesystem located on SSD, average of 5 runs:

stress-orphan (microbenchmark truncating files byte-by-byte from N processes in parallel)

Threads   Time (Vanilla)   Time (Patched)
   1         1.057200         0.945600
   2         1.680400         1.331800
   4         2.547000         1.995000
   8         7.049400         6.424200
  16        14.827800        14.937600
  32        40.948200        33.038200
  64        87.787400        60.823600
 128       206.504000       122.941400

So we can see significant wins nearly across the board (the 16-thread case is roughly unchanged).

Reviewed-by: Theodore Ts'o <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Theodore Ts'o <[email protected]>
1 parent 25c6d98 commit 02f310f

File tree

4 files changed

+394
-52
lines changed

4 files changed

+394
-52
lines changed

fs/ext4/ext4.h

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,7 +1034,14 @@ struct ext4_inode_info {
10341034
*/
10351035
struct rw_semaphore xattr_sem;
10361036

1037-
struct list_head i_orphan; /* unlinked but open inodes */
1037+
/*
1038+
* Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
1039+
* i_orphan is used.
1040+
*/
1041+
union {
1042+
struct list_head i_orphan; /* unlinked but open inodes */
1043+
unsigned int i_orphan_idx; /* Index in orphan file */
1044+
};
10381045

10391046
/* Fast commit related info */
10401047

@@ -1428,7 +1435,8 @@ struct ext4_super_block {
14281435
__u8 s_last_error_errcode;
14291436
__le16 s_encoding; /* Filename charset encoding */
14301437
__le16 s_encoding_flags; /* Filename charset encoding flags */
1431-
__le32 s_reserved[95]; /* Padding to the end of the block */
1438+
__le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
1439+
__le32 s_reserved[94]; /* Padding to the end of the block */
14321440
__le32 s_checksum; /* crc32c(superblock) */
14331441
};
14341442

@@ -1449,6 +1457,7 @@ struct ext4_super_block {
14491457

14501458
/* Types of ext4 journal triggers */
14511459
enum ext4_journal_trigger_type {
1460+
EXT4_JTR_ORPHAN_FILE, /* NOTE(review): presumably the trigger type used for orphan file blocks (cf. ext4_orphan_file_block_trigger) -- confirm against the code that registers triggers */
14521461
EXT4_JTR_NONE /* This must be the last entry for indexing to work! */
14531462
};
14541463

@@ -1465,6 +1474,36 @@ static inline struct ext4_journal_trigger *EXT4_TRIGGER(
14651474
return container_of(trigger, struct ext4_journal_trigger, tr_triggers);
14661475
}
14671476

1477+
#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04
1478+
1479+
/* Structure at the tail of orphan block */
1480+
/*
 * Two little-endian 32-bit fields (8 bytes in total).
 * ext4_inodes_per_orphan_block() subtracts sizeof() of this structure from
 * the block size before counting how many entries fit in a block.
 */
struct ext4_orphan_block_tail {
1481+
__le32 ob_magic; /* presumably holds EXT4_ORPHAN_BLOCK_MAGIC -- TODO confirm against the code that writes it */
1482+
__le32 ob_checksum; /* checksum field; how it is computed is not shown in this header */
1483+
};
1484+
1485+
/*
 * Return how many u32-sized entries fit into one block of sb->s_blocksize
 * bytes once room for struct ext4_orphan_block_tail has been set aside.
 * The division is integer division, so any remainder bytes are unused.
 */
static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
1486+
{
1487+
return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) /
1488+
sizeof(u32);
1489+
}
1490+
1491+
/*
 * Bookkeeping for a single block of the orphan file. struct ext4_orphan_info
 * refers to an array of these through its of_binfo pointer.
 */
struct ext4_orphan_block {
1492+
int ob_free_entries; /* Number of free orphan entries in block */
1493+
struct buffer_head *ob_bh; /* Buffer for orphan block */
1494+
};
1495+
1496+
/*
1497+
* Info about orphan file.
1498+
*/
1499+
/* Embedded in struct ext4_sb_info as the s_orphan_info member. */
struct ext4_orphan_info {
1500+
spinlock_t of_lock; /* Protects operations in the orphan file (per the commit description) */
1501+
int of_blocks; /* Number of orphan blocks in a file */
1502+
__u32 of_csum_seed; /* Checksum seed for orphan file */
1503+
struct ext4_orphan_block *of_binfo; /* Array with info about orphan
1504+
* file blocks */
1505+
};
1506+
14681507
/*
14691508
* fourth extended-fs super-block data in memory
14701509
*/
@@ -1519,9 +1558,11 @@ struct ext4_sb_info {
15191558

15201559
/* Journaling */
15211560
struct journal_s *s_journal;
1522-
struct list_head s_orphan;
1523-
struct mutex s_orphan_lock;
15241561
unsigned long s_ext4_flags; /* Ext4 superblock flags */
1562+
struct mutex s_orphan_lock; /* Protects on disk list changes */
1563+
struct list_head s_orphan; /* List of orphaned inodes in on disk
1564+
list */
1565+
struct ext4_orphan_info s_orphan_info;
15251566
unsigned long s_commit_interval;
15261567
u32 s_max_batch_time;
15271568
u32 s_min_batch_time;
@@ -1859,6 +1900,7 @@ enum {
18591900
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
18601901
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
18611902
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
1903+
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
18621904
};
18631905

18641906
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1960,6 +2002,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
19602002
*/
19612003
#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
19622004
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
2005+
#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */
19632006

19642007
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
19652008
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1980,6 +2023,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
19802023
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
19812024
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
19822025
#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
2026+
#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be
2027+
non-empty */
19832028

19842029
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
19852030
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -2063,6 +2108,7 @@ EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
20632108
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
20642109
EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT)
20652110
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
2111+
EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE)
20662112

20672113
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
20682114
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
@@ -2077,6 +2123,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
20772123
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
20782124
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
20792125
EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
2126+
EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT)
20802127

20812128
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
20822129
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -2110,7 +2157,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
21102157
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
21112158
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
21122159

2113-
#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
2160+
#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \
2161+
EXT4_FEATURE_COMPAT_ORPHAN_FILE)
21142162
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
21152163
EXT4_FEATURE_INCOMPAT_RECOVER| \
21162164
EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -2135,7 +2183,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
21352183
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
21362184
EXT4_FEATURE_RO_COMPAT_QUOTA |\
21372185
EXT4_FEATURE_RO_COMPAT_PROJECT |\
2138-
EXT4_FEATURE_RO_COMPAT_VERITY)
2186+
EXT4_FEATURE_RO_COMPAT_VERITY |\
2187+
EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)
21392188

21402189
#define EXTN_FEATURE_FUNCS(ver) \
21412190
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -2185,7 +2234,6 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
21852234
return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
21862235
}
21872236

2188-
21892237
/*
21902238
* Default values for user and/or group using reserved blocks
21912239
*/
@@ -3768,6 +3816,13 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
37683816
extern int ext4_orphan_del(handle_t *, struct inode *);
37693817
extern void ext4_orphan_cleanup(struct super_block *sb,
37703818
struct ext4_super_block *es);
3819+
extern void ext4_release_orphan_info(struct super_block *sb);
3820+
extern int ext4_init_orphan_info(struct super_block *sb);
3821+
extern int ext4_orphan_file_empty(struct super_block *sb);
3822+
extern void ext4_orphan_file_block_trigger(
3823+
struct jbd2_buffer_trigger_type *triggers,
3824+
struct buffer_head *bh,
3825+
void *data, size_t size);
37713826

37723827
/*
37733828
* Add new method to test whether block and inode bitmaps are properly

fs/ext4/inode.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4624,7 +4624,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
46244624
((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
46254625
ino == le32_to_cpu(es->s_usr_quota_inum) ||
46264626
ino == le32_to_cpu(es->s_grp_quota_inum) ||
4627-
ino == le32_to_cpu(es->s_prj_quota_inum))) ||
4627+
ino == le32_to_cpu(es->s_prj_quota_inum) ||
4628+
ino == le32_to_cpu(es->s_orphan_file_inum))) ||
46284629
(ino < EXT4_ROOT_INO) ||
46294630
(ino > le32_to_cpu(es->s_inodes_count))) {
46304631
if (flags & EXT4_IGET_HANDLE)

0 commit comments

Comments
 (0)