Skip to content

Commit 4a79a98

Browse files
jankara authored and tytso committed
ext4: Improve scalability of ext4 orphan file handling
Even though the length of the critical section when adding / removing orphaned inodes was significantly reduced by using the orphan file, the contention of the lock protecting the orphan file still appears high in profiles for truncate / unlink intensive workloads with a high number of threads.

This patch makes handling of the orphan file completely lockless. Also, to reduce conflicts between CPUs, different CPUs start searching for an empty slot in the orphan file in different blocks.

Performance comparison of locked orphan file handling, lockless orphan file handling, and completely disabled orphan inode handling from an 80 CPU Xeon Server with 526 GB of RAM, filesystem located on a SAS SSD disk, average of 5 runs:

stress-orphan (microbenchmark truncating files byte-by-byte from N processes in parallel)

Threads  Time            Time             Time
         Orphan locked   Orphan lockless  No orphan
  1        0.945600        0.939400         0.891200
  2        1.331800        1.246600         1.174400
  4        1.995000        1.780600         1.713200
  8        6.424200        4.900000         4.106000
 16       14.937600        8.516400         8.138000
 32       33.038200       24.565600        24.002200
 64       60.823600       39.844600        38.440200
128      122.941400       70.950400        69.315000

So we can see that with lockless orphan file handling, addition / deletion of orphaned inodes got almost completely out of the picture even for a microbenchmark stressing it.
For the reaim creat_clo workload on a ramdisk there are also noticeable gains (average of 5 runs):

Clients        Vanilla (ops/s)        Patched (ops/s)
creat_clo-1    14705.88 (   0.00%)    14354.07 *  -2.39%*
creat_clo-3    27108.43 (   0.00%)    28301.89 (   4.40%)
creat_clo-5    37406.48 (   0.00%)    45180.73 *  20.78%*
creat_clo-7    41338.58 (   0.00%)    54687.50 *  32.29%*
creat_clo-9    45226.13 (   0.00%)    62937.07 *  39.16%*
creat_clo-11   44000.00 (   0.00%)    65088.76 *  47.93%*
creat_clo-13   36516.85 (   0.00%)    68661.97 *  88.03%*
creat_clo-15   30864.20 (   0.00%)    69551.78 * 125.35%*
creat_clo-17   27478.45 (   0.00%)    67729.08 * 146.48%*
creat_clo-19   25000.00 (   0.00%)    61621.62 * 146.49%*
creat_clo-21   18772.35 (   0.00%)    63829.79 * 240.02%*
creat_clo-23   16698.94 (   0.00%)    61938.96 * 270.92%*
creat_clo-25   14973.05 (   0.00%)    56947.61 * 280.33%*
creat_clo-27   16436.69 (   0.00%)    65008.03 * 295.51%*
creat_clo-29   13949.01 (   0.00%)    69047.62 * 395.00%*
creat_clo-31   14283.52 (   0.00%)    67982.45 * 375.95%*

Reviewed-by: Theodore Ts'o <[email protected]>
Reviewed-by: Lukas Czerner <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Theodore Ts'o <[email protected]>
1 parent 3a6541e commit 4a79a98

File tree

2 files changed

+53
-27
lines changed

2 files changed

+53
-27
lines changed

fs/ext4/ext4.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1489,15 +1489,14 @@ static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
14891489
}
14901490

14911491
struct ext4_orphan_block {
1492-
int ob_free_entries; /* Number of free orphan entries in block */
1492+
atomic_t ob_free_entries; /* Number of free orphan entries in block */
14931493
struct buffer_head *ob_bh; /* Buffer for orphan block */
14941494
};
14951495

14961496
/*
14971497
* Info about orphan file.
14981498
*/
14991499
struct ext4_orphan_info {
1500-
spinlock_t of_lock;
15011500
int of_blocks; /* Number of orphan blocks in a file */
15021501
__u32 of_csum_seed; /* Checksum seed for orphan file */
15031502
struct ext4_orphan_block *of_binfo; /* Array with info about orphan

fs/ext4/orphan.c

Lines changed: 52 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -10,16 +10,31 @@
1010

1111
static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
1212
{
13-
int i, j;
13+
int i, j, start;
1414
struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
1515
int ret = 0;
16+
bool found = false;
1617
__le32 *bdata;
1718
int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
19+
int looped = 0;
20+
21+
/*
22+
* Find block with free orphan entry. Use CPU number for a naive hash
23+
* for a search start in the orphan file
24+
*/
25+
start = raw_smp_processor_id()*13 % oi->of_blocks;
26+
i = start;
27+
do {
28+
if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
29+
>= 0) {
30+
found = true;
31+
break;
32+
}
33+
if (++i >= oi->of_blocks)
34+
i = 0;
35+
} while (i != start);
1836

19-
spin_lock(&oi->of_lock);
20-
for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++);
21-
if (i == oi->of_blocks) {
22-
spin_unlock(&oi->of_lock);
37+
if (!found) {
2338
/*
2439
* For now we don't grow or shrink orphan file. We just use
2540
* whatever was allocated at mke2fs time. The additional
@@ -28,28 +43,43 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
2843
*/
2944
return -ENOSPC;
3045
}
31-
oi->of_binfo[i].ob_free_entries--;
32-
spin_unlock(&oi->of_lock);
3346

34-
/*
35-
* Get access to orphan block. We have dropped of_lock but since we
36-
* have decremented number of free entries we are guaranteed free entry
37-
* in our block.
38-
*/
3947
ret = ext4_journal_get_write_access(handle, inode->i_sb,
4048
oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
41-
if (ret)
49+
if (ret) {
50+
atomic_inc(&oi->of_binfo[i].ob_free_entries);
4251
return ret;
52+
}
4353

4454
bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
45-
spin_lock(&oi->of_lock);
4655
/* Find empty slot in a block */
47-
for (j = 0; j < inodes_per_ob && bdata[j]; j++);
48-
BUG_ON(j == inodes_per_ob);
49-
bdata[j] = cpu_to_le32(inode->i_ino);
56+
j = 0;
57+
do {
58+
if (looped) {
59+
/*
60+
* Did we walk through the block several times without
61+
* finding free entry? It is theoretically possible
62+
* if entries get constantly allocated and freed or
63+
* if the block is corrupted. Avoid indefinite looping
64+
* and bail. We'll use orphan list instead.
65+
*/
66+
if (looped > 3) {
67+
atomic_inc(&oi->of_binfo[i].ob_free_entries);
68+
return -ENOSPC;
69+
}
70+
cond_resched();
71+
}
72+
while (bdata[j]) {
73+
if (++j >= inodes_per_ob) {
74+
j = 0;
75+
looped++;
76+
}
77+
}
78+
} while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) !=
79+
(__le32)0);
80+
5081
EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
5182
ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
52-
spin_unlock(&oi->of_lock);
5383

5484
return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
5585
}
@@ -180,10 +210,8 @@ static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
180210
goto out;
181211

182212
bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
183-
spin_lock(&oi->of_lock);
184213
bdata[off] = 0;
185-
oi->of_binfo[blk].ob_free_entries++;
186-
spin_unlock(&oi->of_lock);
214+
atomic_inc(&oi->of_binfo[blk].ob_free_entries);
187215
ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
188216
out:
189217
ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
@@ -552,8 +580,6 @@ int ext4_init_orphan_info(struct super_block *sb)
552580
struct ext4_orphan_block_tail *ot;
553581
ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
554582

555-
spin_lock_init(&oi->of_lock);
556-
557583
if (!ext4_has_feature_orphan_file(sb))
558584
return 0;
559585

@@ -597,7 +623,7 @@ int ext4_init_orphan_info(struct super_block *sb)
597623
for (j = 0; j < inodes_per_ob; j++)
598624
if (bdata[j] == 0)
599625
free++;
600-
oi->of_binfo[i].ob_free_entries = free;
626+
atomic_set(&oi->of_binfo[i].ob_free_entries, free);
601627
}
602628
iput(inode);
603629
return 0;
@@ -619,7 +645,8 @@ int ext4_orphan_file_empty(struct super_block *sb)
619645
if (!ext4_has_feature_orphan_file(sb))
620646
return 1;
621647
for (i = 0; i < oi->of_blocks; i++)
622-
if (oi->of_binfo[i].ob_free_entries != inodes_per_ob)
648+
if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=
649+
inodes_per_ob)
623650
return 0;
624651
return 1;
625652
}

0 commit comments

Comments
 (0)