Skip to content

Commit 8820132

Browse files
biger410 authored and Somasundaram Krishnasamy committed
xfs: fix deadlock between shrinker and fs freeze
Orabug: 30898016 The shrinker holds the sb->s_umount lock and invokes .destroy_inode to reclaim inodes; if the fs is frozen, the shrinker hangs on the freeze lock. But the unfreeze can never happen, because it in turn hangs on sb->s_umount. The background inode inactivation feature could fix this, but it has not been merged into mainline yet; according to Darrick, even once merged, it would be nearly impossible to backport to 4.14. The effort here is to make a one-off, OFF-MAINLINE fix for uek; if a future uek has that feature merged, this patch should be dropped. To avoid the deadlock, add inodes needing inactivation to a list and destroy them asynchronously. crash7latest> set 132 PID: 132 COMMAND: "kswapd0:0" TASK: ffff9cdc9dfb5f00 [THREAD_INFO: ffff9cdc9dfb5f00] CPU: 6 STATE: TASK_UNINTERRUPTIBLE crash7latest> bt PID: 132 TASK: ffff9cdc9dfb5f00 CPU: 6 COMMAND: "kswapd0:0" #0 [ffffaa5d075bf900] __schedule at ffffffff8186487c #1 [ffffaa5d075bf998] schedule at ffffffff81864e96 #2 [ffffaa5d075bf9b0] rwsem_down_read_failed at ffffffff818689ee #3 [ffffaa5d075bfa40] call_rwsem_down_read_failed at ffffffff81859308 #4 [ffffaa5d075bfa90] __percpu_down_read at ffffffff810ebd38 #5 [ffffaa5d075bfab0] __sb_start_write at ffffffff812859ef #6 [ffffaa5d075bfad0] xfs_trans_alloc at ffffffffc07ebe9c [xfs] #7 [ffffaa5d075bfb18] xfs_free_eofblocks at ffffffffc07c39d1 [xfs] #8 [ffffaa5d075bfb80] xfs_inactive at ffffffffc07de878 [xfs] #9 [ffffaa5d075bfba0] __dta_xfs_fs_destroy_inode_3543 at ffffffffc07e885e [xfs] #10 [ffffaa5d075bfbd0] destroy_inode at ffffffff812a25de #11 [ffffaa5d075bfbe8] evict at ffffffff812a2b73 #12 [ffffaa5d075bfc10] dispose_list at ffffffff812a2c1d #13 [ffffaa5d075bfc38] prune_icache_sb at ffffffff812a421a #14 [ffffaa5d075bfc70] super_cache_scan at ffffffff812870a1 #15 [ffffaa5d075bfcc8] shrink_slab at ffffffff811eebb3 #16 [ffffaa5d075bfdb0] shrink_node at ffffffff811f4788 #17 [ffffaa5d075bfe38] kswapd at ffffffff811f58c3 #18 [ffffaa5d075bff08] kthread at ffffffff810b75d5 #19 [ffffaa5d075bff50] ret_from_fork at 
ffffffff81a0035e crash7latest> set 31060 PID: 31060 COMMAND: "safefreeze" TASK: ffff9cd292868000 [THREAD_INFO: ffff9cd292868000] CPU: 2 STATE: TASK_UNINTERRUPTIBLE crash7latest> bt PID: 31060 TASK: ffff9cd292868000 CPU: 2 COMMAND: "safefreeze" #0 [ffffaa5d10047c90] __schedule at ffffffff8186487c #1 [ffffaa5d10047d28] schedule at ffffffff81864e96 #2 [ffffaa5d10047d40] rwsem_down_write_failed at ffffffff81868f18 #3 [ffffaa5d10047dd8] call_rwsem_down_write_failed at ffffffff81859367 #4 [ffffaa5d10047e20] down_write at ffffffff81867cfd #5 [ffffaa5d10047e38] thaw_super at ffffffff81285d2d #6 [ffffaa5d10047e60] do_vfs_ioctl at ffffffff81299566 #7 [ffffaa5d10047ee8] sys_ioctl at ffffffff81299709 #8 [ffffaa5d10047f28] do_syscall_64 at ffffffff81003949 #9 [ffffaa5d10047f50] entry_SYSCALL_64_after_hwframe at ffffffff81a001ad RIP: 0000000000453d67 RSP: 00007ffff9c1ce78 RFLAGS: 00000206 RAX: ffffffffffffffda RBX: 0000000001cbe92c RCX: 0000000000453d67 RDX: 0000000000000000 RSI: 00000000c0045878 RDI: 0000000000000014 RBP: 00007ffff9c1cf80 R8: 0000000000000000 R9: 0000000000000012 R10: 0000000000000008 R11: 0000000000000206 R12: 0000000000401fb0 R13: 0000000000402040 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b Signed-off-by: Junxiao Bi <[email protected]> Reviewed-by: Darrick J. Wong <[email protected]> Signed-off-by: Somasundaram Krishnasamy <[email protected]>
1 parent a2bb9a9 commit 8820132

File tree

8 files changed

+186
-22
lines changed

8 files changed

+186
-22
lines changed

fs/super.c

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1509,21 +1509,12 @@ int freeze_super(struct super_block *sb)
15091509
}
15101510
EXPORT_SYMBOL(freeze_super);
15111511

1512-
/**
1513-
* thaw_super -- unlock filesystem
1514-
* @sb: the super to thaw
1515-
*
1516-
* Unlocks the filesystem and marks it writeable again after freeze_super().
1517-
*/
1518-
int thaw_super(struct super_block *sb)
1512+
int __thaw_super(struct super_block *sb)
15191513
{
15201514
int error;
15211515

1522-
down_write(&sb->s_umount);
1523-
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
1524-
up_write(&sb->s_umount);
1516+
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
15251517
return -EINVAL;
1526-
}
15271518

15281519
if (sb_rdonly(sb)) {
15291520
sb->s_writers.frozen = SB_UNFROZEN;
@@ -1538,7 +1529,6 @@ int thaw_super(struct super_block *sb)
15381529
printk(KERN_ERR
15391530
"VFS:Filesystem thaw failed\n");
15401531
lockdep_sb_freeze_release(sb);
1541-
up_write(&sb->s_umount);
15421532
return error;
15431533
}
15441534
}
@@ -1547,7 +1537,26 @@ int thaw_super(struct super_block *sb)
15471537
sb_freeze_unlock(sb);
15481538
out:
15491539
wake_up(&sb->s_writers.wait_unfrozen);
1550-
deactivate_locked_super(sb);
15511540
return 0;
15521541
}
1542+
EXPORT_SYMBOL(__thaw_super);
1543+
1544+
/**
1545+
* thaw_super -- unlock filesystem
1546+
* @sb: the super to thaw
1547+
*
1548+
* Unlocks the filesystem and marks it writeable again after freeze_super().
1549+
*/
1550+
int thaw_super(struct super_block *sb)
1551+
{
1552+
int error;
1553+
1554+
down_write(&sb->s_umount);
1555+
error = __thaw_super(sb);
1556+
if (error)
1557+
up_write(&sb->s_umount);
1558+
else
1559+
deactivate_locked_super(sb);
1560+
return error;
1561+
}
15531562
EXPORT_SYMBOL(thaw_super);

fs/xfs/xfs_icache.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ xfs_inode_alloc(
8282
ip->i_flags = 0;
8383
ip->i_delayed_blks = 0;
8484
memset(&ip->i_d, 0, sizeof(ip->i_d));
85+
INIT_LIST_HEAD(&ip->i_inact_list);
8586

8687
return ip;
8788
}

fs/xfs/xfs_inode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ typedef struct xfs_inode {
7070

7171
/* VFS inode */
7272
struct inode i_vnode; /* embedded VFS inode */
73+
74+
struct list_head i_inact_list;
7375
} xfs_inode_t;
7476

7577
/* Convert from vfs inode to xfs inode */

fs/xfs/xfs_mount.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,10 @@ xfs_initialize_perag(
223223
if (radix_tree_preload(GFP_NOFS))
224224
goto out_hash_destroy;
225225

226+
INIT_WORK(&pag->pag_inact_work, xfs_fs_inact_worker);
227+
INIT_LIST_HEAD(&pag->pag_inact_list);
228+
spin_lock_init(&pag->pag_inact_lock);
229+
226230
spin_lock(&mp->m_perag_lock);
227231
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
228232
BUG();
@@ -1078,6 +1082,7 @@ xfs_unmountfs(
10781082

10791083
cancel_delayed_work_sync(&mp->m_eofblocks_work);
10801084
cancel_delayed_work_sync(&mp->m_cowblocks_work);
1085+
flush_workqueue(mp->m_inact_workqueue);
10811086

10821087
xfs_fs_unreserve_ag_blocks(mp);
10831088
xfs_qm_unmount_quotas(mp);

fs/xfs/xfs_mount.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ typedef struct xfs_mount {
184184
struct workqueue_struct *m_log_workqueue;
185185
struct workqueue_struct *m_eofblocks_workqueue;
186186
struct workqueue_struct *m_sync_workqueue;
187+
struct workqueue_struct *m_inact_workqueue;
187188

188189
/*
189190
* Generation of the filesystem layout. This is incremented by each
@@ -397,6 +398,11 @@ typedef struct xfs_perag {
397398

398399
/* reference count */
399400
uint8_t pagf_refcount_level;
401+
402+
/* For inode inactivation */
403+
struct work_struct pag_inact_work;
404+
struct list_head pag_inact_list;
405+
spinlock_t pag_inact_lock;
400406
} xfs_perag_t;
401407

402408
static inline struct xfs_ag_resv *

fs/xfs/xfs_super.c

Lines changed: 147 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -865,6 +865,24 @@ xfs_setup_devices(
865865
return 0;
866866
}
867867

868+
STATIC int
869+
xfs_init_inact_workqueue(
870+
struct xfs_mount *mp)
871+
{
872+
mp->m_inact_workqueue = alloc_workqueue("xfs-inact/%s", WQ_FREEZABLE,
873+
xfs_guess_metadata_threads(mp), mp->m_fsname);
874+
if (!mp->m_inact_workqueue)
875+
return -ENOMEM;
876+
return 0;
877+
}
878+
879+
STATIC void
880+
xfs_destroy_inact_workqueue(
881+
struct xfs_mount *mp)
882+
{
883+
destroy_workqueue(mp->m_inact_workqueue);
884+
}
885+
868886
STATIC int
869887
xfs_init_mount_workqueues(
870888
struct xfs_mount *mp)
@@ -971,12 +989,8 @@ xfs_fs_alloc_inode(
971989
return NULL;
972990
}
973991

974-
/*
975-
* Now that the generic code is guaranteed not to be accessing
976-
* the linux inode, we can inactivate and reclaim the inode.
977-
*/
978992
STATIC void
979-
xfs_fs_destroy_inode(
993+
_xfs_fs_destroy_inode(
980994
struct inode *inode)
981995
{
982996
struct xfs_inode *ip = XFS_I(inode);
@@ -1017,6 +1031,87 @@ xfs_fs_destroy_inode(
10171031
xfs_inode_set_reclaim_tag(ip);
10181032
}
10191033

1034+
/*
1035+
* Now that the generic code is guaranteed not to be accessing
1036+
* the linux inode, we can inactivate and reclaim the inode.
1037+
*/
1038+
STATIC void
1039+
xfs_fs_destroy_inode(
1040+
struct inode *inode)
1041+
{
1042+
struct xfs_inode *ip = XFS_I(inode);
1043+
struct xfs_mount *mp = ip->i_mount;
1044+
struct xfs_perag *pag;
1045+
1046+
if (xfs_inode_needs_inactivation(ip)) {
1047+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1048+
spin_lock(&pag->pag_inact_lock);
1049+
list_add_tail(&ip->i_inact_list, &pag->pag_inact_list);
1050+
spin_unlock(&pag->pag_inact_lock);
1051+
queue_work(mp->m_inact_workqueue, &pag->pag_inact_work);
1052+
xfs_perag_put(pag);
1053+
return;
1054+
}
1055+
1056+
_xfs_fs_destroy_inode(inode);
1057+
}
1058+
1059+
void
1060+
xfs_fs_inact_worker(
1061+
struct work_struct *work)
1062+
{
1063+
struct xfs_perag *pag = container_of(work,
1064+
struct xfs_perag, pag_inact_work);
1065+
struct list_head list;
1066+
struct xfs_inode *ip;
1067+
struct xfs_inode *next_ip;
1068+
struct xfs_mount *mp;
1069+
1070+
mp = pag->pag_mount;
1071+
while (1) {
1072+
/* fs is frozen; return to avoid a hung task, work is requeued at thaw. */
1073+
if (!sb_start_write_trylock(mp->m_super))
1074+
return;
1075+
1076+
spin_lock(&pag->pag_inact_lock);
1077+
if (list_empty(&pag->pag_inact_list)) {
1078+
spin_unlock(&pag->pag_inact_lock);
1079+
sb_end_write(mp->m_super);
1080+
return;
1081+
}
1082+
list_replace_init(&pag->pag_inact_list, &list);
1083+
spin_unlock(&pag->pag_inact_lock);
1084+
1085+
list_for_each_entry_safe(ip, next_ip, &list, i_inact_list) {
1086+
list_del_init(&ip->i_inact_list);
1087+
_xfs_fs_destroy_inode(&ip->i_vnode);
1088+
cond_resched();
1089+
}
1090+
sb_end_write(mp->m_super);
1091+
}
1092+
}
1093+
1094+
STATIC void
1095+
xfs_fs_requeue_inact_work(
1096+
struct xfs_mount *mp)
1097+
{
1098+
struct xfs_perag *pag;
1099+
xfs_agnumber_t index;
1100+
1101+
for (index = 0; index < mp->m_sb.sb_agcount; index++) {
1102+
pag = xfs_perag_get(mp, index);
1103+
spin_lock(&pag->pag_inact_lock);
1104+
if (list_empty(&pag->pag_inact_list)) {
1105+
spin_unlock(&pag->pag_inact_lock);
1106+
xfs_perag_put(pag);
1107+
continue;
1108+
}
1109+
spin_unlock(&pag->pag_inact_lock);
1110+
queue_work(mp->m_inact_workqueue, &pag->pag_inact_work);
1111+
xfs_perag_put(pag);
1112+
}
1113+
}
1114+
10201115
static void
10211116
xfs_fs_dirty_inode(
10221117
struct inode *inode,
@@ -1445,6 +1540,7 @@ xfs_fs_remount(
14451540
* final log force+buftarg wait and deadlock the remount.
14461541
*/
14471542
cancel_delayed_work_sync(&mp->m_eofblocks_work);
1543+
flush_workqueue(mp->m_inact_workqueue);
14481544

14491545
xfs_quiesce_attr(mp);
14501546
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1453,6 +1549,19 @@ xfs_fs_remount(
14531549
return 0;
14541550
}
14551551

1552+
STATIC int
1553+
xfs_fs_freeze_super(struct super_block *sb)
1554+
{
1555+
struct xfs_mount *mp = XFS_M(sb);
1556+
1557+
/*
1558+
* clean up inactive inodes before freezing to minimize
1559+
* the amount of recovery work if we crash while frozen.
1560+
*/
1561+
flush_workqueue(mp->m_inact_workqueue);
1562+
return freeze_super(sb);
1563+
}
1564+
14561565
/*
14571566
* Second stage of a freeze. The data is already frozen so we only
14581567
* need to take care of the metadata. Once that's done sync the superblock
@@ -1470,6 +1579,25 @@ xfs_fs_freeze(
14701579
return xfs_sync_sb(mp, true);
14711580
}
14721581

1582+
STATIC int
1583+
xfs_fs_thaw_super(
1584+
struct super_block *sb)
1585+
{
1586+
struct xfs_mount *mp = XFS_M(sb);
1587+
int error;
1588+
1589+
down_write(&sb->s_umount);
1590+
error = __thaw_super(sb);
1591+
if (error)
1592+
up_write(&sb->s_umount);
1593+
else {
1594+
/* inact work was skipped while the fs was frozen, requeue here. */
1595+
xfs_fs_requeue_inact_work(mp);
1596+
deactivate_locked_super(sb);
1597+
}
1598+
return error;
1599+
}
1600+
14731601
STATIC int
14741602
xfs_fs_unfreeze(
14751603
struct super_block *sb)
@@ -1666,17 +1794,22 @@ xfs_fs_fill_super(
16661794
if (error)
16671795
goto out_free_stats;
16681796

1669-
error = xfs_finish_flags(mp);
1797+
/* worker thread number depends on agcount. */
1798+
error = xfs_init_inact_workqueue(mp);
16701799
if (error)
16711800
goto out_free_sb;
16721801

1802+
error = xfs_finish_flags(mp);
1803+
if (error)
1804+
goto out_destroy_inact_workqueue;
1805+
16731806
error = xfs_setup_devices(mp);
16741807
if (error)
1675-
goto out_free_sb;
1808+
goto out_destroy_inact_workqueue;
16761809

16771810
error = xfs_filestream_mount(mp);
16781811
if (error)
1679-
goto out_free_sb;
1812+
goto out_destroy_inact_workqueue;
16801813

16811814
/*
16821815
* we must configure the block size in the superblock before we run the
@@ -1765,6 +1898,8 @@ xfs_fs_fill_super(
17651898

17661899
out_filestream_unmount:
17671900
xfs_filestream_unmount(mp);
1901+
out_destroy_inact_workqueue:
1902+
xfs_destroy_inact_workqueue(mp);
17681903
out_free_sb:
17691904
xfs_freesb(mp);
17701905
out_free_stats:
@@ -1784,7 +1919,7 @@ xfs_fs_fill_super(
17841919
out_unmount:
17851920
xfs_filestream_unmount(mp);
17861921
xfs_unmountfs(mp);
1787-
goto out_free_sb;
1922+
goto out_destroy_inact_workqueue;
17881923
}
17891924

17901925
STATIC void
@@ -1801,6 +1936,7 @@ xfs_fs_put_super(
18011936
free_percpu(mp->m_stats.xs_stats);
18021937
xfs_destroy_percpu_counters(mp);
18031938
xfs_destroy_mount_workqueues(mp);
1939+
xfs_destroy_inact_workqueue(mp);
18041940
xfs_close_devices(mp);
18051941
xfs_free_fsname(mp);
18061942
kfree(mp);
@@ -1839,7 +1975,9 @@ static const struct super_operations xfs_super_operations = {
18391975
.drop_inode = xfs_fs_drop_inode,
18401976
.put_super = xfs_fs_put_super,
18411977
.sync_fs = xfs_fs_sync_fs,
1978+
.freeze_super = xfs_fs_freeze_super,
18421979
.freeze_fs = xfs_fs_freeze,
1980+
.thaw_super = xfs_fs_thaw_super,
18431981
.unfreeze_fs = xfs_fs_unfreeze,
18441982
.statfs = xfs_fs_statfs,
18451983
.remount_fs = xfs_fs_remount,

fs/xfs/xfs_super.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,14 @@ struct xfs_inode;
6060
struct xfs_mount;
6161
struct xfs_buftarg;
6262
struct block_device;
63+
struct work_struct;
6364

6465
extern void xfs_quiesce_attr(struct xfs_mount *mp);
6566
extern void xfs_flush_inodes(struct xfs_mount *mp);
6667
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
6768
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
6869
xfs_agnumber_t agcount);
70+
extern void xfs_fs_inact_worker(struct work_struct *work);
6971

7072
extern const struct export_operations xfs_export_operations;
7173
extern const struct xattr_handler *xfs_xattr_handlers[];

include/linux/fs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2166,6 +2166,7 @@ extern int user_statfs(const char __user *, struct kstatfs *);
21662166
extern int fd_statfs(int, struct kstatfs *);
21672167
extern int vfs_ustat(dev_t, struct kstatfs *);
21682168
extern int freeze_super(struct super_block *super);
2169+
extern int __thaw_super(struct super_block *super);
21692170
extern int thaw_super(struct super_block *super);
21702171
extern bool our_mnt(struct vfsmount *mnt);
21712172
extern __printf(2, 3)

0 commit comments

Comments
 (0)