Skip to content

Commit 4f92ea0

Browse files
biger410Brian Maly
authored andcommitted
xfs: fix deadlock between shrinker and fs freeze
Orabug: 30944736 Shrinker hold sb->s_umount lock and invoked .destroy_inode to reclaim inode, if fs was freezed, Shrinker would hung by freeze lock. But unfreeze could never happen because it would be hung by sb->s_umount. Backgroud inode inactivation feature could fix this, but it was not merged by mainline yet, according Darrick, even merged, it would be nearly impossible to backport to 4.14. The effort here is to make a one OFF-MAINLINE fix for uek, if future uek have that feature merged, this patch should be dropped. To avoid deadlock, add inode needing inactivation to list and destroy them async. crash7latest> set 132 PID: 132 COMMAND: "kswapd0:0" TASK: ffff9cdc9dfb5f00 [THREAD_INFO: ffff9cdc9dfb5f00] CPU: 6 STATE: TASK_UNINTERRUPTIBLE crash7latest> bt PID: 132 TASK: ffff9cdc9dfb5f00 CPU: 6 COMMAND: "kswapd0:0" #0 [ffffaa5d075bf900] __schedule at ffffffff8186487c #1 [ffffaa5d075bf998] schedule at ffffffff81864e96 #2 [ffffaa5d075bf9b0] rwsem_down_read_failed at ffffffff818689ee #3 [ffffaa5d075bfa40] call_rwsem_down_read_failed at ffffffff81859308 #4 [ffffaa5d075bfa90] __percpu_down_read at ffffffff810ebd38 #5 [ffffaa5d075bfab0] __sb_start_write at ffffffff812859ef #6 [ffffaa5d075bfad0] xfs_trans_alloc at ffffffffc07ebe9c [xfs] #7 [ffffaa5d075bfb18] xfs_free_eofblocks at ffffffffc07c39d1 [xfs] #8 [ffffaa5d075bfb80] xfs_inactive at ffffffffc07de878 [xfs] #9 [ffffaa5d075bfba0] __dta_xfs_fs_destroy_inode_3543 at ffffffffc07e885e [xfs] #10 [ffffaa5d075bfbd0] destroy_inode at ffffffff812a25de #11 [ffffaa5d075bfbe8] evict at ffffffff812a2b73 #12 [ffffaa5d075bfc10] dispose_list at ffffffff812a2c1d #13 [ffffaa5d075bfc38] prune_icache_sb at ffffffff812a421a #14 [ffffaa5d075bfc70] super_cache_scan at ffffffff812870a1 #15 [ffffaa5d075bfcc8] shrink_slab at ffffffff811eebb3 #16 [ffffaa5d075bfdb0] shrink_node at ffffffff811f4788 #17 [ffffaa5d075bfe38] kswapd at ffffffff811f58c3 #18 [ffffaa5d075bff08] kthread at ffffffff810b75d5 #19 [ffffaa5d075bff50] ret_from_fork at ffffffff81a0035e crash7latest> set 31060 PID: 31060 COMMAND: "safefreeze" TASK: ffff9cd292868000 [THREAD_INFO: ffff9cd292868000] CPU: 2 STATE: TASK_UNINTERRUPTIBLE crash7latest> bt PID: 31060 TASK: ffff9cd292868000 CPU: 2 COMMAND: "safefreeze" #0 [ffffaa5d10047c90] __schedule at ffffffff8186487c #1 [ffffaa5d10047d28] schedule at ffffffff81864e96 #2 [ffffaa5d10047d40] rwsem_down_write_failed at ffffffff81868f18 #3 [ffffaa5d10047dd8] call_rwsem_down_write_failed at ffffffff81859367 #4 [ffffaa5d10047e20] down_write at ffffffff81867cfd #5 [ffffaa5d10047e38] thaw_super at ffffffff81285d2d #6 [ffffaa5d10047e60] do_vfs_ioctl at ffffffff81299566 #7 [ffffaa5d10047ee8] sys_ioctl at ffffffff81299709 #8 [ffffaa5d10047f28] do_syscall_64 at ffffffff81003949 #9 [ffffaa5d10047f50] entry_SYSCALL_64_after_hwframe at ffffffff81a001ad RIP: 0000000000453d67 RSP: 00007ffff9c1ce78 RFLAGS: 00000206 RAX: ffffffffffffffda RBX: 0000000001cbe92c RCX: 0000000000453d67 RDX: 0000000000000000 RSI: 00000000c0045878 RDI: 0000000000000014 RBP: 00007ffff9c1cf80 R8: 0000000000000000 R9: 0000000000000012 R10: 0000000000000008 R11: 0000000000000206 R12: 0000000000401fb0 R13: 0000000000402040 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b Signed-off-by: Junxiao Bi <[email protected]> Reviewed-by: Darrick J. Wong <[email protected]> Signed-off-by: Brian Maly <[email protected]>
1 parent 2e065f1 commit 4f92ea0

File tree

8 files changed

+186
-18
lines changed

8 files changed

+186
-18
lines changed

fs/super.c

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,21 +1363,12 @@ int freeze_super(struct super_block *sb)
13631363
}
13641364
EXPORT_SYMBOL(freeze_super);
13651365

1366-
/**
1367-
* thaw_super -- unlock filesystem
1368-
* @sb: the super to thaw
1369-
*
1370-
* Unlocks the filesystem and marks it writeable again after freeze_super().
1371-
*/
1372-
int thaw_super(struct super_block *sb)
1366+
int __thaw_super(struct super_block *sb)
13731367
{
13741368
int error;
13751369

1376-
down_write(&sb->s_umount);
1377-
if (sb->s_writers.frozen == SB_UNFROZEN) {
1378-
up_write(&sb->s_umount);
1370+
if (sb->s_writers.frozen == SB_UNFROZEN)
13791371
return -EINVAL;
1380-
}
13811372

13821373
if (sb->s_flags & MS_RDONLY)
13831374
goto out;
@@ -1387,7 +1378,6 @@ int thaw_super(struct super_block *sb)
13871378
if (error) {
13881379
printk(KERN_ERR
13891380
"VFS:Filesystem thaw failed\n");
1390-
up_write(&sb->s_umount);
13911381
return error;
13921382
}
13931383
}
@@ -1396,8 +1386,27 @@ int thaw_super(struct super_block *sb)
13961386
sb->s_writers.frozen = SB_UNFROZEN;
13971387
smp_wmb();
13981388
wake_up(&sb->s_writers.wait_unfrozen);
1399-
deactivate_locked_super(sb);
14001389

14011390
return 0;
14021391
}
1392+
EXPORT_SYMBOL(__thaw_super);
1393+
1394+
/**
1395+
* thaw_super -- unlock filesystem
1396+
* @sb: the super to thaw
1397+
*
1398+
* Unlocks the filesystem and marks it writeable again after freeze_super().
1399+
*/
1400+
int thaw_super(struct super_block *sb)
1401+
{
1402+
int error;
1403+
1404+
down_write(&sb->s_umount);
1405+
error = __thaw_super(sb);
1406+
if (error)
1407+
up_write(&sb->s_umount);
1408+
else
1409+
deactivate_locked_super(sb);
1410+
return error;
1411+
}
14031412
EXPORT_SYMBOL(thaw_super);

fs/xfs/xfs_icache.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ xfs_inode_alloc(
8080
ip->i_flags = 0;
8181
ip->i_delayed_blks = 0;
8282
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
83+
INIT_LIST_HEAD(&ip->i_inact_list);
8384

8485
return ip;
8586
}

fs/xfs/xfs_inode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ typedef struct xfs_inode {
6767

6868
/* VFS inode */
6969
struct inode i_vnode; /* embedded VFS inode */
70+
71+
struct list_head i_inact_list;
7072
} xfs_inode_t;
7173

7274
/* Convert from vfs inode to xfs inode */

fs/xfs/xfs_mount.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,10 @@ xfs_initialize_perag(
208208
if (radix_tree_preload(GFP_NOFS))
209209
goto out_unwind;
210210

211+
INIT_WORK(&pag->pag_inact_work, xfs_fs_inact_worker);
212+
INIT_LIST_HEAD(&pag->pag_inact_list);
213+
spin_lock_init(&pag->pag_inact_lock);
214+
211215
spin_lock(&mp->m_perag_lock);
212216
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
213217
BUG();
@@ -991,6 +995,7 @@ xfs_unmountfs(
991995
int error;
992996

993997
cancel_delayed_work_sync(&mp->m_eofblocks_work);
998+
flush_workqueue(mp->m_inact_workqueue);
994999

9951000
xfs_qm_unmount_quotas(mp);
9961001
xfs_rtunmount_inodes(mp);

fs/xfs/xfs_mount.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ typedef struct xfs_mount {
170170
struct workqueue_struct *m_log_workqueue;
171171
struct workqueue_struct *m_eofblocks_workqueue;
172172
struct workqueue_struct *m_sync_workqueue;
173+
struct workqueue_struct *m_inact_workqueue;
173174

174175
/*
175176
* Generation of the filesysyem layout. This is incremented by each
@@ -348,6 +349,11 @@ typedef struct xfs_perag {
348349
/* for rcu-safe freeing */
349350
struct rcu_head rcu_head;
350351
int pagb_count; /* pagb slots in use */
352+
353+
/* For inode inactivation */
354+
struct work_struct pag_inact_work;
355+
struct list_head pag_inact_list;
356+
spinlock_t pag_inact_lock;
351357
} xfs_perag_t;
352358

353359
extern int xfs_log_sbcount(xfs_mount_t *);

fs/xfs/xfs_super.c

Lines changed: 147 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,24 @@ xfs_setup_devices(
821821
return 0;
822822
}
823823

824+
STATIC int
825+
xfs_init_inact_workqueue(
826+
struct xfs_mount *mp)
827+
{
828+
mp->m_inact_workqueue = alloc_workqueue("xfs-inact/%s", WQ_FREEZABLE,
829+
xfs_guess_metadata_threads(mp), mp->m_fsname);
830+
if (!mp->m_inact_workqueue)
831+
return -ENOMEM;
832+
return 0;
833+
}
834+
835+
STATIC void
836+
xfs_destroy_inact_workqueue(
837+
struct xfs_mount *mp)
838+
{
839+
destroy_workqueue(mp->m_inact_workqueue);
840+
}
841+
824842
STATIC int
825843
xfs_init_mount_workqueues(
826844
struct xfs_mount *mp)
@@ -989,7 +1007,7 @@ xfs_fs_inode_init_once(
9891007
}
9901008

9911009
STATIC void
992-
xfs_fs_evict_inode(
1010+
_xfs_fs_evict_inode(
9931011
struct inode *inode)
9941012
{
9951013
xfs_inode_t *ip = XFS_I(inode);
@@ -1006,6 +1024,87 @@ xfs_fs_evict_inode(
10061024
xfs_inactive(ip);
10071025
}
10081026

1027+
/*
1028+
* Now that the generic code is guaranteed not to be accessing
1029+
* the linux inode, we can inactivate and reclaim the inode.
1030+
*/
1031+
STATIC void
1032+
xfs_fs_evict_inode(
1033+
struct inode *inode)
1034+
{
1035+
struct xfs_inode *ip = XFS_I(inode);
1036+
struct xfs_mount *mp = ip->i_mount;
1037+
struct xfs_perag *pag;
1038+
1039+
if (xfs_inode_needs_inactivation(ip)) {
1040+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1041+
spin_lock(&pag->pag_inact_lock);
1042+
list_add_tail(&ip->i_inact_list, &pag->pag_inact_list);
1043+
spin_unlock(&pag->pag_inact_lock);
1044+
queue_work(mp->m_inact_workqueue, &pag->pag_inact_work);
1045+
xfs_perag_put(pag);
1046+
return;
1047+
}
1048+
1049+
_xfs_fs_evict_inode(inode);
1050+
}
1051+
1052+
void
1053+
xfs_fs_inact_worker(
1054+
struct work_struct *work)
1055+
{
1056+
struct xfs_perag *pag = container_of(work,
1057+
struct xfs_perag, pag_inact_work);
1058+
struct list_head list;
1059+
struct xfs_inode *ip;
1060+
struct xfs_inode *next_ip;
1061+
struct xfs_mount *mp;
1062+
1063+
mp = pag->pag_mount;
1064+
while (1) {
1065+
/* fs freezed, return to avoid hung task, requeue at thaw. */
1066+
if (!sb_start_write_trylock(mp->m_super))
1067+
return;
1068+
1069+
spin_lock(&pag->pag_inact_lock);
1070+
if (list_empty(&pag->pag_inact_list)) {
1071+
spin_unlock(&pag->pag_inact_lock);
1072+
sb_end_write(mp->m_super);
1073+
return;
1074+
}
1075+
list_replace_init(&pag->pag_inact_list, &list);
1076+
spin_unlock(&pag->pag_inact_lock);
1077+
1078+
list_for_each_entry_safe(ip, next_ip, &list, i_inact_list) {
1079+
list_del_init(&ip->i_inact_list);
1080+
_xfs_fs_evict_inode(&ip->i_vnode);
1081+
cond_resched();
1082+
}
1083+
sb_end_write(mp->m_super);
1084+
}
1085+
}
1086+
1087+
STATIC void
1088+
xfs_fs_requeue_inact_work(
1089+
struct xfs_mount *mp)
1090+
{
1091+
struct xfs_perag *pag;
1092+
xfs_agnumber_t index;
1093+
1094+
for (index = 0; index < mp->m_sb.sb_agcount; index++) {
1095+
pag = xfs_perag_get(mp, index);
1096+
spin_lock(&pag->pag_inact_lock);
1097+
if (list_empty(&pag->pag_inact_list)) {
1098+
spin_unlock(&pag->pag_inact_lock);
1099+
xfs_perag_put(pag);
1100+
continue;
1101+
}
1102+
spin_unlock(&pag->pag_inact_lock);
1103+
queue_work(mp->m_inact_workqueue, &pag->pag_inact_work);
1104+
xfs_perag_put(pag);
1105+
}
1106+
}
1107+
10091108
/*
10101109
* We do an unlocked check for XFS_IDONTCACHE here because we are already
10111110
* serialised against cache hits here via the inode->i_lock and igrab() in
@@ -1284,6 +1383,7 @@ xfs_fs_remount(
12841383

12851384
/* rw -> ro */
12861385
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1386+
flush_workqueue(mp->m_inact_workqueue);
12871387
/*
12881388
* Before we sync the metadata, we need to free up the reserve
12891389
* block pool so that the used block count in the superblock on
@@ -1299,6 +1399,19 @@ xfs_fs_remount(
12991399
return 0;
13001400
}
13011401

1402+
STATIC int
1403+
xfs_fs_freeze_super(struct super_block *sb)
1404+
{
1405+
struct xfs_mount *mp = XFS_M(sb);
1406+
1407+
/*
1408+
* clean up inactive inodes before freezing to minimize
1409+
* the amount of recovery work if we crash while frozen.
1410+
*/
1411+
flush_workqueue(mp->m_inact_workqueue);
1412+
return freeze_super(sb);
1413+
}
1414+
13021415
/*
13031416
* Second stage of a freeze. The data is already frozen so we only
13041417
* need to take care of the metadata. Once that's done sync the superblock
@@ -1316,6 +1429,25 @@ xfs_fs_freeze(
13161429
return xfs_sync_sb(mp, true);
13171430
}
13181431

1432+
STATIC int
1433+
xfs_fs_thaw_super(
1434+
struct super_block *sb)
1435+
{
1436+
struct xfs_mount *mp = XFS_M(sb);
1437+
int error;
1438+
1439+
down_write(&sb->s_umount);
1440+
error = __thaw_super(sb);
1441+
if (error)
1442+
up_write(&sb->s_umount);
1443+
else {
1444+
/* inact work was skiped for fs frozen, requeue here. */
1445+
xfs_fs_requeue_inact_work(mp);
1446+
deactivate_locked_super(sb);
1447+
}
1448+
return error;
1449+
}
1450+
13191451
STATIC int
13201452
xfs_fs_unfreeze(
13211453
struct super_block *sb)
@@ -1512,17 +1644,22 @@ xfs_fs_fill_super(
15121644
if (error)
15131645
goto out_free_stats;
15141646

1515-
error = xfs_finish_flags(mp);
1647+
/* worker thread number depends on agcount. */
1648+
error = xfs_init_inact_workqueue(mp);
15161649
if (error)
15171650
goto out_free_sb;
15181651

1652+
error = xfs_finish_flags(mp);
1653+
if (error)
1654+
goto out_destroy_inact_workqueue;
1655+
15191656
error = xfs_setup_devices(mp);
15201657
if (error)
1521-
goto out_free_sb;
1658+
goto out_destroy_inact_workqueue;
15221659

15231660
error = xfs_filestream_mount(mp);
15241661
if (error)
1525-
goto out_free_sb;
1662+
goto out_destroy_inact_workqueue;
15261663

15271664
/*
15281665
* we must configure the block size in the superblock before we run the
@@ -1573,6 +1710,8 @@ xfs_fs_fill_super(
15731710

15741711
out_filestream_unmount:
15751712
xfs_filestream_unmount(mp);
1713+
out_destroy_inact_workqueue:
1714+
xfs_destroy_inact_workqueue(mp);
15761715
out_free_sb:
15771716
xfs_freesb(mp);
15781717
out_free_stats:
@@ -1592,7 +1731,7 @@ xfs_fs_fill_super(
15921731
out_unmount:
15931732
xfs_filestream_unmount(mp);
15941733
xfs_unmountfs(mp);
1595-
goto out_free_sb;
1734+
goto out_destroy_inact_workqueue;
15961735
}
15971736

15981737
STATIC void
@@ -1609,6 +1748,7 @@ xfs_fs_put_super(
16091748
free_percpu(mp->m_stats.xs_stats);
16101749
xfs_destroy_percpu_counters(mp);
16111750
xfs_destroy_mount_workqueues(mp);
1751+
xfs_destroy_inact_workqueue(mp);
16121752
xfs_close_devices(mp);
16131753
xfs_free_fsname(mp);
16141754
kfree(mp);
@@ -1647,7 +1787,9 @@ static const struct super_operations xfs_super_operations = {
16471787
.drop_inode = xfs_fs_drop_inode,
16481788
.put_super = xfs_fs_put_super,
16491789
.sync_fs = xfs_fs_sync_fs,
1790+
.freeze_super = xfs_fs_freeze_super,
16501791
.freeze_fs = xfs_fs_freeze,
1792+
.thaw_super = xfs_fs_thaw_super,
16511793
.unfreeze_fs = xfs_fs_unfreeze,
16521794
.statfs = xfs_fs_statfs,
16531795
.remount_fs = xfs_fs_remount,

fs/xfs/xfs_super.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,15 @@ struct xfs_inode;
6060
struct xfs_mount;
6161
struct xfs_buftarg;
6262
struct block_device;
63+
struct work_struct;
6364

6465
extern __uint64_t xfs_max_file_offset(unsigned int);
6566

6667
extern void xfs_flush_inodes(struct xfs_mount *mp);
6768
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
6869
extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
6970
extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
71+
extern void xfs_fs_inact_worker(struct work_struct *work);
7072

7173
extern const struct export_operations xfs_export_operations;
7274
extern const struct xattr_handler *xfs_xattr_handlers[];

include/linux/fs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2009,6 +2009,7 @@ extern int user_statfs(const char __user *, struct kstatfs *);
20092009
extern int fd_statfs(int, struct kstatfs *);
20102010
extern int vfs_ustat(dev_t, struct kstatfs *);
20112011
extern int freeze_super(struct super_block *super);
2012+
extern int __thaw_super(struct super_block *super);
20122013
extern int thaw_super(struct super_block *super);
20132014
extern bool our_mnt(struct vfsmount *mnt);
20142015

0 commit comments

Comments
 (0)