Skip to content

Commit 8d19f1c

Browse files
Mike ChristieChristian Brauner
authored andcommitted
prctl: PR_{G,S}ET_IO_FLUSHER to support controlling memory reclaim
There are several storage drivers like dm-multipath, iscsi, tcmu-runner, amd nbd that have userspace components that can run in the IO path. For example, iscsi and nbd's userspace deamons may need to recreate a socket and/or send IO on it, and dm-multipath's daemon multipathd may need to send SG IO or read/write IO to figure out the state of paths and re-set them up. In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the memalloc_*_save/restore functions to control the allocation behavior, but for userspace we would end up hitting an allocation that ended up writing data back to the same device we are trying to allocate for. The device is then in a state of deadlock, because to execute IO the device needs to allocate memory, but to allocate memory the memory layers want execute IO to the device. Here is an example with nbd using a local userspace daemon that performs network IO to a remote server. We are using XFS on top of the nbd device, but it can happen with any FS or other modules layered on top of the nbd device that can write out data to free memory. Here a nbd daemon helper thread, msgr-worker-1, is performing a write/sendmsg on a socket to execute a request. This kicks off a reclaim operation which results in a WRITE to the nbd device and the nbd thread calling back into the mm layer. [ 1626.609191] msgr-worker-1 D 0 1026 1 0x00004000 [ 1626.609193] Call Trace: [ 1626.609195] ? __schedule+0x29b/0x630 [ 1626.609197] ? wait_for_completion+0xe0/0x170 [ 1626.609198] schedule+0x30/0xb0 [ 1626.609200] schedule_timeout+0x1f6/0x2f0 [ 1626.609202] ? blk_finish_plug+0x21/0x2e [ 1626.609204] ? _xfs_buf_ioapply+0x2e6/0x410 [ 1626.609206] ? wait_for_completion+0xe0/0x170 [ 1626.609208] wait_for_completion+0x108/0x170 [ 1626.609210] ? wake_up_q+0x70/0x70 [ 1626.609212] ? __xfs_buf_submit+0x12e/0x250 [ 1626.609214] ? xfs_bwrite+0x25/0x60 [ 1626.609215] xfs_buf_iowait+0x22/0xf0 [ 1626.609218] __xfs_buf_submit+0x12e/0x250 [ 1626.609220] xfs_bwrite+0x25/0x60 [ 1626.609222] xfs_reclaim_inode+0x2e8/0x310 [ 1626.609224] xfs_reclaim_inodes_ag+0x1b6/0x300 [ 1626.609227] xfs_reclaim_inodes_nr+0x31/0x40 [ 1626.609228] super_cache_scan+0x152/0x1a0 [ 1626.609231] do_shrink_slab+0x12c/0x2d0 [ 1626.609233] shrink_slab+0x9c/0x2a0 [ 1626.609235] shrink_node+0xd7/0x470 [ 1626.609237] do_try_to_free_pages+0xbf/0x380 [ 1626.609240] try_to_free_pages+0xd9/0x1f0 [ 1626.609245] __alloc_pages_slowpath+0x3a4/0xd30 [ 1626.609251] ? ___slab_alloc+0x238/0x560 [ 1626.609254] __alloc_pages_nodemask+0x30c/0x350 [ 1626.609259] skb_page_frag_refill+0x97/0xd0 [ 1626.609274] sk_page_frag_refill+0x1d/0x80 [ 1626.609279] tcp_sendmsg_locked+0x2bb/0xdd0 [ 1626.609304] tcp_sendmsg+0x27/0x40 [ 1626.609307] sock_sendmsg+0x54/0x60 [ 1626.609308] ___sys_sendmsg+0x29f/0x320 [ 1626.609313] ? sock_poll+0x66/0xb0 [ 1626.609318] ? ep_item_poll.isra.15+0x40/0xc0 [ 1626.609320] ? ep_send_events_proc+0xe6/0x230 [ 1626.609322] ? hrtimer_try_to_cancel+0x54/0xf0 [ 1626.609324] ? ep_read_events_proc+0xc0/0xc0 [ 1626.609326] ? _raw_write_unlock_irq+0xa/0x20 [ 1626.609327] ? ep_scan_ready_list.constprop.19+0x218/0x230 [ 1626.609329] ? __hrtimer_init+0xb0/0xb0 [ 1626.609331] ? _raw_spin_unlock_irq+0xa/0x20 [ 1626.609334] ? ep_poll+0x26c/0x4a0 [ 1626.609337] ? tcp_tsq_write.part.54+0xa0/0xa0 [ 1626.609339] ? release_sock+0x43/0x90 [ 1626.609341] ? _raw_spin_unlock_bh+0xa/0x20 [ 1626.609342] __sys_sendmsg+0x47/0x80 [ 1626.609347] do_syscall_64+0x5f/0x1c0 [ 1626.609349] ? prepare_exit_to_usermode+0x75/0xa0 [ 1626.609351] entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch adds a new prctl command that daemons can use after they have done their initial setup, and before they start to do allocations that are in the IO path. It sets the PF_MEMALLOC_NOIO and PF_LESS_THROTTLE flags so both userspace block and FS threads can use it to avoid the allocation recursion and try to prevent from being throttled while writing out data to free up memory. Signed-off-by: Mike Christie <[email protected]> Acked-by: Michal Hocko <[email protected]> Tested-by: Masato Suzuki <[email protected]> Reviewed-by: Damien Le Moal <[email protected]> Reviewed-by: Bart Van Assche <[email protected]> Reviewed-by: Dave Chinner <[email protected]> Reviewed-by: Darrick J. Wong <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Christian Brauner <[email protected]>
1 parent 913292c commit 8d19f1c

File tree

3 files changed

+30
-0
lines changed

3 files changed

+30
-0
lines changed

include/uapi/linux/capability.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ struct vfs_ns_cap_data {
301301
/* Allow more than 64hz interrupts from the real-time clock */
302302
/* Override max number of consoles on console allocation */
303303
/* Override max number of keymaps */
304+
/* Control memory reclaim behavior */
304305

305306
#define CAP_SYS_RESOURCE 24
306307

include/uapi/linux/prctl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,4 +234,8 @@ struct prctl_mm_map {
234234
#define PR_GET_TAGGED_ADDR_CTRL 56
235235
# define PR_TAGGED_ADDR_ENABLE (1UL << 0)
236236

237+
/* Control reclaim behavior when allocating memory */
238+
#define PR_SET_IO_FLUSHER 57
239+
#define PR_GET_IO_FLUSHER 58
240+
237241
#endif /* _LINUX_PRCTL_H */

kernel/sys.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2261,6 +2261,8 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
22612261
return -EINVAL;
22622262
}
22632263

2264+
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
2265+
22642266
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
22652267
unsigned long, arg4, unsigned long, arg5)
22662268
{
@@ -2488,6 +2490,29 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
24882490
return -EINVAL;
24892491
error = GET_TAGGED_ADDR_CTRL();
24902492
break;
2493+
case PR_SET_IO_FLUSHER:
2494+
if (!capable(CAP_SYS_RESOURCE))
2495+
return -EPERM;
2496+
2497+
if (arg3 || arg4 || arg5)
2498+
return -EINVAL;
2499+
2500+
if (arg2 == 1)
2501+
current->flags |= PR_IO_FLUSHER;
2502+
else if (!arg2)
2503+
current->flags &= ~PR_IO_FLUSHER;
2504+
else
2505+
return -EINVAL;
2506+
break;
2507+
case PR_GET_IO_FLUSHER:
2508+
if (!capable(CAP_SYS_RESOURCE))
2509+
return -EPERM;
2510+
2511+
if (arg2 || arg3 || arg4 || arg5)
2512+
return -EINVAL;
2513+
2514+
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
2515+
break;
24912516
default:
24922517
error = -EINVAL;
24932518
break;

0 commit comments

Comments
 (0)