Skip to content

Commit b3e5838

Browse files
committed
clone: add CLONE_PIDFD
This patchset makes it possible to retrieve pid file descriptors at process creation time by introducing the new flag CLONE_PIDFD to the clone() system call. Linus originally suggested implementing this as a new flag to clone() instead of making it a separate system call. As spotted by Linus, there is exactly one flag bit left for clone(). CLONE_PIDFD creates file descriptors based on the anonymous inode implementation in the kernel that will also be used to implement the new mount API. They serve as a simple opaque handle on pids. Logically, this makes it possible to interpret a pidfd differently, narrowing or widening the scope of various operations (e.g. signal sending). Thus, a pidfd can refer not just to a tgid, but also to a tid, or in theory - given appropriate flag arguments in relevant syscalls - to a process group or session. A pidfd does not represent a privilege. This does not imply it cannot ever be that way but for now this is not the case. A pidfd comes with additional information in fdinfo if the kernel supports procfs. The fdinfo file contains the pid of the process in the caller's pid namespace in the same format as the procfs status file, i.e. "Pid:\t%d". As suggested by Oleg, with CLONE_PIDFD the pidfd is returned in the parent_tidptr argument of clone. This has the advantage that we can give back the associated pid and the pidfd at the same time. To remove worries about missing metadata access this patchset comes with a sample program that illustrates how a combination of CLONE_PIDFD and pidfd_send_signal() can be used to gain race-free access to process metadata through /proc/<pid>. The sample program can easily be translated into a helper that would be suitable for inclusion in libc so that users don't have to worry about writing it themselves.
Suggested-by: Linus Torvalds <[email protected]> Signed-off-by: Christian Brauner <[email protected]> Co-developed-by: Jann Horn <[email protected]> Signed-off-by: Jann Horn <[email protected]> Reviewed-by: Oleg Nesterov <[email protected]> Cc: Arnd Bergmann <[email protected]> Cc: "Eric W. Biederman" <[email protected]> Cc: Kees Cook <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: David Howells <[email protected]> Cc: "Michael Kerrisk (man-pages)" <[email protected]> Cc: Andy Lutomirsky <[email protected]> Cc: Andrew Morton <[email protected]> Cc: Aleksa Sarai <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Al Viro <[email protected]>
1 parent 5dd50aa commit b3e5838

File tree

3 files changed

+106
-4
lines changed

3 files changed

+106
-4
lines changed

include/linux/pid.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ struct pid
6666

6767
extern struct pid init_struct_pid;
6868

69+
extern const struct file_operations pidfd_fops;
70+
6971
static inline struct pid *get_pid(struct pid *pid)
7072
{
7173
if (pid)

include/uapi/linux/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
1111
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
1212
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
13+
#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
1314
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
1415
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
1516
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */

kernel/fork.c

Lines changed: 103 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
1212
*/
1313

14+
#include <linux/anon_inodes.h>
1415
#include <linux/slab.h>
1516
#include <linux/sched/autogroup.h>
1617
#include <linux/sched/mm.h>
@@ -21,6 +22,7 @@
2122
#include <linux/sched/task.h>
2223
#include <linux/sched/task_stack.h>
2324
#include <linux/sched/cputime.h>
25+
#include <linux/seq_file.h>
2426
#include <linux/rtmutex.h>
2527
#include <linux/init.h>
2628
#include <linux/unistd.h>
@@ -1662,6 +1664,58 @@ static inline void rcu_copy_process(struct task_struct *p)
16621664
#endif /* #ifdef CONFIG_TASKS_RCU */
16631665
}
16641666

1667+
static int pidfd_release(struct inode *inode, struct file *file)
1668+
{
1669+
struct pid *pid = file->private_data;
1670+
1671+
file->private_data = NULL;
1672+
put_pid(pid);
1673+
return 0;
1674+
}
1675+
1676+
#ifdef CONFIG_PROC_FS
1677+
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1678+
{
1679+
struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
1680+
struct pid *pid = f->private_data;
1681+
1682+
seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
1683+
seq_putc(m, '\n');
1684+
}
1685+
#endif
1686+
1687+
const struct file_operations pidfd_fops = {
1688+
.release = pidfd_release,
1689+
#ifdef CONFIG_PROC_FS
1690+
.show_fdinfo = pidfd_show_fdinfo,
1691+
#endif
1692+
};
1693+
1694+
/**
1695+
* pidfd_create() - Create a new pid file descriptor.
1696+
*
1697+
* @pid: struct pid that the pidfd will reference
1698+
*
1699+
* This creates a new pid file descriptor with the O_CLOEXEC flag set.
1700+
*
1701+
* Note, that this function can only be called after the fd table has
1702+
* been unshared to avoid leaking the pidfd to the new process.
1703+
*
1704+
* Return: On success, a cloexec pidfd is returned.
1705+
* On error, a negative errno number will be returned.
1706+
*/
1707+
static int pidfd_create(struct pid *pid)
1708+
{
1709+
int fd;
1710+
1711+
fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
1712+
O_RDWR | O_CLOEXEC);
1713+
if (fd < 0)
1714+
put_pid(pid);
1715+
1716+
return fd;
1717+
}
1718+
16651719
/*
16661720
* This creates a new process as a copy of the old one,
16671721
* but does not actually start it yet.
@@ -1674,13 +1728,14 @@ static __latent_entropy struct task_struct *copy_process(
16741728
unsigned long clone_flags,
16751729
unsigned long stack_start,
16761730
unsigned long stack_size,
1731+
int __user *parent_tidptr,
16771732
int __user *child_tidptr,
16781733
struct pid *pid,
16791734
int trace,
16801735
unsigned long tls,
16811736
int node)
16821737
{
1683-
int retval;
1738+
int pidfd = -1, retval;
16841739
struct task_struct *p;
16851740
struct multiprocess_signals delayed;
16861741

@@ -1730,6 +1785,31 @@ static __latent_entropy struct task_struct *copy_process(
17301785
return ERR_PTR(-EINVAL);
17311786
}
17321787

1788+
if (clone_flags & CLONE_PIDFD) {
1789+
int reserved;
1790+
1791+
/*
1792+
* - CLONE_PARENT_SETTID is useless for pidfds and also
1793+
* parent_tidptr is used to return pidfds.
1794+
* - CLONE_DETACHED is blocked so that we can potentially
1795+
* reuse it later for CLONE_PIDFD.
1796+
* - CLONE_THREAD is blocked until someone really needs it.
1797+
*/
1798+
if (clone_flags &
1799+
(CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1800+
return ERR_PTR(-EINVAL);
1801+
1802+
/*
1803+
* Verify that parent_tidptr is sane so we can potentially
1804+
* reuse it later.
1805+
*/
1806+
if (get_user(reserved, parent_tidptr))
1807+
return ERR_PTR(-EFAULT);
1808+
1809+
if (reserved != 0)
1810+
return ERR_PTR(-EINVAL);
1811+
}
1812+
17331813
/*
17341814
* Force any signals received before this point to be delivered
17351815
* before the fork happens. Collect up signals sent to multiple
@@ -1936,6 +2016,22 @@ static __latent_entropy struct task_struct *copy_process(
19362016
}
19372017
}
19382018

2019+
/*
2020+
* This has to happen after we've potentially unshared the file
2021+
* descriptor table (so that the pidfd doesn't leak into the child
2022+
* if the fd table isn't shared).
2023+
*/
2024+
if (clone_flags & CLONE_PIDFD) {
2025+
retval = pidfd_create(pid);
2026+
if (retval < 0)
2027+
goto bad_fork_free_pid;
2028+
2029+
pidfd = retval;
2030+
retval = put_user(pidfd, parent_tidptr);
2031+
if (retval)
2032+
goto bad_fork_put_pidfd;
2033+
}
2034+
19392035
#ifdef CONFIG_BLOCK
19402036
p->plug = NULL;
19412037
#endif
@@ -1996,7 +2092,7 @@ static __latent_entropy struct task_struct *copy_process(
19962092
*/
19972093
retval = cgroup_can_fork(p);
19982094
if (retval)
1999-
goto bad_fork_free_pid;
2095+
goto bad_fork_put_pidfd;
20002096

20012097
/*
20022098
* From this point on we must avoid any synchronous user-space
@@ -2111,6 +2207,9 @@ static __latent_entropy struct task_struct *copy_process(
21112207
spin_unlock(&current->sighand->siglock);
21122208
write_unlock_irq(&tasklist_lock);
21132209
cgroup_cancel_fork(p);
2210+
bad_fork_put_pidfd:
2211+
if (clone_flags & CLONE_PIDFD)
2212+
ksys_close(pidfd);
21142213
bad_fork_free_pid:
21152214
cgroup_threadgroup_change_end(current);
21162215
if (pid != &init_struct_pid)
@@ -2176,7 +2275,7 @@ static inline void init_idle_pids(struct task_struct *idle)
21762275
struct task_struct *fork_idle(int cpu)
21772276
{
21782277
struct task_struct *task;
2179-
task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
2278+
task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
21802279
cpu_to_node(cpu));
21812280
if (!IS_ERR(task)) {
21822281
init_idle_pids(task);
@@ -2223,7 +2322,7 @@ long _do_fork(unsigned long clone_flags,
22232322
trace = 0;
22242323
}
22252324

2226-
p = copy_process(clone_flags, stack_start, stack_size,
2325+
p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
22272326
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
22282327
add_latent_entropy();
22292328

0 commit comments

Comments
 (0)