Skip to content

Commit 3eb07c8

Browse files
Sukadev Bhattiprolu authored and Linus Torvalds committed
pid namespaces: destroy pid namespace on init's death
Terminate all processes in a namespace when the reaper of the namespace is exiting. We do this by walking the pidmap of the namespace and sending SIGKILL to all processes.

Signed-off-by: Sukadev Bhattiprolu <[email protected]>
Acked-by: Pavel Emelyanov <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Sukadev Bhattiprolu <[email protected]>
Cc: Paul Menage <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 0fbc26a commit 3eb07c8

File tree

3 files changed

+65
-1
lines changed

3 files changed

+65
-1
lines changed

include/linux/pid.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
125125

126126
extern struct pid *alloc_pid(struct pid_namespace *ns);
127127
extern void FASTCALL(free_pid(struct pid *pid));
128+
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
128129

129130
/*
130131
* the helpers to get the pid's id seen from different namespaces

kernel/exit.c

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -879,7 +879,32 @@ static inline void exit_child_reaper(struct task_struct *tsk)
879879
if (likely(tsk->group_leader != task_child_reaper(tsk)))
880880
return;
881881

882-
panic("Attempted to kill init!");
882+
if (tsk->nsproxy->pid_ns == &init_pid_ns)
883+
panic("Attempted to kill init!");
884+
885+
/*
886+
* @tsk is the last thread in the 'cgroup-init' and is exiting.
887+
* Terminate all remaining processes in the namespace and reap them
888+
* before exiting @tsk.
889+
*
890+
* Note that @tsk (last thread of cgroup-init) may not necessarily
891+
* be the child-reaper (i.e. main thread of cgroup-init) of the
892+
* namespace, i.e. the child_reaper may have already exited.
893+
*
894+
* Even after a child_reaper exits, we let it inherit orphaned children,
895+
* because pid_ns->child_reaper remains valid as long as there is
896+
* at least one living sub-thread in the cgroup-init.
897+
898+
* This living sub-thread of the cgroup-init will be notified when
899+
* a child inherited by the 'child-reaper' exits (do_notify_parent()
900+
* uses __group_send_sig_info()). Further, when reaping child processes,
901+
* do_wait() iterates over children of all living sub threads.
902+
903+
* i.e. even though the 'child_reaper' thread is listed as the parent of the
904+
* orphaned children, any living sub-thread in the cgroup-init can
905+
* perform the role of the child_reaper.
906+
*/
907+
zap_pid_ns_processes(tsk->nsproxy->pid_ns);
883908
}
884909

885910
fastcall NORET_TYPE void do_exit(long code)

kernel/pid.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include <linux/hash.h>
3535
#include <linux/pid_namespace.h>
3636
#include <linux/init_task.h>
37+
#include <linux/syscalls.h>
3738

3839
#define pid_hashfn(nr, ns) \
3940
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -567,6 +568,43 @@ void free_pid_ns(struct kref *kref)
567568
put_pid_ns(parent);
568569
}
569570

571+
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
572+
{
573+
int nr;
574+
int rc;
575+
576+
/*
577+
* The last thread in the cgroup-init thread group is terminating.
578+
* Find remaining pid_ts in the namespace, signal and wait for them
579+
* to exit.
580+
*
581+
* Note: This signals each thread in the namespace - even those that
582+
* belong to the same thread group. To avoid this, we would have
583+
* to walk the entire tasklist looking for processes in this
584+
* namespace, but that could be unnecessarily expensive if the
585+
* pid namespace has just a few processes. Or we need to
586+
* maintain a tasklist for each pid namespace.
587+
*
588+
*/
589+
read_lock(&tasklist_lock);
590+
nr = next_pidmap(pid_ns, 1);
591+
while (nr > 0) {
592+
kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
593+
nr = next_pidmap(pid_ns, nr);
594+
}
595+
read_unlock(&tasklist_lock);
596+
597+
do {
598+
clear_thread_flag(TIF_SIGPENDING);
599+
rc = sys_wait4(-1, NULL, __WALL, NULL);
600+
} while (rc != -ECHILD);
601+
602+
603+
/* Child reaper for the pid namespace is going away */
604+
pid_ns->child_reaper = NULL;
605+
return;
606+
}
607+
570608
/*
571609
* The pid hash table is scaled according to the amount of memory in the
572610
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or

0 commit comments

Comments
 (0)