Skip to content

Commit 30e49c2

Browse files
xemul authored and Linus Torvalds committed
pid namespaces: allow cloning of new namespace
When clone() is invoked with CLONE_NEWPID, create a new pid namespace and then create a new struct pid for the new process. Allocate pid_t's for the new process in the new pid namespace and all ancestor pid namespaces. Make the newly cloned process the session and process group leader. Since the active pid namespace is special and expected to be the first entry in pid->upid_list, preserve the order of pid namespaces. The size of 'struct pid' is dependent on the number of pid namespaces the process exists in, so we use multiple pid caches. Only one pid cache is created during system startup, and it is used by processes that exist only in init_pid_ns. When a process clones its pid namespace, we create additional pid caches as necessary and use the pid cache to allocate 'struct pids' for that depth. Note that with this patch the newly created namespace won't work, since the rest of the kernel still uses global pids, but this is to be fixed soon. Init pid namespace still works. [[email protected]: merge fix] Signed-off-by: Pavel Emelyanov <[email protected]> Signed-off-by: Sukadev Bhattiprolu <[email protected]> Cc: Paul Menage <[email protected]> Cc: "Eric W. Biederman" <[email protected]> Cc: Oleg Nesterov <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent b461cc0 commit 30e49c2

File tree

4 files changed

+113
-22
lines changed

4 files changed

+113
-22
lines changed

include/linux/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
2626
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
2727
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
28+
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
2829
#define CLONE_NEWNET 0x40000000 /* New network namespace */
2930

3031
/*

kernel/fork.c

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -973,7 +973,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
973973
unsigned long stack_start,
974974
struct pt_regs *regs,
975975
unsigned long stack_size,
976-
int __user *parent_tidptr,
977976
int __user *child_tidptr,
978977
struct pid *pid)
979978
{
@@ -1043,11 +1042,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
10431042
p->did_exec = 0;
10441043
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
10451044
copy_flags(clone_flags, p);
1046-
retval = -EFAULT;
1047-
if (clone_flags & CLONE_PARENT_SETTID)
1048-
if (put_user(p->pid, parent_tidptr))
1049-
goto bad_fork_cleanup_delays_binfmt;
1050-
10511045
INIT_LIST_HEAD(&p->children);
10521046
INIT_LIST_HEAD(&p->sibling);
10531047
p->vfork_done = NULL;
@@ -1289,11 +1283,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
12891283
__ptrace_link(p, current->parent);
12901284

12911285
if (thread_group_leader(p)) {
1292-
p->signal->tty = current->signal->tty;
1293-
p->signal->pgrp = task_pgrp_nr(current);
1294-
set_task_session(p, task_session_nr(current));
1295-
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1296-
attach_pid(p, PIDTYPE_SID, task_session(current));
1286+
if (clone_flags & CLONE_NEWPID) {
1287+
p->nsproxy->pid_ns->child_reaper = p;
1288+
p->signal->tty = NULL;
1289+
p->signal->pgrp = p->pid;
1290+
set_task_session(p, p->pid);
1291+
attach_pid(p, PIDTYPE_PGID, pid);
1292+
attach_pid(p, PIDTYPE_SID, pid);
1293+
} else {
1294+
p->signal->tty = current->signal->tty;
1295+
p->signal->pgrp = task_pgrp_nr(current);
1296+
set_task_session(p, task_session_nr(current));
1297+
attach_pid(p, PIDTYPE_PGID,
1298+
task_pgrp(current));
1299+
attach_pid(p, PIDTYPE_SID,
1300+
task_session(current));
1301+
}
12971302

12981303
list_add_tail_rcu(&p->tasks, &init_task.tasks);
12991304
__get_cpu_var(process_counts)++;
@@ -1339,7 +1344,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
13391344
bad_fork_cleanup_cgroup:
13401345
#endif
13411346
cgroup_exit(p, cgroup_callbacks_done);
1342-
bad_fork_cleanup_delays_binfmt:
13431347
delayacct_tsk_free(p);
13441348
if (p->binfmt)
13451349
module_put(p->binfmt->module);
@@ -1366,7 +1370,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
13661370
struct task_struct *task;
13671371
struct pt_regs regs;
13681372

1369-
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL,
1373+
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
13701374
&init_struct_pid);
13711375
if (!IS_ERR(task))
13721376
init_idle(task, cpu);
@@ -1414,15 +1418,24 @@ long do_fork(unsigned long clone_flags,
14141418
}
14151419

14161420
p = copy_process(clone_flags, stack_start, regs, stack_size,
1417-
parent_tidptr, child_tidptr, NULL);
1421+
child_tidptr, NULL);
14181422
/*
14191423
* Do this prior waking up the new thread - the thread pointer
14201424
* might get invalid after that point, if the thread exits quickly.
14211425
*/
14221426
if (!IS_ERR(p)) {
14231427
struct completion vfork;
14241428

1425-
nr = pid_nr(task_pid(p));
1429+
/*
1430+
* this is enough to call pid_nr_ns here, but this if
1431+
* improves optimisation of regular fork()
1432+
*/
1433+
nr = (clone_flags & CLONE_NEWPID) ?
1434+
task_pid_nr_ns(p, current->nsproxy->pid_ns) :
1435+
task_pid_vnr(p);
1436+
1437+
if (clone_flags & CLONE_PARENT_SETTID)
1438+
put_user(nr, parent_tidptr);
14261439

14271440
if (clone_flags & CLONE_VFORK) {
14281441
p->vfork_done = &vfork;

kernel/nsproxy.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
129129

130130
get_nsproxy(old_ns);
131131

132-
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET)))
132+
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133+
CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
133134
return 0;
134135

135136
if (!capable(CAP_SYS_ADMIN)) {

kernel/pid.c

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
* allocation scenario when all but one out of 1 million PIDs possible are
1919
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
2020
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
21+
*
22+
* Pid namespaces:
23+
* (C) 2007 Pavel Emelyanov <[email protected]>, OpenVZ, SWsoft Inc.
24+
* (C) 2007 Sukadev Bhattiprolu <[email protected]>, IBM
25+
* Many thanks to Oleg Nesterov for comments and help
26+
*
2127
*/
2228

2329
#include <linux/mm.h>
@@ -456,8 +462,8 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
456462

457463
snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
458464
cachep = kmem_cache_create(pcache->name,
459-
/* FIXME add numerical ids here */
460-
sizeof(struct pid), 0, SLAB_HWCACHE_ALIGN, NULL);
465+
sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
466+
0, SLAB_HWCACHE_ALIGN, NULL);
461467
if (cachep == NULL)
462468
goto err_cachep;
463469

@@ -475,19 +481,89 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
475481
return NULL;
476482
}
477483

484+
static struct pid_namespace *create_pid_namespace(int level)
485+
{
486+
struct pid_namespace *ns;
487+
int i;
488+
489+
ns = kmalloc(sizeof(struct pid_namespace), GFP_KERNEL);
490+
if (ns == NULL)
491+
goto out;
492+
493+
ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
494+
if (!ns->pidmap[0].page)
495+
goto out_free;
496+
497+
ns->pid_cachep = create_pid_cachep(level + 1);
498+
if (ns->pid_cachep == NULL)
499+
goto out_free_map;
500+
501+
kref_init(&ns->kref);
502+
ns->last_pid = 0;
503+
ns->child_reaper = NULL;
504+
ns->level = level;
505+
506+
set_bit(0, ns->pidmap[0].page);
507+
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
508+
509+
for (i = 1; i < PIDMAP_ENTRIES; i++) {
510+
ns->pidmap[i].page = 0;
511+
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
512+
}
513+
514+
return ns;
515+
516+
out_free_map:
517+
kfree(ns->pidmap[0].page);
518+
out_free:
519+
kfree(ns);
520+
out:
521+
return ERR_PTR(-ENOMEM);
522+
}
523+
524+
static void destroy_pid_namespace(struct pid_namespace *ns)
525+
{
526+
int i;
527+
528+
for (i = 0; i < PIDMAP_ENTRIES; i++)
529+
kfree(ns->pidmap[i].page);
530+
kfree(ns);
531+
}
532+
478533
struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
479534
{
535+
struct pid_namespace *new_ns;
536+
480537
BUG_ON(!old_ns);
481-
get_pid_ns(old_ns);
482-
return old_ns;
538+
new_ns = get_pid_ns(old_ns);
539+
if (!(flags & CLONE_NEWPID))
540+
goto out;
541+
542+
new_ns = ERR_PTR(-EINVAL);
543+
if (flags & CLONE_THREAD)
544+
goto out_put;
545+
546+
new_ns = create_pid_namespace(old_ns->level + 1);
547+
if (!IS_ERR(new_ns))
548+
new_ns->parent = get_pid_ns(old_ns);
549+
550+
out_put:
551+
put_pid_ns(old_ns);
552+
out:
553+
return new_ns;
483554
}
484555

485556
void free_pid_ns(struct kref *kref)
486557
{
487-
struct pid_namespace *ns;
558+
struct pid_namespace *ns, *parent;
488559

489560
ns = container_of(kref, struct pid_namespace, kref);
490-
kfree(ns);
561+
562+
parent = ns->parent;
563+
destroy_pid_namespace(ns);
564+
565+
if (parent != NULL)
566+
put_pid_ns(parent);
491567
}
492568

493569
/*

0 commit comments

Comments
 (0)