Skip to content

Commit 21d1c5e

Browse files
legionus authored and ebiederm committed
Reimplement RLIMIT_NPROC on top of ucounts
The rlimit counter is tied to the uid in the user_namespace. This allows rlimit values to be specified in a userns even if they are already globally exceeded by the user. However, the values of the previous (ancestor) user_namespaces cannot be exceeded.

To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never forks, the service wants to set RLIMIT_NPROC=1.

service-A
 \- program (uid=1000, container1, rlimit_nproc=1)
 \- program (uid=1000, container2, rlimit_nproc=1)

The service-A sets RLIMIT_NPROC=1 and runs the program in container1. When the service-A tries to run a program with RLIMIT_NPROC=1 in container2, it fails since user X already has one running process.

We cannot use the existing inc_ucounts / dec_ucounts because they do not allow us to exceed the maximum for the counter. Some rlimits can be exceeded by root or if the user has the appropriate capability.

Changelog

v11:
* Change inc_rlimit_ucounts(), which now returns the top value of ucounts.
* Drop inc_rlimit_ucounts_and_test() because the return code of inc_rlimit_ucounts() can be checked.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
1 parent b6c3365 commit 21d1c5e

File tree

11 files changed

+73
-15
lines changed

11 files changed

+73
-15
lines changed

fs/exec.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1878,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename,
18781878
* whether NPROC limit is still exceeded.
18791879
*/
18801880
if ((current->flags & PF_NPROC_EXCEEDED) &&
1881-
atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
1881+
is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
18821882
retval = -EAGAIN;
18831883
goto out_ret;
18841884
}

include/linux/cred.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
372372

373373
#define task_uid(task) (task_cred_xxx((task), uid))
374374
#define task_euid(task) (task_cred_xxx((task), euid))
375+
#define task_ucounts(task) (task_cred_xxx((task), ucounts))
375376

376377
#define current_cred_xxx(xxx) \
377378
({ \
@@ -388,6 +389,7 @@ static inline void put_cred(const struct cred *_cred)
388389
#define current_fsgid() (current_cred_xxx(fsgid))
389390
#define current_cap() (current_cred_xxx(cap_effective))
390391
#define current_user() (current_cred_xxx(user))
392+
#define current_ucounts() (current_cred_xxx(ucounts))
391393

392394
extern struct user_namespace init_user_ns;
393395
#ifdef CONFIG_USER_NS

include/linux/sched/user.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
*/
1313
struct user_struct {
1414
refcount_t __count; /* reference count */
15-
atomic_t processes; /* How many processes does this user have? */
1615
atomic_t sigpending; /* How many pending signals does this user have? */
1716
#ifdef CONFIG_FANOTIFY
1817
atomic_t fanotify_listeners;

include/linux/user_namespace.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,12 @@ enum ucount_type {
5050
UCOUNT_INOTIFY_INSTANCES,
5151
UCOUNT_INOTIFY_WATCHES,
5252
#endif
53+
UCOUNT_RLIMIT_NPROC,
5354
UCOUNT_COUNTS,
5455
};
5556

57+
#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC
58+
5659
struct user_namespace {
5760
struct uid_gid_map uid_map;
5861
struct uid_gid_map gid_map;
@@ -110,6 +113,15 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
110113
struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
111114
void put_ucounts(struct ucounts *ucounts);
112115

116+
static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type)
117+
{
118+
return atomic_long_read(&ucounts->ucount[type]);
119+
}
120+
121+
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
122+
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
123+
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
124+
113125
#ifdef CONFIG_USER_NS
114126

115127
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)

kernel/cred.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
360360
kdebug("share_creds(%p{%d,%d})",
361361
p->cred, atomic_read(&p->cred->usage),
362362
read_cred_subscribers(p->cred));
363-
atomic_inc(&p->cred->user->processes);
363+
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
364364
return 0;
365365
}
366366

@@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
395395
}
396396
#endif
397397

398-
atomic_inc(&new->user->processes);
399398
p->cred = p->real_cred = get_cred(new);
399+
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
400400
alter_cred_subscribers(new, 2);
401401
validate_creds(new);
402402
return 0;
@@ -496,12 +496,12 @@ int commit_creds(struct cred *new)
496496
* in set_user().
497497
*/
498498
alter_cred_subscribers(new, 2);
499-
if (new->user != old->user)
500-
atomic_inc(&new->user->processes);
499+
if (new->user != old->user || new->user_ns != old->user_ns)
500+
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
501501
rcu_assign_pointer(task->real_cred, new);
502502
rcu_assign_pointer(task->cred, new);
503503
if (new->user != old->user)
504-
atomic_dec(&old->user->processes);
504+
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
505505
alter_cred_subscribers(old, -2);
506506

507507
/* send notifications */

kernel/exit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ void release_task(struct task_struct *p)
188188
/* don't need to get the RCU readlock here - the process is dead and
189189
* can't be modifying its own credentials. But shut RCU-lockdep up */
190190
rcu_read_lock();
191-
atomic_dec(&__task_cred(p)->user->processes);
191+
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
192192
rcu_read_unlock();
193193

194194
cgroup_release(p);

kernel/fork.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -819,9 +819,11 @@ void __init fork_init(void)
819819
init_task.signal->rlim[RLIMIT_SIGPENDING] =
820820
init_task.signal->rlim[RLIMIT_NPROC];
821821

822-
for (i = 0; i < UCOUNT_COUNTS; i++)
822+
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
823823
init_user_ns.ucount_max[i] = max_threads/2;
824824

825+
init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
826+
825827
#ifdef CONFIG_VMAP_STACK
826828
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
827829
NULL, free_vm_stack_cache);
@@ -1978,8 +1980,7 @@ static __latent_entropy struct task_struct *copy_process(
19781980
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
19791981
#endif
19801982
retval = -EAGAIN;
1981-
if (atomic_read(&p->real_cred->user->processes) >=
1982-
task_rlimit(p, RLIMIT_NPROC)) {
1983+
if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
19831984
if (p->real_cred->user != INIT_USER &&
19841985
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
19851986
goto bad_fork_free;
@@ -2382,7 +2383,7 @@ static __latent_entropy struct task_struct *copy_process(
23822383
#endif
23832384
delayacct_tsk_free(p);
23842385
bad_fork_cleanup_count:
2385-
atomic_dec(&p->cred->user->processes);
2386+
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
23862387
exit_creds(p);
23872388
bad_fork_free:
23882389
p->state = TASK_DEAD;

kernel/sys.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ static int set_user(struct cred *new)
473473
* for programs doing set*uid()+execve() by harmlessly deferring the
474474
* failure to the execve() stage.
475475
*/
476-
if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
476+
if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
477477
new_user != INIT_USER)
478478
current->flags |= PF_NPROC_EXCEEDED;
479479
else

kernel/ucount.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
8080
UCOUNT_ENTRY("max_inotify_instances"),
8181
UCOUNT_ENTRY("max_inotify_watches"),
8282
#endif
83+
{ },
8384
{ }
8485
};
8586
#endif /* CONFIG_SYSCTL */
@@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
240241
put_ucounts(ucounts);
241242
}
242243

244+
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
245+
{
246+
struct ucounts *iter;
247+
long ret = 0;
248+
249+
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
250+
long max = READ_ONCE(iter->ns->ucount_max[type]);
251+
long new = atomic_long_add_return(v, &iter->ucount[type]);
252+
if (new < 0 || new > max)
253+
ret = LONG_MAX;
254+
else if (iter == ucounts)
255+
ret = new;
256+
}
257+
return ret;
258+
}
259+
260+
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
261+
{
262+
struct ucounts *iter;
263+
long new;
264+
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
265+
long dec = atomic_long_add_return(-v, &iter->ucount[type]);
266+
WARN_ON_ONCE(dec < 0);
267+
if (iter == ucounts)
268+
new = dec;
269+
}
270+
return (new == 0);
271+
}
272+
273+
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
274+
{
275+
struct ucounts *iter;
276+
if (get_ucounts_value(ucounts, type) > max)
277+
return true;
278+
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
279+
max = READ_ONCE(iter->ns->ucount_max[type]);
280+
if (get_ucounts_value(iter, type) > max)
281+
return true;
282+
}
283+
return false;
284+
}
285+
243286
static __init int user_namespace_sysctl_init(void)
244287
{
245288
#ifdef CONFIG_SYSCTL
@@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void)
256299
BUG_ON(!setup_userns_sysctls(&init_user_ns));
257300
#endif
258301
hlist_add_ucounts(&init_ucounts);
302+
inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
259303
return 0;
260304
}
261305
subsys_initcall(user_namespace_sysctl_init);

kernel/user.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
9898
/* root_user.__count is 1, for init task cred */
9999
struct user_struct root_user = {
100100
.__count = REFCOUNT_INIT(1),
101-
.processes = ATOMIC_INIT(1),
102101
.sigpending = ATOMIC_INIT(0),
103102
.locked_shm = 0,
104103
.uid = GLOBAL_ROOT_UID,

kernel/user_namespace.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,10 @@ int create_user_ns(struct cred *new)
119119
ns->owner = owner;
120120
ns->group = group;
121121
INIT_WORK(&ns->work, free_user_ns);
122-
for (i = 0; i < UCOUNT_COUNTS; i++) {
122+
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
123123
ns->ucount_max[i] = INT_MAX;
124124
}
125+
ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
125126
ns->ucounts = ucounts;
126127

127128
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */

0 commit comments

Comments
 (0)