Skip to content

Commit 76f969e

Browse files
rgushchin and htejun
authored and committed
cgroup: cgroup v2 freezer
Cgroup v1 implements the freezer controller, which provides the ability to stop the workload in a cgroup and temporarily free up some resources (cpu, io, network bandwidth and, potentially, memory) for other tasks. Cgroup v2 lacks this functionality. This patch implements a freezer for cgroup v2. The cgroup v2 freezer tries to put tasks into a state similar to jobctl stop. This means that tasks can be killed, ptraced (using PTRACE_SEIZE*), and interrupted. It is possible to attach to a frozen task, get some information (e.g. read registers) and detach. It is also possible to migrate a frozen task to another cgroup. This distinguishes the cgroup v2 freezer from the cgroup v1 freezer, which mostly tried to imitate the system-wide freezer. While uninterruptible sleep is fine when all tasks are going to be frozen (the hibernation case), it is not an acceptable state for only a subset of the system. The cgroup v2 freezer does not support freezing kthreads. If a non-root cgroup contains a kthread, the cgroup can still be frozen, but the kthread will remain running, the cgroup will be shown as non-frozen, and the notification will not be delivered. * PTRACE_ATTACH does not work because non-fatal signal delivery is blocked in the frozen state. There are also some interface differences between the cgroup v1 and cgroup v2 freezers, which are required to conform to the cgroup v2 interface design principles: 1) There is no separate controller that has to be turned on: the functionality is always available and is represented by the cgroup.freeze and cgroup.events cgroup control files. 2) The desired state is defined by the cgroup.freeze control file. Any hierarchical configuration is allowed. 3) The interface is asynchronous. The actual state is available through the cgroup.events control file (the "frozen" field). There are no dedicated transitional states. 4) It is allowed to make any changes to the cgroup hierarchy (create new cgroups, remove old cgroups, move tasks between cgroups) regardless of whether some cgroups are frozen. 
Signed-off-by: Roman Gushchin <[email protected]> Signed-off-by: Tejun Heo <[email protected]> No-objection-from-me-by: Oleg Nesterov <[email protected]> Cc: [email protected]
1 parent 4dcabec commit 76f969e

File tree

9 files changed

+566
-10
lines changed

9 files changed

+566
-10
lines changed

include/linux/cgroup-defs.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ enum {
6565
* specified at mount time and thus is implemented here.
6666
*/
6767
CGRP_CPUSET_CLONE_CHILDREN,
68+
69+
/* Control group has to be frozen. */
70+
CGRP_FREEZE,
71+
72+
/* Cgroup is frozen. */
73+
CGRP_FROZEN,
6874
};
6975

7076
/* cgroup_root->flags */
@@ -317,6 +323,25 @@ struct cgroup_rstat_cpu {
317323
struct cgroup *updated_next; /* NULL iff not on the list */
318324
};
319325

326+
/*
 * Freezer bookkeeping embedded in every struct cgroup.
 */
struct cgroup_freezer_state {
	/*
	 * Freeze request for this cgroup and everything below it; this is
	 * the value reported by the cgroup.freeze control file.
	 */
	bool freeze;

	/*
	 * Effective freeze state: non-zero when the cgroup really has to
	 * be frozen. A newly created cgroup copies this from its parent.
	 */
	int e_freeze;

	/* Every field from here on is protected by css_set_lock. */

	/* How many descendant cgroups are currently frozen. */
	int nr_frozen_descendants;

	/*
	 * How many tasks are accounted as frozen. Besides tasks that are
	 * actually frozen this also covers SIGSTOPped and PTRACEd ones.
	 */
	int nr_frozen_tasks;
};
344+
320345
struct cgroup {
321346
/* self css with NULL ->ss, points back to this cgroup */
322347
struct cgroup_subsys_state self;
@@ -453,6 +478,9 @@ struct cgroup {
453478
/* If there is block congestion on this cgroup. */
454479
atomic_t congestion_count;
455480

481+
/* Used to store internal freezer state */
482+
struct cgroup_freezer_state freezer;
483+
456484
/* ids of the ancestors at each level including self */
457485
int ancestor_ids[];
458486
};

include/linux/cgroup.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,4 +881,47 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns)
881881
free_cgroup_ns(ns);
882882
}
883883

884+
#ifdef CONFIG_CGROUPS
885+
886+
void cgroup_enter_frozen(void);
887+
void cgroup_leave_frozen(bool always_leave);
888+
void cgroup_update_frozen(struct cgroup *cgrp);
889+
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
890+
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
891+
struct cgroup *dst);
892+
void cgroup_freezer_frozen_exit(struct task_struct *task);
893+
static inline bool cgroup_task_freeze(struct task_struct *task)
894+
{
895+
bool ret;
896+
897+
if (task->flags & PF_KTHREAD)
898+
return false;
899+
900+
rcu_read_lock();
901+
ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
902+
rcu_read_unlock();
903+
904+
return ret;
905+
}
906+
907+
static inline bool cgroup_task_frozen(struct task_struct *task)
908+
{
909+
return task->frozen;
910+
}
911+
912+
#else /* !CONFIG_CGROUPS */
913+
914+
/*
 * Stand-ins for builds without CONFIG_CGROUPS: entering and leaving the
 * frozen state do nothing, and no task is ever reported as having to be
 * frozen or as being frozen.
 */
static inline void cgroup_enter_frozen(void)
{
}

static inline void cgroup_leave_frozen(bool always_leave)
{
}

static inline bool cgroup_task_freeze(struct task_struct *task)
{
	return false;
}

static inline bool cgroup_task_frozen(struct task_struct *task)
{
	return false;
}
924+
925+
#endif /* !CONFIG_CGROUPS */
926+
884927
#endif /* _LINUX_CGROUP_H */

include/linux/sched.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,8 @@ struct task_struct {
726726
#ifdef CONFIG_CGROUPS
727727
/* disallow userland-initiated cgroup migration */
728728
unsigned no_cgroup_migration:1;
729+
/* task is frozen/stopped (used by the cgroup freezer) */
730+
unsigned frozen:1;
729731
#endif
730732
#ifdef CONFIG_BLK_CGROUP
731733
/* to be used once the psi infrastructure lands upstream. */

include/linux/sched/jobctl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct task_struct;
1818
#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
1919
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
2020
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
21+
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */
2122

2223
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
2324
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
@@ -26,6 +27,7 @@ struct task_struct;
2627
#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
2728
#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
2829
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
30+
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)
2931

3032
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
3133
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)

kernel/cgroup/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# SPDX-License-Identifier: GPL-2.0
2-
obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
2+
obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o
33

44
obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
55
obj-$(CONFIG_CGROUP_PIDS) += pids.o

kernel/cgroup/cgroup.c

Lines changed: 106 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,8 +2435,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
24352435
get_css_set(to_cset);
24362436
to_cset->nr_tasks++;
24372437
css_set_move_task(task, from_cset, to_cset, true);
2438-
put_css_set_locked(from_cset);
24392438
from_cset->nr_tasks--;
2439+
/*
2440+
* If the source or destination cgroup is frozen,
2441+
* the task might require to change its state.
2442+
*/
2443+
cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
2444+
to_cset->dfl_cgrp);
2445+
put_css_set_locked(from_cset);
2446+
24402447
}
24412448
}
24422449
spin_unlock_irq(&css_set_lock);
@@ -3477,8 +3484,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
34773484

34783485
static int cgroup_events_show(struct seq_file *seq, void *v)
34793486
{
3480-
seq_printf(seq, "populated %d\n",
3481-
cgroup_is_populated(seq_css(seq)->cgroup));
3487+
struct cgroup *cgrp = seq_css(seq)->cgroup;
3488+
3489+
seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3490+
seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3491+
34823492
return 0;
34833493
}
34843494

@@ -3540,6 +3550,40 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
35403550
}
35413551
#endif
35423552

3553+
static int cgroup_freeze_show(struct seq_file *seq, void *v)
3554+
{
3555+
struct cgroup *cgrp = seq_css(seq)->cgroup;
3556+
3557+
seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3558+
3559+
return 0;
3560+
}
3561+
3562+
/*
 * Write handler for cgroup.freeze.
 *
 * @of:     open kernfs file the write arrived on
 * @buf:    user-supplied text; surrounding whitespace is stripped in place
 * @nbytes: number of bytes written
 * @off:    file offset (not used)
 *
 * The input must parse as an integer equal to 0 or 1; that value is
 * handed to cgroup_freeze() while the cgroup is held via
 * cgroup_kn_lock_live().
 *
 * Returns @nbytes on success, the kstrtoint() error if the text is not
 * an integer, -ERANGE for any value other than 0 or 1, and -ENOENT if
 * cgroup_kn_lock_live() does not return a cgroup.
 */
static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *target;
	int requested;
	int err;

	err = kstrtoint(strstrip(buf), 0, &requested);
	if (err)
		return err;

	/* Only the two boolean spellings are accepted. */
	if (requested != 0 && requested != 1)
		return -ERANGE;

	target = cgroup_kn_lock_live(of->kn, false);
	if (!target)
		return -ENOENT;

	cgroup_freeze(target, requested);
	cgroup_kn_unlock(of->kn);

	return nbytes;
}
3586+
35433587
static int cgroup_file_open(struct kernfs_open_file *of)
35443588
{
35453589
struct cftype *cft = of->kn->priv;
@@ -4683,6 +4727,12 @@ static struct cftype cgroup_base_files[] = {
46834727
.name = "cgroup.stat",
46844728
.seq_show = cgroup_stat_show,
46854729
},
4730+
{
4731+
.name = "cgroup.freeze",
4732+
.flags = CFTYPE_NOT_ON_ROOT,
4733+
.seq_show = cgroup_freeze_show,
4734+
.write = cgroup_freeze_write,
4735+
},
46864736
{
46874737
.name = "cpu.stat",
46884738
.flags = CFTYPE_NOT_ON_ROOT,
@@ -5033,12 +5083,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
50335083
if (ret)
50345084
goto out_psi_free;
50355085

5086+
/*
5087+
* New cgroup inherits effective freeze counter, and
5088+
* if the parent has to be frozen, the child has too.
5089+
*/
5090+
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5091+
if (cgrp->freezer.e_freeze)
5092+
set_bit(CGRP_FROZEN, &cgrp->flags);
5093+
50365094
spin_lock_irq(&css_set_lock);
50375095
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
50385096
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
50395097

5040-
if (tcgrp != cgrp)
5098+
if (tcgrp != cgrp) {
50415099
tcgrp->nr_descendants++;
5100+
5101+
/*
5102+
* If the new cgroup is frozen, all ancestor cgroups
5103+
* get a new frozen descendant, but their state can't
5104+
* change because of this.
5105+
*/
5106+
if (cgrp->freezer.e_freeze)
5107+
tcgrp->freezer.nr_frozen_descendants++;
5108+
}
50425109
}
50435110
spin_unlock_irq(&css_set_lock);
50445111

@@ -5329,6 +5396,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
53295396
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
53305397
tcgrp->nr_descendants--;
53315398
tcgrp->nr_dying_descendants++;
5399+
/*
5400+
* If the dying cgroup is frozen, decrease frozen descendants
5401+
* counters of ancestor cgroups.
5402+
*/
5403+
if (test_bit(CGRP_FROZEN, &cgrp->flags))
5404+
tcgrp->freezer.nr_frozen_descendants--;
53325405
}
53335406
spin_unlock_irq(&css_set_lock);
53345407

@@ -5782,6 +5855,29 @@ void cgroup_post_fork(struct task_struct *child)
57825855
cset->nr_tasks++;
57835856
css_set_move_task(child, NULL, cset, false);
57845857
}
5858+
5859+
/*
5860+
* If the cgroup has to be frozen, the new task has too.
5861+
* Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
5862+
* the task into the frozen state.
5863+
*/
5864+
if (unlikely(cgroup_task_freeze(child))) {
5865+
struct cgroup *cgrp;
5866+
5867+
spin_lock(&child->sighand->siglock);
5868+
WARN_ON_ONCE(child->frozen);
5869+
cgrp = cset->dfl_cgrp;
5870+
child->jobctl |= JOBCTL_TRAP_FREEZE;
5871+
spin_unlock(&child->sighand->siglock);
5872+
5873+
/*
5874+
* Calling cgroup_update_frozen() isn't required here,
5875+
* because it will be called anyway a bit later
5876+
* from do_freezer_trap(). So we avoid cgroup's
5877+
* transient switch from the frozen state and back.
5878+
*/
5879+
}
5880+
57855881
spin_unlock_irq(&css_set_lock);
57865882
}
57875883

@@ -5830,6 +5926,12 @@ void cgroup_exit(struct task_struct *tsk)
58305926
spin_lock_irq(&css_set_lock);
58315927
css_set_move_task(tsk, cset, NULL, false);
58325928
cset->nr_tasks--;
5929+
5930+
if (unlikely(cgroup_task_frozen(tsk)))
5931+
cgroup_freezer_frozen_exit(tsk);
5932+
else if (unlikely(cgroup_task_freeze(tsk)))
5933+
cgroup_update_frozen(task_dfl_cgroup(tsk));
5934+
58335935
spin_unlock_irq(&css_set_lock);
58345936
} else {
58355937
get_css_set(cset);

0 commit comments

Comments
 (0)