Skip to content

Commit 8195136

Browse files
committed
sched_ext: Add cgroup support
Add sched_ext_ops operations to init/exit cgroups, and track task migrations and config changes. A BPF scheduler may not implement cgroup features, or implement only a subset of them. The implemented features can be indicated using %SCX_OPS_HAS_CGROUP_* flags. If the cgroup configuration makes use of features that are not implemented, a warning is triggered. While a BPF scheduler is being enabled and disabled, relevant cgroup operations are locked out using scx_cgroup_rwsem. This avoids situations like task prep taking place while the task is being moved across cgroups, making things easier for BPF schedulers. v7: - cgroup interface file visibility toggling is dropped in favor of just warning messages. Dynamically changing interface visibility caused more confusion than it helped. v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE. - Updated to use CONFIG_GROUP_SCHED_WEIGHT and fixes for !CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED. v5: - Flipped the locking order between scx_cgroup_rwsem and cpus_read_lock() to avoid a locking order conflict w/ cpuset. Better documentation around locking. - sched_move_task() takes an early exit if the source and destination are identical. This triggered the warning in scx_cgroup_can_attach() as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup migration path so that ops.cgroup_prep_move() is skipped for identity migrations so that its invocations always match ops.cgroup_move() one-to-one. v4: - Example schedulers moved into their own patches. - Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi. v3: - Make scx_example_pair switch all tasks by default. - Convert to BPF inline iterators. - scx_bpf_task_cgroup() is added to determine the current cgroup from the CPU controller's POV. This allows BPF schedulers to accurately track CPU cgroup membership. - scx_example_flatcg added. 
This demonstrates a flattened-hierarchy implementation of CPU cgroup control and shows a significant performance improvement when cgroups nested multiple levels deep are under competition. v2: - Build fixes for different CONFIG combinations. Signed-off-by: Tejun Heo <[email protected]> Reviewed-by: David Vernet <[email protected]> Acked-by: Josh Don <[email protected]> Acked-by: Hao Luo <[email protected]> Acked-by: Barret Rhoden <[email protected]> Reported-by: kernel test robot <[email protected]> Cc: Andrea Righi <[email protected]>
1 parent e179e80 commit 8195136

File tree

8 files changed

+636
-19
lines changed

8 files changed

+636
-19
lines changed

include/linux/sched/ext.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ struct sched_ext_entity {
188188
bool disallow; /* reject switching into SCX */
189189

190190
/* cold fields */
191+
#ifdef CONFIG_EXT_GROUP_SCHED
192+
struct cgroup *cgrp_moving_from;
193+
#endif
191194
/* must be the last field, see init_scx_entity() */
192195
struct list_head tasks_node;
193196
};

init/Kconfig

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,6 +1055,12 @@ config RT_GROUP_SCHED
10551055
realtime bandwidth for them.
10561056
See Documentation/scheduler/sched-rt-group.rst for more information.
10571057

1058+
config EXT_GROUP_SCHED
1059+
bool
1060+
depends on SCHED_CLASS_EXT && CGROUP_SCHED
1061+
select GROUP_SCHED_WEIGHT
1062+
default y
1063+
10581064
endif #CGROUP_SCHED
10591065

10601066
config SCHED_MM_CID

kernel/sched/core.c

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8364,6 +8364,9 @@ void __init sched_init(void)
83648364
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
83658365
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
83668366
#endif /* CONFIG_FAIR_GROUP_SCHED */
8367+
#ifdef CONFIG_EXT_GROUP_SCHED
8368+
root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
8369+
#endif /* CONFIG_EXT_GROUP_SCHED */
83678370
#ifdef CONFIG_RT_GROUP_SCHED
83688371
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
83698372
ptr += nr_cpu_ids * sizeof(void **);
@@ -8801,6 +8804,7 @@ struct task_group *sched_create_group(struct task_group *parent)
88018804
if (!alloc_rt_sched_group(tg, parent))
88028805
goto err;
88038806

8807+
scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
88048808
alloc_uclamp_sched_group(tg, parent);
88058809

88068810
return tg;
@@ -8928,6 +8932,7 @@ void sched_move_task(struct task_struct *tsk)
89288932
put_prev_task(rq, tsk);
89298933

89308934
sched_change_group(tsk, group);
8935+
scx_move_task(tsk);
89318936

89328937
if (queued)
89338938
enqueue_task(rq, tsk, queue_flags);
@@ -8965,6 +8970,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
89658970
{
89668971
struct task_group *tg = css_tg(css);
89678972
struct task_group *parent = css_tg(css->parent);
8973+
int ret;
8974+
8975+
ret = scx_tg_online(tg);
8976+
if (ret)
8977+
return ret;
89688978

89698979
if (parent)
89708980
sched_online_group(tg, parent);
@@ -8979,6 +8989,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
89798989
return 0;
89808990
}
89818991

8992+
static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
8993+
{
8994+
struct task_group *tg = css_tg(css);
8995+
8996+
scx_tg_offline(tg);
8997+
}
8998+
89828999
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
89839000
{
89849001
struct task_group *tg = css_tg(css);
@@ -8996,19 +9013,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
89969013
sched_unregister_group(tg);
89979014
}
89989015

8999-
#ifdef CONFIG_RT_GROUP_SCHED
90009016
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
90019017
{
9018+
#ifdef CONFIG_RT_GROUP_SCHED
90029019
struct task_struct *task;
90039020
struct cgroup_subsys_state *css;
90049021

90059022
cgroup_taskset_for_each(task, css, tset) {
90069023
if (!sched_rt_can_attach(css_tg(css), task))
90079024
return -EINVAL;
90089025
}
9009-
return 0;
9010-
}
90119026
#endif
9027+
return scx_cgroup_can_attach(tset);
9028+
}
90129029

90139030
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
90149031
{
@@ -9017,6 +9034,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
90179034

90189035
cgroup_taskset_for_each(task, css, tset)
90199036
sched_move_task(task);
9037+
9038+
scx_cgroup_finish_attach();
9039+
}
9040+
9041+
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
9042+
{
9043+
scx_cgroup_cancel_attach(tset);
90209044
}
90219045

90229046
#ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -9196,15 +9220,25 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
91969220
#ifdef CONFIG_GROUP_SCHED_WEIGHT
91979221
static unsigned long tg_weight(struct task_group *tg)
91989222
{
9223+
#ifdef CONFIG_FAIR_GROUP_SCHED
91999224
return scale_load_down(tg->shares);
9225+
#else
9226+
return sched_weight_from_cgroup(tg->scx_weight);
9227+
#endif
92009228
}
92019229

92029230
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
92039231
struct cftype *cftype, u64 shareval)
92049232
{
9233+
int ret;
9234+
92059235
if (shareval > scale_load_down(ULONG_MAX))
92069236
shareval = MAX_SHARES;
9207-
return sched_group_set_shares(css_tg(css), scale_load(shareval));
9237+
ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
9238+
if (!ret)
9239+
scx_group_set_weight(css_tg(css),
9240+
sched_weight_to_cgroup(shareval));
9241+
return ret;
92089242
}
92099243

92109244
static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
@@ -9595,7 +9629,12 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
95959629
static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
95969630
struct cftype *cft, s64 idle)
95979631
{
9598-
return sched_group_set_idle(css_tg(css), idle);
9632+
int ret;
9633+
9634+
ret = sched_group_set_idle(css_tg(css), idle);
9635+
if (!ret)
9636+
scx_group_set_idle(css_tg(css), idle);
9637+
return ret;
95999638
}
96009639
#endif
96019640

@@ -9722,13 +9761,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
97229761
struct cftype *cft, u64 cgrp_weight)
97239762
{
97249763
unsigned long weight;
9764+
int ret;
97259765

97269766
if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
97279767
return -ERANGE;
97289768

97299769
weight = sched_weight_from_cgroup(cgrp_weight);
97309770

9731-
return sched_group_set_shares(css_tg(css), scale_load(weight));
9771+
ret = sched_group_set_shares(css_tg(css), scale_load(weight));
9772+
if (!ret)
9773+
scx_group_set_weight(css_tg(css), cgrp_weight);
9774+
return ret;
97329775
}
97339776

97349777
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
@@ -9753,7 +9796,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
97539796
struct cftype *cft, s64 nice)
97549797
{
97559798
unsigned long weight;
9756-
int idx;
9799+
int idx, ret;
97579800

97589801
if (nice < MIN_NICE || nice > MAX_NICE)
97599802
return -ERANGE;
@@ -9762,7 +9805,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
97629805
idx = array_index_nospec(idx, 40);
97639806
weight = sched_prio_to_weight[idx];
97649807

9765-
return sched_group_set_shares(css_tg(css), scale_load(weight));
9808+
ret = sched_group_set_shares(css_tg(css), scale_load(weight));
9809+
if (!ret)
9810+
scx_group_set_weight(css_tg(css),
9811+
sched_weight_to_cgroup(weight));
9812+
return ret;
97669813
}
97679814
#endif /* CONFIG_GROUP_SCHED_WEIGHT */
97689815

@@ -9878,14 +9925,14 @@ static struct cftype cpu_files[] = {
98789925
struct cgroup_subsys cpu_cgrp_subsys = {
98799926
.css_alloc = cpu_cgroup_css_alloc,
98809927
.css_online = cpu_cgroup_css_online,
9928+
.css_offline = cpu_cgroup_css_offline,
98819929
.css_released = cpu_cgroup_css_released,
98829930
.css_free = cpu_cgroup_css_free,
98839931
.css_extra_stat_show = cpu_extra_stat_show,
98849932
.css_local_stat_show = cpu_local_stat_show,
9885-
#ifdef CONFIG_RT_GROUP_SCHED
98869933
.can_attach = cpu_cgroup_can_attach,
9887-
#endif
98889934
.attach = cpu_cgroup_attach,
9935+
.cancel_attach = cpu_cgroup_cancel_attach,
98899936
.legacy_cftypes = cpu_legacy_files,
98909937
.dfl_cftypes = cpu_files,
98919938
.early_init = true,

0 commit comments

Comments
 (0)