
Commit d86adb4

sched_ext: Add cpuperf support
sched_ext currently does not integrate with schedutil. When schedutil is the governor, frequencies are left unregulated and usually get stuck close to the highest performance level from running RT tasks.

Add CPU performance monitoring and scaling support by integrating into schedutil. The following kfuncs are added:

- scx_bpf_cpuperf_cap(): Query the relative performance capacity of different CPUs in the system.

- scx_bpf_cpuperf_cur(): Query the current performance level of a CPU relative to its max performance.

- scx_bpf_cpuperf_set(): Set the current target performance level of a CPU.

This gives the BPF scheduler direct control over CPU performance settings. The only changes on the schedutil side are accounting for the utilization factor from sched_ext and disabling the frequency-holding heuristics, as they may not apply well to sched_ext schedulers, which may have a much weaker connection between tasks and their current / last CPU.

With cpuperf support added, there is no reason to block uclamp. Enable it while at it.

A toy implementation of cpuperf is added to scx_qmap as a demonstration of the feature.

v2: Ignore cpu_util_cfs_boost() when scx_switched_all() in sugov_get_util() to avoid factoring in a stale util metric. (Christian)

Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: David Vernet <[email protected]>
Cc: Rafael J. Wysocki <[email protected]>
Cc: Viresh Kumar <[email protected]>
Cc: Christian Loehle <[email protected]>
1 parent 8988cad commit d86adb4
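
To make the interface concrete, a BPF scheduler could drive these kfuncs from its ops.tick() callback, much like the scx_qmap demonstration further down. The following is a minimal sketch rather than code from this commit; the callback name and the weight thresholds are made up for illustration.

/*
 * Illustrative sketch (not part of this commit): pick a target performance
 * level for the running task's CPU based on the task's weight and apply it.
 */
void BPF_STRUCT_OPS(example_tick, struct task_struct *p)
{
	s32 cpu = scx_bpf_task_cpu(p);
	u32 perf;

	if (p->scx.weight >= 100)		/* default weight or higher */
		perf = SCX_CPUPERF_ONE;		/* request full performance */
	else
		perf = SCX_CPUPERF_ONE / 2;	/* low-weight tasks at half */

	scx_bpf_cpuperf_set(cpu, perf);
}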

File tree: 7 files changed, +252 -6 lines

kernel/sched/cpufreq_schedutil.c
Lines changed: 11 additions & 1 deletion

@@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
 
 static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
 {
-	unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+	unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
 
+	if (!scx_switched_all())
+		util += cpu_util_cfs_boost(sg_cpu->cpu);
 	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
 	util = max(util, boost);
 	sg_cpu->bw_min = min;
@@ -330,6 +332,14 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
 	unsigned long idle_calls;
 	bool ret;
 
+	/*
+	 * The heuristics in this function is for the fair class. For SCX, the
+	 * performance target comes directly from the BPF scheduler. Let's just
+	 * follow it.
+	 */
+	if (scx_switched_all())
+		return false;
+
 	/* if capped by uclamp_max, always update to be in compliance */
 	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
 		return false;

kernel/sched/ext.c
Lines changed: 81 additions & 2 deletions

@@ -16,6 +16,8 @@ enum scx_consts {
 	SCX_EXIT_BT_LEN		= 64,
 	SCX_EXIT_MSG_LEN	= 1024,
 	SCX_EXIT_DUMP_DFL_LEN	= 32768,
+
+	SCX_CPUPERF_ONE		= SCHED_CAPACITY_SCALE,
 };
 
 enum scx_exit_kind {
@@ -3520,7 +3522,7 @@ DEFINE_SCHED_CLASS(ext) = {
 	.update_curr		= update_curr_scx,
 
 #ifdef CONFIG_UCLAMP_TASK
-	.uclamp_enabled		= 0,
+	.uclamp_enabled		= 1,
 #endif
 };
 
@@ -4393,7 +4395,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	unsigned long timeout;
-	int i, ret;
+	int i, cpu, ret;
 
 	mutex_lock(&scx_ops_enable_mutex);
 
@@ -4442,6 +4444,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
+	for_each_possible_cpu(cpu)
+		cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+
 	/*
 	 * Keep CPUs stable during enable so that the BPF scheduler can track
 	 * online CPUs by watching ->on/offline_cpu() after ->init().
@@ -5835,6 +5840,77 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
 	ops_dump_flush();
 }
 
+/**
+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the maximum relative capacity of @cpu in relation to the most
+ * performant CPU in the system. The return value is in the range [1,
+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+{
+	if (ops_cpu_valid(cpu, NULL))
+		return arch_scale_cpu_capacity(cpu);
+	else
+		return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the current relative performance of @cpu in relation to its maximum.
+ * The return value is in the range [1, %SCX_CPUPERF_ONE].
+ *
+ * The current performance level of a CPU in relation to the maximum performance
+ * available in the system can be calculated as follows:
+ *
+ *   scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
+ *
+ * The result is in the range [1, %SCX_CPUPERF_ONE].
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+{
+	if (ops_cpu_valid(cpu, NULL))
+		return arch_scale_freq_capacity(cpu);
+	else
+		return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency.
+ *
+ * The actual performance level chosen, CPU grouping, and the overhead and
+ * latency of the operations are dependent on the hardware and cpufreq driver in
+ * use. Consult hardware and cpufreq documentation for more information. The
+ * current performance level can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(u32 cpu, u32 perf)
+{
+	if (unlikely(perf > SCX_CPUPERF_ONE)) {
+		scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+		return;
+	}
+
+	if (ops_cpu_valid(cpu, NULL)) {
+		struct rq *rq = cpu_rq(cpu);
+
+		rq->scx.cpuperf_target = perf;
+
+		rcu_read_lock_sched_notrace();
+		cpufreq_update_util(cpu_rq(cpu), 0);
+		rcu_read_unlock_sched_notrace();
+	}
+}
+
 /**
  * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
  *
@@ -6045,6 +6121,9 @@ BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
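
The kernel-doc for scx_bpf_cpuperf_cur() above gives the rule for converting a per-CPU reading to the system-wide scale. A hedged sketch of that calculation from the BPF side follows; cur_abs_perf() is a hypothetical helper, not part of this commit.

/*
 * Hypothetical helper (illustration only): absolute current performance of
 * @cpu on the system-wide [1, SCX_CPUPERF_ONE] scale. For example, a CPU
 * with capacity 512 running at 768/1024 of its own maximum yields
 * 768 * 512 / 1024 = 384.
 */
static u32 cur_abs_perf(s32 cpu)
{
	u32 cap = scx_bpf_cpuperf_cap(cpu);	/* capacity vs. the fastest CPU */
	u32 cur = scx_bpf_cpuperf_cur(cpu);	/* current level vs. @cpu's own max */

	return cur * cap / SCX_CPUPERF_ONE;
}

This is the same cur * cap / SCX_CPUPERF_ONE scaling used by monitor_cpuperf() in scx_qmap.bpf.c below.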

kernel/sched/ext.h
Lines changed: 9 additions & 0 deletions

@@ -46,6 +46,14 @@ int scx_check_setscheduler(struct task_struct *p, int policy);
 bool task_should_scx(struct task_struct *p);
 void init_sched_ext_class(void);
 
+static inline u32 scx_cpuperf_target(s32 cpu)
+{
+	if (scx_enabled())
+		return cpu_rq(cpu)->scx.cpuperf_target;
+	else
+		return 0;
+}
+
 static inline const struct sched_class *next_active_class(const struct sched_class *class)
 {
 	class++;
@@ -85,6 +93,7 @@ static inline void scx_pre_fork(struct task_struct *p) {}
 static inline int scx_fork(struct task_struct *p) { return 0; }
 static inline void scx_post_fork(struct task_struct *p) {}
 static inline void scx_cancel_fork(struct task_struct *p) {}
+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
 static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
 static inline void scx_rq_activate(struct rq *rq) {}
 static inline void scx_rq_deactivate(struct rq *rq) {}

kernel/sched/sched.h
Lines changed: 1 addition & 0 deletions

@@ -743,6 +743,7 @@ struct scx_rq {
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
 	u32			flags;
+	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
 	bool			cpu_released;
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_kick_if_idle;

tools/sched_ext/include/scx/common.bpf.h
Lines changed: 3 additions & 0 deletions

@@ -42,6 +42,9 @@ void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
 void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
 void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
 void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
+u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
+u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
 u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
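
Because the declarations above are marked __ksym __weak, a scheduler that also has to load on kernels without these kfuncs can test for their presence before calling them. A minimal sketch, assuming libbpf's bpf_ksym_exists() helper from bpf_helpers.h; try_set_cpuperf() is not part of this commit.

/* Sketch: only program the cpuperf target when the kfunc is available. */
static void try_set_cpuperf(s32 cpu, u32 perf)
{
	if (bpf_ksym_exists(scx_bpf_cpuperf_set))
		scx_bpf_cpuperf_set(cpu, perf);
}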

tools/sched_ext/scx_qmap.bpf.c
Lines changed: 139 additions & 3 deletions

@@ -68,6 +68,18 @@ struct {
 	},
 };
 
+/*
+ * If enabled, CPU performance target is set according to the queue index
+ * according to the following table.
+ */
+static const u32 qidx_to_cpuperf_target[] = {
+	[0] = SCX_CPUPERF_ONE * 0 / 4,
+	[1] = SCX_CPUPERF_ONE * 1 / 4,
+	[2] = SCX_CPUPERF_ONE * 2 / 4,
+	[3] = SCX_CPUPERF_ONE * 3 / 4,
+	[4] = SCX_CPUPERF_ONE * 4 / 4,
+};
+
 /*
  * Per-queue sequence numbers to implement core-sched ordering.
  *
@@ -95,6 +107,8 @@ struct {
 struct cpu_ctx {
 	u64	dsp_idx;	/* dispatch index */
 	u64	dsp_cnt;	/* remaining count */
+	u32	avg_weight;
+	u32	cpuperf_target;
 };
 
 struct {
@@ -107,6 +121,8 @@ struct {
 /* Statistics */
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
 u64 nr_core_sched_execed;
+u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
 
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
@@ -313,6 +329,29 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 	}
 }
 
+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
+{
+	struct cpu_ctx *cpuc;
+	u32 zero = 0;
+	int idx;
+
+	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+		scx_bpf_error("failed to look up cpu_ctx");
+		return;
+	}
+
+	/*
+	 * Use the running avg of weights to select the target cpuperf level.
+	 * This is a demonstration of the cpuperf feature rather than a
+	 * practical strategy to regulate CPU frequency.
+	 */
+	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
+	idx = weight_to_idx(cpuc->avg_weight);
+	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
+
+	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+}
+
 /*
  * The distance from the head of the queue scaled by the weight of the queue.
  * The lower the number, the older the task and the higher the priority.
@@ -422,8 +461,9 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
 	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
 		return;
 
-	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
-		     cpuc->dsp_idx, cpuc->dsp_cnt);
+	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
+		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
+		     cpuc->cpuperf_target);
 }
 
 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
@@ -492,11 +532,106 @@ void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
 	print_cpus();
 }
 
+struct monitor_timer {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct monitor_timer);
+} monitor_timer SEC(".maps");
+
+/*
+ * Print out the min, avg and max performance levels of CPUs every second to
+ * demonstrate the cpuperf interface.
+ */
+static void monitor_cpuperf(void)
+{
+	u32 zero = 0, nr_cpu_ids;
+	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
+	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
+	const struct cpumask *online;
+	int i, nr_online_cpus = 0;
+
+	nr_cpu_ids = scx_bpf_nr_cpu_ids();
+	online = scx_bpf_get_online_cpumask();
+
+	bpf_for(i, 0, nr_cpu_ids) {
+		struct cpu_ctx *cpuc;
+		u32 cap, cur;
+
+		if (!bpf_cpumask_test_cpu(i, online))
+			continue;
+		nr_online_cpus++;
+
+		/* collect the capacity and current cpuperf */
+		cap = scx_bpf_cpuperf_cap(i);
+		cur = scx_bpf_cpuperf_cur(i);
+
+		cur_min = cur < cur_min ? cur : cur_min;
+		cur_max = cur > cur_max ? cur : cur_max;
+
+		/*
+		 * $cur is relative to $cap. Scale it down accordingly so that
+		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
+		 * makes sense.
+		 */
+		cur_sum += cur * cap / SCX_CPUPERF_ONE;
+		cap_sum += cap;
+
+		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
+			scx_bpf_error("failed to look up cpu_ctx");
+			goto out;
+		}
+
+		/* collect target */
+		cur = cpuc->cpuperf_target;
+		target_sum += cur;
+		target_min = cur < target_min ? cur : target_min;
+		target_max = cur > target_max ? cur : target_max;
+	}
+
+	cpuperf_min = cur_min;
+	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+	cpuperf_max = cur_max;
+
+	cpuperf_target_min = target_min;
+	cpuperf_target_avg = target_sum / nr_online_cpus;
+	cpuperf_target_max = target_max;
+out:
+	scx_bpf_put_cpumask(online);
+}
+
+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+	monitor_cpuperf();
+
+	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+	return 0;
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 {
+	u32 key = 0;
+	struct bpf_timer *timer;
+	s32 ret;
+
 	print_cpus();
 
-	return scx_bpf_create_dsq(SHARED_DSQ, -1);
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	timer = bpf_map_lookup_elem(&monitor_timer, &key);
+	if (!timer)
+		return -ESRCH;
+
+	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
+	bpf_timer_set_callback(timer, monitor_timerfn);
+
+	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 }
 
 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
@@ -509,6 +644,7 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .enqueue		= (void *)qmap_enqueue,
 	       .dequeue		= (void *)qmap_dequeue,
 	       .dispatch	= (void *)qmap_dispatch,
+	       .tick		= (void *)qmap_tick,
 	       .core_sched_before = (void *)qmap_core_sched_before,
 	       .cpu_release	= (void *)qmap_cpu_release,
 	       .init_task	= (void *)qmap_init_task,