Commit dad1c12
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Remove the unused per-rq load array and all its infrastructure, by Dietmar Eggemann.

 - Add utilization clamping support, by Patrick Bellasi. This is a refinement of the energy-aware scheduling framework with support for boosting interactive workloads and capping background ones: it makes sure critical GUI threads get to maximum frequency ASAP, and that background processing doesn't unnecessarily push the cpufreq governor to higher frequencies and less energy-efficient CPU modes.

 - Add the bare minimum of tracepoints required for LISA EAS regression testing, by Qais Yousef - which allows automated testing of various power-management features, including energy-aware scheduling.

 - Restructure the former tsk_nr_cpus_allowed() facility that the -rt kernel used to modify the scheduler's CPU-affinity logic, such as migrate_disable(): introduce the task->cpus_ptr value instead of taking the address of &task->cpus_allowed directly - by Sebastian Andrzej Siewior.

 - Misc optimizations, fixes, cleanups and small enhancements - see the Git log for details.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  sched/uclamp: Add uclamp support to energy_compute()
  sched/uclamp: Add uclamp_util_with()
  sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks
  sched/uclamp: Set default clamps for RT tasks
  sched/uclamp: Reset uclamp values on RESET_ON_FORK
  sched/uclamp: Extend sched_setattr() to support utilization clamping
  sched/core: Allow sched_setattr() to use the current policy
  sched/uclamp: Add system default clamps
  sched/uclamp: Enforce last task's UCLAMP_MAX
  sched/uclamp: Add bucket local max tracking
  sched/uclamp: Add CPU's clamp buckets refcounting
  sched/fair: Rename weighted_cpuload() to cpu_runnable_load()
  sched/debug: Export the newly added tracepoints
  sched/debug: Add sched_overutilized tracepoint
  sched/debug: Add new tracepoint to track PELT at se level
  sched/debug: Add new tracepoints to track PELT at rq level
  sched/debug: Add a new sched_trace_*() helper functions
  sched/autogroup: Make autogroup_path() always available
  sched/wait: Deduplicate code with do-while
  sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()
  ...
2 parents 090bc5a + af24bde commit dad1c12
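Utilization clamps are requested from user-space through the extended sched_setattr() ABI this series introduces. Below is a minimal sketch of boosting the calling task, assuming the new uapi values added here (sched_util_min/sched_util_max appended to struct sched_attr; SCHED_FLAG_KEEP_POLICY, SCHED_FLAG_UTIL_CLAMP_MIN and SCHED_FLAG_UTIL_CLAMP_MAX as 0x08, 0x20 and 0x40); glibc has no wrapper, so the raw syscall is used:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local copy of the extended sched_attr; layout per this series' uapi. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* new in this series */
	uint32_t sched_util_max;	/* new in this series */
};

#define SCHED_FLAG_KEEP_POLICY		0x08
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_flags = SCHED_FLAG_KEEP_POLICY |
			   SCHED_FLAG_UTIL_CLAMP_MIN |
			   SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 205;	/* boost: ~20% of SCHED_CAPACITY_SCALE (1024) */
	attr.sched_util_max = 1024;	/* no cap */

	if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0))
		perror("sched_setattr");
	return 0;
}

With SCHED_FLAG_KEEP_POLICY (added by 'sched/core: Allow sched_setattr() to use the current policy'), the clamps can be changed without respecifying the task's scheduling policy.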

49 files changed: +1216, -618 lines

Documentation/scheduler/sched-pelt.c

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void)
 	int i;
 	unsigned int x;

-	printf("static const u32 runnable_avg_yN_inv[] = {");
+	/* To silence -Wunused-but-set-variable warnings. */
+	printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {");
 	for (i = 0; i < HALFLIFE; i++) {
 		x = ((1UL<<32)-1)*pow(y, i);

arch/arm/kernel/topology.c

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu)
 	topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity);

 	pr_info("CPU%u: update cpu_capacity %lu\n",
-		cpu, topology_get_cpu_scale(NULL, cpu));
+		cpu, topology_get_cpu_scale(cpu));
 }

 #else

arch/ia64/kernel/mca.c

Lines changed: 1 addition & 1 deletion
@@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 	ti->cpu = cpu;
 	p->stack = ti;
 	p->state = TASK_UNINTERRUPTIBLE;
-	cpumask_set_cpu(cpu, &p->cpus_allowed);
+	cpumask_set_cpu(cpu, &p->cpus_mask);
 	INIT_LIST_HEAD(&p->tasks);
 	p->parent = p->real_parent = p->group_leader = p;
 	INIT_LIST_HEAD(&p->children);

arch/mips/include/asm/switch_to.h

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
  * inline to try to keep the overhead down. If we have been forced to run on
  * a "CPU" with an FPU because of a previous high level of FP computation,
  * but did not actually use the FPU during the most recent time-slice (CU1
- * isn't set), we undo the restriction on cpus_allowed.
+ * isn't set), we undo the restriction on cpus_mask.
  *
  * We're not calling set_cpus_allowed() here, because we have no need to
  * force prompt migration - we're already switching the current CPU to a
@@ -57,7 +57,7 @@ do {	\
 	    test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) &&	\
 	    (!(KSTK_STATUS(prev) & ST0_CU1))) {	\
 		clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND);	\
-		prev->cpus_allowed = prev->thread.user_cpus_allowed;	\
+		prev->cpus_mask = prev->thread.user_cpus_allowed;	\
 	}	\
 	next->thread.emulated_fp = 0;	\
 } while(0)

arch/mips/kernel/mips-mt-fpaff.c

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
 	if (retval)
 		goto out_unlock;

-	cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
+	cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
 	cpumask_and(&mask, &allowed, cpu_active_mask);

 out_unlock:

arch/mips/kernel/traps.c

Lines changed: 3 additions & 3 deletions
@@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void)
 	 * restricted the allowed set to exclude any CPUs with FPUs,
 	 * we'll skip the procedure.
 	 */
-	if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
+	if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
 		cpumask_t tmask;

 		current->thread.user_cpus_allowed
-			= current->cpus_allowed;
-		cpumask_and(&tmask, &current->cpus_allowed,
+			= current->cpus_mask;
+		cpumask_and(&tmask, &current->cpus_mask,
 			    &mt_fpu_cpumask);
 		set_cpus_allowed_ptr(current, &tmask);
 		set_thread_flag(TIF_FPUBOUND);

arch/powerpc/platforms/cell/spufs/sched.c

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	 * runqueue. The context will be rescheduled on the proper node
 	 * if it is timesliced or preempted.
 	 */
-	cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
+	cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);

 	/* Save the current cpu id for spu interrupt routing. */
 	ctx->last_ran = raw_smp_processor_id();

arch/x86/kernel/cpu/resctrl/pseudo_lock.c

Lines changed: 1 addition & 1 deletion
@@ -1503,7 +1503,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
 	 * may be scheduled elsewhere and invalidate entries in the
 	 * pseudo-locked region.
 	 */
-	if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
+	if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
 		mutex_unlock(&rdtgroup_mutex);
 		return -EINVAL;
 	}

drivers/base/arch_topology.c

Lines changed: 3 additions & 3 deletions
@@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev,
 {
 	struct cpu *cpu = container_of(dev, struct cpu, dev);

-	return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id));
+	return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
 }

 static void update_topology_flags_workfn(struct work_struct *work);
@@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void)
 			/ capacity_scale;
 		topology_set_cpu_scale(cpu, capacity);
 		pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
-			 cpu, topology_get_cpu_scale(NULL, cpu));
+			 cpu, topology_get_cpu_scale(cpu));
 	}
 }

@@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
 	cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);

 	for_each_cpu(cpu, policy->related_cpus) {
-		raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) *
+		raw_capacity[cpu] = topology_get_cpu_scale(cpu) *
 			policy->cpuinfo.max_freq / 1000UL;
 		capacity_scale = max(raw_capacity[cpu], capacity_scale);
 	}

drivers/infiniband/hw/hfi1/affinity.c

Lines changed: 3 additions & 3 deletions
@@ -1038,15 +1038,15 @@ int hfi1_get_proc_affinity(int node)
 	struct hfi1_affinity_node *entry;
 	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
 	const struct cpumask *node_mask,
-		*proc_mask = &current->cpus_allowed;
+		*proc_mask = current->cpus_ptr;
 	struct hfi1_affinity_node_list *affinity = &node_affinity;
 	struct cpu_mask_set *set = &affinity->proc;

 	/*
 	 * check whether process/context affinity has already
 	 * been set
 	 */
-	if (cpumask_weight(proc_mask) == 1) {
+	if (current->nr_cpus_allowed == 1) {
 		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
 			  current->pid, current->comm,
 			  cpumask_pr_args(proc_mask));
@@ -1057,7 +1057,7 @@ int hfi1_get_proc_affinity(int node)
 		cpu = cpumask_first(proc_mask);
 		cpumask_set_cpu(cpu, &set->used);
 		goto done;
-	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
 		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
 			  current->pid, current->comm,
 			  cpumask_pr_args(proc_mask));
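The driver conversions above also swap cpumask_weight() checks for the scheduler-maintained nr_cpus_allowed counter, which the scheduler updates whenever affinity changes. A sketch of the resulting pattern (task_is_pinned() is a hypothetical helper, not from the patch):

#include <linux/sched.h>

/* Hypothetical helper: is @p pinned to exactly one CPU? */
static inline bool task_is_pinned(struct task_struct *p)
{
	/* O(1) counter instead of cpumask_weight()'s bitmap scan. */
	return p->nr_cpus_allowed == 1;
}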

drivers/infiniband/hw/hfi1/sdma.c

Lines changed: 1 addition & 2 deletions
@@ -869,14 +869,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
 {
 	struct sdma_rht_node *rht_node;
 	struct sdma_engine *sde = NULL;
-	const struct cpumask *current_mask = &current->cpus_allowed;
 	unsigned long cpu_id;

 	/*
 	 * To ensure that always the same sdma engine(s) will be
 	 * selected make sure the process is pinned to this CPU only.
 	 */
-	if (cpumask_weight(current_mask) != 1)
+	if (current->nr_cpus_allowed != 1)
 		goto out;

 	cpu_id = smp_processor_id();

drivers/infiniband/hw/qib/qib_file_ops.c

Lines changed: 3 additions & 4 deletions
@@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt)
 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
 {
 	struct qib_filedata *fd = fp->private_data;
-	const unsigned int weight = cpumask_weight(&current->cpus_allowed);
+	const unsigned int weight = current->nr_cpus_allowed;
 	const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
 	int local_cpu;

@@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
 		ret = find_free_ctxt(i_minor - 1, fp, uinfo);
 	else {
 		int unit;
-		const unsigned int cpu = cpumask_first(&current->cpus_allowed);
-		const unsigned int weight =
-			cpumask_weight(&current->cpus_allowed);
+		const unsigned int cpu = cpumask_first(current->cpus_ptr);
+		const unsigned int weight = current->nr_cpus_allowed;

 		if (weight == 1 && !test_bit(cpu, qib_cpulist))
 			if (!find_hca(cpu, &unit) && unit >= 0)

fs/proc/array.c

Lines changed: 2 additions & 2 deletions
@@ -381,9 +381,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 }

 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)

include/linux/arch_topology.h

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale);

 struct sched_domain;
 static inline
-unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
+unsigned long topology_get_cpu_scale(int cpu)
 {
 	return per_cpu(cpu_scale, cpu);
 }

include/linux/energy_model.h

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
 	 * like schedutil.
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(cpu);
 	cs = &pd->table[pd->nr_cap_states - 1];
 	freq = map_util_freq(max_util, cs->frequency, scale_cpu);
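For reference, map_util_freq() applies the schedutil formula: the target frequency is 1.25 * freq * util / capacity, where the 25% margin keeps the CPU ahead of demand. A sketch of the mapping, modeled on the definition in include/linux/sched/cpufreq.h:

/* next_freq = 1.25 * freq * util / cap  (the >> 2 adds the 25% margin). */
static inline unsigned long sketch_map_util_freq(unsigned long util,
						 unsigned long freq,
						 unsigned long cap)
{
	return (freq + (freq >> 2)) * util / cap;
}

em_pd_energy() then walks the capacity-state table to find the lowest state whose frequency is at or above the result when estimating energy.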

include/linux/log2.h

Lines changed: 34 additions & 0 deletions
@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
 		ilog2((n) - 1) + 1) :		\
 	__order_base_2(n)			\
 )
+
+static inline __attribute__((const))
+int __bits_per(unsigned long n)
+{
+	if (n < 2)
+		return 1;
+	if (is_power_of_2(n))
+		return order_base_2(n) + 1;
+	return order_base_2(n);
+}
+
+/**
+ * bits_per - calculate the number of bits required for the argument
+ * @n: parameter
+ *
+ * This is constant-capable and can be used for compile time
+ * initializations, e.g bitfields.
+ *
+ * The first few values calculated by this routine:
+ *  bf(0) = 1
+ *  bf(1) = 1
+ *  bf(2) = 2
+ *  bf(3) = 2
+ *  bf(4) = 3
+ *  ... and so on.
+ */
+#define bits_per(n)				\
+(						\
+	__builtin_constant_p(n) ? (		\
+		((n) == 0 || (n) == 1)		\
+			? 1 : ilog2(n) + 1	\
+	) :					\
+	__bits_per(n)				\
+)
 #endif /* _LINUX_LOG2_H */
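Note the semantics: bits_per(n) is the number of bits needed to store the value n itself, not n distinct states, which is what lets a bitfield sized with it hold the full inclusive range 0..n. A small illustration (hypothetical struct, not from the patch):

#include <linux/build_bug.h>
#include <linux/log2.h>

struct sample {
	/* bits_per(100) == 7: values 0..127 cover the required 0..100. */
	unsigned int counter : bits_per(100);
};

/* bits_per(1024) == 11, so a field can hold 0..1024 inclusive. */
static_assert(bits_per(1024) == 11);

This is exactly what the new uclamp_se bitfields in sched.h below rely on: bits_per(SCHED_CAPACITY_SCALE) yields 11 bits, enough for any clamp value from 0 to 1024 inclusive.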

include/linux/sched.h

Lines changed: 76 additions & 3 deletions
@@ -35,6 +35,7 @@ struct audit_context;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
+struct capture_control;
 struct cfs_rq;
 struct fs_struct;
 struct futex_pi_state;
@@ -47,8 +48,9 @@ struct pid_namespace;
 struct pipe_inode_info;
 struct rcu_node;
 struct reclaim_state;
-struct capture_control;
 struct robust_list_head;
+struct root_domain;
+struct rq;
 struct sched_attr;
 struct sched_param;
 struct seq_file;
@@ -281,6 +283,18 @@ struct vtime {
 	u64			gtime;
 };

+/*
+ * Utilization clamp constraints.
+ * @UCLAMP_MIN:	Minimum utilization
+ * @UCLAMP_MAX:	Maximum utilization
+ * @UCLAMP_CNT:	Utilization clamp constraints count
+ */
+enum uclamp_id {
+	UCLAMP_MIN = 0,
+	UCLAMP_MAX,
+	UCLAMP_CNT
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -312,6 +326,10 @@ struct sched_info {
 # define SCHED_FIXEDPOINT_SHIFT		10
 # define SCHED_FIXEDPOINT_SCALE		(1L << SCHED_FIXEDPOINT_SHIFT)

+/* Increase resolution of cpu_capacity calculations */
+# define SCHED_CAPACITY_SHIFT		SCHED_FIXEDPOINT_SHIFT
+# define SCHED_CAPACITY_SCALE		(1L << SCHED_CAPACITY_SHIFT)
+
 struct load_weight {
 	unsigned long			weight;
 	u32				inv_weight;
@@ -560,6 +578,41 @@ struct sched_dl_entity {
 	struct hrtimer			inactive_timer;
 };

+#ifdef CONFIG_UCLAMP_TASK
+/* Number of utilization clamp buckets (shorter alias) */
+#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
+
+/*
+ * Utilization clamp for a scheduling entity
+ * @value:		clamp value "assigned" to a se
+ * @bucket_id:		bucket index corresponding to the "assigned" value
+ * @active:		the se is currently refcounted in a rq's bucket
+ * @user_defined:	the requested clamp value comes from user-space
+ *
+ * The bucket_id is the index of the clamp bucket matching the clamp value
+ * which is pre-computed and stored to avoid expensive integer divisions from
+ * the fast path.
+ *
+ * The active bit is set whenever a task has got an "effective" value assigned,
+ * which can be different from the clamp value "requested" from user-space.
+ * This allows to know a task is refcounted in the rq's bucket corresponding
+ * to the "effective" bucket_id.
+ *
+ * The user_defined bit is set whenever a task has got a task-specific clamp
+ * value requested from userspace, i.e. the system defaults apply to this task
+ * just as a restriction. This allows to relax default clamps when a less
+ * restrictive task-specific value has been requested, thus allowing to
+ * implement a "nice" semantic. For example, a task running with a 20%
+ * default boost can still drop its own boosting to 0%.
+ */
+struct uclamp_se {
+	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
+	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
+	unsigned int active		: 1;
+	unsigned int user_defined	: 1;
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
 union rcu_special {
 	struct {
 		u8			blocked;
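As the comment above notes, bucket_id is precomputed so the fast path never divides. A sketch of the mapping, modeled on the uclamp_bucket_id() helper this series adds to kernel/sched/core.c (treat the details as illustrative):

#include <linux/kernel.h>

/* Bucket width in utilization units, e.g. 1024 / 5 buckets -> ~205 each. */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	/* min_t() folds clamp_value == SCHED_CAPACITY_SCALE into the last bucket. */
	return min_t(unsigned int,
		     clamp_value / UCLAMP_BUCKET_DELTA,
		     UCLAMP_BUCKETS - 1);
}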
@@ -640,6 +693,13 @@ struct task_struct {
 #endif
 	struct sched_dl_entity		dl;

+#ifdef CONFIG_UCLAMP_TASK
+	/* Clamp values requested for a scheduling entity */
+	struct uclamp_se		uclamp_req[UCLAMP_CNT];
+	/* Effective clamp values used for a scheduling entity */
+	struct uclamp_se		uclamp[UCLAMP_CNT];
+#endif
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* List of struct preempt_notifier: */
 	struct hlist_head		preempt_notifiers;
@@ -651,7 +711,8 @@ struct task_struct {

 	unsigned int			policy;
 	int				nr_cpus_allowed;
-	cpumask_t			cpus_allowed;
+	const cpumask_t			*cpus_ptr;
+	cpumask_t			cpus_mask;

 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
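This hunk is the core of the cpus_ptr conversion seen throughout the diff: cpus_mask holds the affinity requested via sched_setaffinity(), while readers dereference cpus_ptr, which outside of -rt's migrate_disable() simply points back at the task's own cpus_mask. A sketch of the invariant (illustrative only; the real initialization lives in init_task and the fork path):

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Illustrative: the invariant established at task initialization. */
static void sketch_init_affinity(struct task_struct *p)
{
	p->cpus_ptr = &p->cpus_mask;	/* effective mask == requested mask */
	p->nr_cpus_allowed = cpumask_weight(p->cpus_ptr);
}

-rt's migrate_disable() can then temporarily repoint cpus_ptr at a single-CPU mask without disturbing the user-visible cpus_mask.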
@@ -1399,7 +1460,7 @@ extern struct pid *cad_pid;
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
 #define PF_MEMSTALL		0x01000000	/* Stalled due to lack of memory */
 #define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
-#define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
+#define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
@@ -1915,4 +1976,16 @@ static inline void rseq_syscall(struct pt_regs *regs)

 #endif

+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
+
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
+
+int sched_trace_rq_cpu(struct rq *rq);
+
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
+
 #endif
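The sched_trace_*() accessors exist so external tooling such as LISA can attach probes to the new bare tracepoints without knowledge of scheduler internals. A sketch of a probe module, assuming the pelt_cfs_tp tracepoint name this series adds and exports (error handling trimmed):

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <trace/events/sched.h>

/* Probe: called on every cfs_rq PELT update. */
static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);

	if (avg)
		pr_debug("cpu=%d util_avg=%lu\n",
			 sched_trace_cfs_rq_cpu(cfs_rq), avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");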
