
Commit 3ca9a83

Merge tag 'sched-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - The biggest change is introduction of a new iteration of the
   SCHED_FAIR interactivity code: the EEVDF ("Earliest Eligible Virtual
   Deadline First") scheduler

   EEVDF too is a virtual-time scheduler, with two parameters (weight
   and relative deadline), compared to CFS that had weight only. It
   completely reworks the base scheduler: placement, preemption,
   picking -- everything

   LWN.net, as usual, has a terrific writeup about EEVDF:

      https://lwn.net/Articles/925371/

   Preemption (both tick and wakeup) is driven by testing against a
   fresh pick. Because the tree is now effectively an interval tree,
   and the selection is no longer the 'leftmost' task, over-scheduling
   is less of a problem.

   A lot of the CFS heuristics are removed or replaced by more natural
   latency-space parameters & constructs

   In terms of expected performance regressions: we will and can fix
   everything where a 'good' workload misbehaves with the new
   scheduler, but EEVDF inevitably changes workload scheduling in a
   binary fashion, hopefully for the better in the overwhelming
   majority of cases, but in some cases it won't, especially in
   adversarial loads that got lucky with the previous code, such as
   some variants of hackbench. We are trying hard to err on the side of
   fixing all performance regressions, but we expect some inevitable
   post-release iterations of that process

 - Improve load-balancing on hybrid x86 systems: enable cluster
   scheduling (again)

 - Improve & fix bandwidth-scheduling on nohz systems

 - Improve bandwidth-throttling

 - Use lock guards to simplify and de-goto-ify control flow

 - Misc improvements, cleanups and fixes

* tag 'sched-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
  sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
  sched/eevdf: Curb wakeup-preemption
  sched: Simplify sched_core_cpu_{starting,deactivate}()
  sched: Simplify try_steal_cookie()
  sched: Simplify sched_tick_remote()
  sched: Simplify sched_exec()
  sched: Simplify ttwu()
  sched: Simplify wake_up_if_idle()
  sched: Simplify: migrate_swap_stop()
  sched: Simplify sysctl_sched_uclamp_handler()
  sched: Simplify get_nohz_timer_target()
  sched/rt: sysctl_sched_rr_timeslice show default timeslice after reset
  sched/rt: Fix sysctl_sched_rr_timeslice intial value
  sched/fair: Block nohz tick_stop when cfs bandwidth in use
  sched, cgroup: Restore meaning to hierarchical_quota
  MAINTAINERS: Add Peter explicitly to the psi section
  sched/psi: Select KERNFS as needed
  sched/topology: Align group flags when removing degenerate domain
  sched/fair: remove util_est boosting
  sched/fair: Propagate enqueue flags into place_entity()
  ...
2 parents 1a7c611 + 2f88c8e commit 3ca9a83
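
To make the EEVDF description above concrete, here is a minimal, self-contained C sketch of the idea -- not the kernel implementation. Each entity carries a weight and a requested slice; its virtual deadline is its virtual runtime plus the slice scaled into virtual time, and the pick is the eligible entity (one not ahead of the weighted-average virtual runtime) with the earliest virtual deadline. The struct, helper names and the linear scan are illustrative; the kernel keeps entities in an augmented rbtree (see the include/linux/rbtree_augmented.h change below).

/*
 * Userspace sketch of the EEVDF pick, for illustration only.
 */
#include <stdint.h>
#include <stddef.h>

struct entity {
	uint64_t vruntime;	/* virtual time already received */
	uint64_t weight;	/* share of the CPU */
	uint64_t slice;		/* requested slice, in real time (ns) */
	uint64_t deadline;	/* virtual deadline */
};

/* Scale a real-time slice into virtual time for this entity's weight. */
static uint64_t calc_vslice(const struct entity *se, uint64_t unit_weight)
{
	return se->slice * unit_weight / se->weight;
}

static void set_deadline(struct entity *se, uint64_t unit_weight)
{
	se->deadline = se->vruntime + calc_vslice(se, unit_weight);
}

/* Eligible: the entity has not already run ahead of the weighted average. */
static int eligible(const struct entity *se, uint64_t avg_vruntime)
{
	return se->vruntime <= avg_vruntime;
}

/* Pick the eligible entity with the earliest virtual deadline. */
static struct entity *pick_eevdf(struct entity *rq, size_t nr,
				 uint64_t avg_vruntime)
{
	struct entity *best = NULL;

	for (size_t i = 0; i < nr; i++) {
		if (!eligible(&rq[i], avg_vruntime))
			continue;
		if (!best || rq[i].deadline < best->deadline)
			best = &rq[i];
	}
	return best;
}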

19 files changed (+1220, -913 lines)

Documentation/scheduler/sched-design-CFS.rst

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
 way the previous scheduler had, and has no heuristics whatsoever. There is
 only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
 
-   /sys/kernel/debug/sched/min_granularity_ns
+   /sys/kernel/debug/sched/base_slice_ns
 
 which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
 "server" (i.e., good batching) workloads. It defaults to a setting suitable

MAINTAINERS

Lines changed: 1 addition & 0 deletions
@@ -17057,6 +17057,7 @@ F: drivers/net/ppp/pptp.c
 PRESSURE STALL INFORMATION (PSI)
 M:	Johannes Weiner <[email protected]>
 M:	Suren Baghdasaryan <[email protected]>
+R:	Peter Ziljstra <[email protected]>
 S:	Maintained
 F:	include/linux/psi*
 F:	kernel/sched/psi.c

arch/x86/kernel/smpboot.c

Lines changed: 3 additions & 8 deletions
@@ -624,14 +624,9 @@ static void __init build_sched_topology(void)
 	};
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
-	/*
-	 * For now, skip the cluster domain on Hybrid.
-	 */
-	if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
-		x86_topology[i++] = (struct sched_domain_topology_level){
-			cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
-		};
-	}
+	x86_topology[i++] = (struct sched_domain_topology_level){
+		cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
+	};
 #endif
 #ifdef CONFIG_SCHED_MC
 	x86_topology[i++] = (struct sched_domain_topology_level){

include/linux/cgroup-defs.h

Lines changed: 2 additions & 0 deletions
@@ -661,6 +661,8 @@ struct cgroup_subsys {
 	void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
 	int (*css_extra_stat_show)(struct seq_file *seq,
 				   struct cgroup_subsys_state *css);
+	int (*css_local_stat_show)(struct seq_file *seq,
+				   struct cgroup_subsys_state *css);
 
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
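
A hedged sketch of how a controller could wire up the new callback; demo_cgrp_subsys, demo_css_local_stat_show and the emitted key are hypothetical -- only the css_local_stat_show field itself comes from this change. The intent (see the kernel/cgroup/cgroup.c change below) is to emit statistics that are local to the cgroup rather than hierarchically summed into the parent.

#include <linux/cgroup-defs.h>
#include <linux/seq_file.h>

/* Hypothetical controller-side statistic -- not part of this merge. */
static int demo_css_local_stat_show(struct seq_file *sf,
				    struct cgroup_subsys_state *css)
{
	/* Emit values that apply to this cgroup only, not to its subtree. */
	seq_printf(sf, "demo_local_events %llu\n", 0ULL);
	return 0;
}

struct cgroup_subsys demo_cgrp_subsys = {
	/* ...the usual css_alloc/css_free callbacks would go here... */
	.css_local_stat_show	= demo_css_local_stat_show,
};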

include/linux/rbtree_augmented.h

Lines changed: 26 additions & 0 deletions
@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node,
 	rb_insert_augmented(node, &root->rb_root, augment);
 }
 
+static __always_inline struct rb_node *
+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
+			bool (*less)(struct rb_node *, const struct rb_node *),
+			const struct rb_augment_callbacks *augment)
+{
+	struct rb_node **link = &tree->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	bool leftmost = true;
+
+	while (*link) {
+		parent = *link;
+		if (less(node, parent)) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = false;
+		}
+	}
+
+	rb_link_node(node, parent, link);
+	augment->propagate(parent, NULL); /* suboptimal */
+	rb_insert_augmented_cached(node, tree, leftmost, augment);
+
+	return leftmost ? node : NULL;
+}
+
 /*
  * Template for declaring augmented rbtree callbacks (generic case)
 *
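
A hedged usage sketch for the new helper, under assumed names (demo_node, demo_less and friends are illustrative, not from this commit): nodes are ordered by key, and RB_DECLARE_CALLBACKS_MAX from the same header maintains the maximum prio over each subtree -- the same augmented, leftmost-cached tree shape the EEVDF pick relies on for min_deadline.

#include <linux/rbtree_augmented.h>
#include <linux/types.h>

struct demo_node {
	struct rb_node	node;
	u64		key;		/* sort key */
	u64		prio;		/* per-node value */
	u64		subtree_max;	/* max prio over this subtree */
};

static inline u64 demo_prio(struct demo_node *dn)
{
	return dn->prio;
}

RB_DECLARE_CALLBACKS_MAX(static, demo_augment_cb,
			 struct demo_node, node, u64, subtree_max, demo_prio);

static inline bool demo_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct demo_node, node)->key <
	       rb_entry(b, struct demo_node, node)->key;
}

static void demo_insert(struct demo_node *dn, struct rb_root_cached *tree)
{
	dn->subtree_max = dn->prio;
	rb_add_augmented_cached(&dn->node, tree, demo_less, &demo_augment_cb);
}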

include/linux/sched.h

Lines changed: 14 additions & 7 deletions
@@ -75,14 +75,14 @@ struct user_event_mm;
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
  *
- * We have two separate sets of flags: task->state
+ * We have two separate sets of flags: task->__state
  * is about runnability, while task->exit_state are
  * about the task exiting. Confusing, but this way
  * modifying one set can't modify the other one by
  * mistake.
  */
 
-/* Used in tsk->state: */
+/* Used in tsk->__state: */
 #define TASK_RUNNING			0x00000000
 #define TASK_INTERRUPTIBLE		0x00000001
 #define TASK_UNINTERRUPTIBLE		0x00000002
@@ -92,7 +92,7 @@ struct user_event_mm;
 #define EXIT_DEAD			0x00000010
 #define EXIT_ZOMBIE			0x00000020
 #define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
-/* Used in tsk->state again: */
+/* Used in tsk->__state again: */
 #define TASK_PARKED			0x00000040
 #define TASK_DEAD			0x00000080
 #define TASK_WAKEKILL			0x00000100
@@ -173,7 +173,7 @@ struct user_event_mm;
 #endif
 
 /*
- * set_current_state() includes a barrier so that the write of current->state
+ * set_current_state() includes a barrier so that the write of current->__state
  * is correctly serialised wrt the caller's subsequent test of whether to
  * actually sleep:
  *
@@ -196,9 +196,9 @@ struct user_event_mm;
  *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
  *
  * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
- * accessing p->state.
+ * accessing p->__state.
  *
- * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
+ * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
  * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
  * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
  *
@@ -549,13 +549,18 @@ struct sched_entity {
 	/* For load-balancing: */
 	struct load_weight		load;
 	struct rb_node			run_node;
+	u64				deadline;
+	u64				min_deadline;
+
 	struct list_head		group_node;
 	unsigned int			on_rq;
 
 	u64				exec_start;
 	u64				sum_exec_runtime;
-	u64				vruntime;
 	u64				prev_sum_exec_runtime;
+	u64				vruntime;
+	s64				vlag;
+	u64				slice;
 
 	u64				nr_migrations;
 
@@ -2433,9 +2438,11 @@ extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
 extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
 				unsigned long uaddr);
+extern int sched_core_idle_cpu(int cpu);
 #else
 static inline void sched_core_free(struct task_struct *tsk) { }
 static inline void sched_core_fork(struct task_struct *p) { }
+static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
 #endif
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);

include/linux/sched/task.h

Lines changed: 37 additions & 1 deletion
@@ -118,11 +118,47 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
 }
 
 extern void __put_task_struct(struct task_struct *t);
+extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
 
 static inline void put_task_struct(struct task_struct *t)
 {
-	if (refcount_dec_and_test(&t->usage))
+	if (!refcount_dec_and_test(&t->usage))
+		return;
+
+	/*
+	 * In !RT, it is always safe to call __put_task_struct().
+	 * Under RT, we can only call it in preemptible context.
+	 */
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+		static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);
+
+		lock_map_acquire_try(&put_task_map);
 		__put_task_struct(t);
+		lock_map_release(&put_task_map);
+		return;
+	}
+
+	/*
+	 * under PREEMPT_RT, we can't call put_task_struct
+	 * in atomic context because it will indirectly
+	 * acquire sleeping locks.
+	 *
+	 * call_rcu() will schedule delayed_put_task_struct_rcu()
+	 * to be called in process context.
+	 *
+	 * __put_task_struct() is called when
+	 * refcount_dec_and_test(&t->usage) succeeds.
+	 *
+	 * This means that it can't "conflict" with
+	 * put_task_struct_rcu_user() which abuses ->rcu the same
+	 * way; rcu_users has a reference so task->usage can't be
+	 * zero after rcu_users 1 -> 0 transition.
+	 *
+	 * delayed_free_task() also uses ->rcu, but it is only called
+	 * when it fails to fork a process. Therefore, there is no
+	 * way it can conflict with put_task_struct().
+	 */
+	call_rcu(&t->rcu, __put_task_struct_rcu_cb);
 }
 
 DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))
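
A hedged illustration of what the new path permits: dropping a task reference from atomic context, here a timer callback (demo_watch and demo_timer_fn are hypothetical names). On PREEMPT_RT the final __put_task_struct() is now deferred to the RCU callback added in kernel/fork.c below, instead of being called with preemption disabled.

#include <linux/sched/task.h>
#include <linux/timer.h>

struct demo_watch {
	struct timer_list	timer;
	struct task_struct	*task;	/* holds a reference */
};

static void demo_timer_fn(struct timer_list *t)
{
	struct demo_watch *dw = from_timer(dw, t, timer);

	/*
	 * Timer callbacks run in atomic context; if this drops the last
	 * reference on PREEMPT_RT, __put_task_struct() is deferred via
	 * call_rcu() rather than being invoked here directly.
	 */
	put_task_struct(dw->task);
}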

init/Kconfig

Lines changed: 1 addition & 0 deletions
@@ -629,6 +629,7 @@ config TASK_IO_ACCOUNTING
 
 config PSI
 	bool "Pressure stall information tracking"
+	select KERNFS
 	help
 	  Collect metrics that indicate how overcommitted the CPU, memory,
 	  and IO capacity are in the system.

kernel/cgroup/cgroup.c

Lines changed: 34 additions & 0 deletions
@@ -3685,6 +3685,36 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 	return ret;
 }
 
+static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq,
+						 struct cgroup *cgrp, int ssid)
+{
+	struct cgroup_subsys *ss = cgroup_subsys[ssid];
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	if (!ss->css_local_stat_show)
+		return 0;
+
+	css = cgroup_tryget_css(cgrp, ss);
+	if (!css)
+		return 0;
+
+	ret = ss->css_local_stat_show(seq, css);
+	css_put(css);
+	return ret;
+}
+
+static int cpu_local_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+	int ret = 0;
+
+#ifdef CONFIG_CGROUP_SCHED
+	ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+	return ret;
+}
+
 #ifdef CONFIG_PSI
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
@@ -5235,6 +5265,10 @@ static struct cftype cgroup_base_files[] = {
 		.name = "cpu.stat",
 		.seq_show = cpu_stat_show,
 	},
+	{
+		.name = "cpu.stat.local",
+		.seq_show = cpu_local_stat_show,
+	},
 	{ }	/* terminate */
 };
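
A small userspace sketch of consuming the new per-cgroup file; the path assumes cgroup2 is mounted at /sys/fs/cgroup with a child group named "demo", and the keys printed depend on which controllers implement css_local_stat_show.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/demo/cpu.stat.local", "r");
	char line[256];

	if (!f) {
		perror("cpu.stat.local");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* dump whatever key/value pairs exist */
	fclose(f);
	return 0;
}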

kernel/fork.c

Lines changed: 8 additions & 0 deletions
@@ -985,6 +985,14 @@ void __put_task_struct(struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(__put_task_struct);
 
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+	struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+	__put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
 void __init __weak arch_task_cache_init(void) { }
 
 /*
