
Commit fc661f2

Merge tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Borislav Petkov:

 - Avoid touching ~100 config files in order to be able to select the
   preemption model

 - Clear cluster CPU masks too, on the CPU unplug path

 - Prevent use-after-free in cfs

 - Prevent a race condition when updating CPU cache domains

 - Factor out the common part of smp_prepare_cpus() into a shared helper
   which can be called by both baremetal and Xen, in order to fix booting
   of Xen PV guests

* tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  preempt: Restore preemption model selection configs
  arch_topology: Fix missing clear cluster_cpumask in remove_cpu_topology()
  sched/fair: Prevent dead task groups from regaining cfs_rq's
  sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain()
  x86/smp: Factor out parts of native_smp_prepare_cpus()
2 parents f7018be + a8b7691 commit fc661f2

13 files changed: +96 / -59 lines


arch/x86/include/asm/smp.h

Lines changed: 1 addition & 0 deletions
@@ -126,6 +126,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 
 void cpu_disable_common(void);
 void native_smp_prepare_boot_cpu(void);
+void smp_prepare_cpus_common(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void calculate_max_logical_packages(void);
 void native_smp_cpus_done(unsigned int max_cpus);

arch/x86/kernel/smpboot.c

Lines changed: 12 additions & 6 deletions
@@ -1350,12 +1350,7 @@ static void __init smp_get_logical_apicid(void)
         cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
 }
 
-/*
- * Prepare for SMP bootup.
- * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
- * for common interface support.
- */
-void __init native_smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus_common(void)
 {
         unsigned int i;
 
@@ -1386,6 +1381,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
         set_sched_topology(x86_topology);
 
         set_cpu_sibling_map(0);
+}
+
+/*
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support.
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+        smp_prepare_cpus_common();
+
         init_freq_invariance(false, false);
         smp_sanity_check();
 

arch/x86/xen/smp_pv.c

Lines changed: 2 additions & 10 deletions
@@ -225,7 +225,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void)
 static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
 {
         unsigned cpu;
-        unsigned int i;
 
         if (skip_ioapic_setup) {
                 char *m = (max_cpus == 0) ?
@@ -238,16 +237,9 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
         }
         xen_init_lock_cpu(0);
 
-        smp_store_boot_cpu_info();
-        cpu_data(0).x86_max_cores = 1;
+        smp_prepare_cpus_common();
 
-        for_each_possible_cpu(i) {
-                zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
-                zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-                zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
-                zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
-        }
-        set_cpu_sibling_map(0);
+        cpu_data(0).x86_max_cores = 1;
 
         speculative_store_bypass_ht_init();
 
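Taken together, the smpboot.c and xen/smp_pv.c hunks above funnel both SMP bring-up paths through the new shared helper. The sketch below is for orientation only — it is not literal kernel code; only the function names that appear in the hunks are taken from the commit, and the comments paraphrase what the helper is assumed to cover.

/* Simplified sketch of the resulting call flow (illustrative only). */

/* Shared bring-up work assumed to live in the helper: boot CPU info,
 * per-CPU sibling/core/die/LLC cpumask allocation, topology registration,
 * sibling map for CPU 0. */
void __init smp_prepare_cpus_common(void);

void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
        smp_prepare_cpus_common();
        /* bare-metal-only work continues: freq invariance, sanity checks, ... */
}

static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
        /* Xen-specific setup ... */
        smp_prepare_cpus_common();      /* replaces the open-coded cpumask setup */
        cpu_data(0).x86_max_cores = 1;
        /* ... */
}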

drivers/base/arch_topology.c

Lines changed: 2 additions & 0 deletions
@@ -677,6 +677,8 @@ void remove_cpu_topology(unsigned int cpu)
                 cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
         for_each_cpu(sibling, topology_sibling_cpumask(cpu))
                 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+        for_each_cpu(sibling, topology_cluster_cpumask(cpu))
+                cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling));
         for_each_cpu(sibling, topology_llc_cpumask(cpu))
                 cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));
 

include/linux/kernel.h

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@
 struct completion;
 struct user;
 
-#ifdef CONFIG_PREEMPT_VOLUNTARY
+#ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD
 
 extern int __cond_resched(void);
 # define might_resched() __cond_resched()

include/linux/vermagic.h

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 #else
 #define MODULE_VERMAGIC_SMP ""
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_BUILD
 #define MODULE_VERMAGIC_PREEMPT "preempt "
 #elif defined(CONFIG_PREEMPT_RT)
 #define MODULE_VERMAGIC_PREEMPT "preempt_rt "

init/Makefile

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ $(obj)/version.o: include/generated/compile.h
 quiet_cmd_compile.h = CHK     $@
       cmd_compile.h = \
         $(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
-        "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \
+        "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT_BUILD)" \
         "$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)"
 
 include/generated/compile.h: FORCE

kernel/Kconfig.preempt

Lines changed: 21 additions & 21 deletions
@@ -1,12 +1,23 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+config PREEMPT_NONE_BUILD
+        bool
+
+config PREEMPT_VOLUNTARY_BUILD
+        bool
+
+config PREEMPT_BUILD
+        bool
+        select PREEMPTION
+        select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
 choice
         prompt "Preemption Model"
-        default PREEMPT_NONE_BEHAVIOUR
+        default PREEMPT_NONE
 
-config PREEMPT_NONE_BEHAVIOUR
+config PREEMPT_NONE
         bool "No Forced Preemption (Server)"
-        select PREEMPT_NONE if !PREEMPT_DYNAMIC
+        select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
         help
           This is the traditional Linux preemption model, geared towards
           throughput. It will still provide good latencies most of the
@@ -18,10 +29,10 @@ config PREEMPT_NONE_BEHAVIOUR
           raw processing power of the kernel, irrespective of scheduling
           latencies.
 
-config PREEMPT_VOLUNTARY_BEHAVIOUR
+config PREEMPT_VOLUNTARY
         bool "Voluntary Kernel Preemption (Desktop)"
         depends on !ARCH_NO_PREEMPT
-        select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC
+        select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
         help
           This option reduces the latency of the kernel by adding more
           "explicit preemption points" to the kernel code. These new
@@ -37,10 +48,10 @@ config PREEMPT_VOLUNTARY_BEHAVIOUR
 
           Select this if you are building a kernel for a desktop system.
 
-config PREEMPT_BEHAVIOUR
+config PREEMPT
         bool "Preemptible Kernel (Low-Latency Desktop)"
         depends on !ARCH_NO_PREEMPT
-        select PREEMPT
+        select PREEMPT_BUILD
         help
           This option reduces the latency of the kernel by making
           all kernel code (that is not executing in a critical section)
@@ -58,7 +69,7 @@ config PREEMPT_BEHAVIOUR
 
 config PREEMPT_RT
         bool "Fully Preemptible Kernel (Real-Time)"
-        depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC
+        depends on EXPERT && ARCH_SUPPORTS_RT
         select PREEMPTION
         help
           This option turns the kernel into a real-time kernel by replacing
@@ -75,17 +86,6 @@ config PREEMPT_RT
 
 endchoice
 
-config PREEMPT_NONE
-        bool
-
-config PREEMPT_VOLUNTARY
-        bool
-
-config PREEMPT
-        bool
-        select PREEMPTION
-        select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
-
 config PREEMPT_COUNT
         bool
 
@@ -95,8 +95,8 @@ config PREEMPTION
 
 config PREEMPT_DYNAMIC
         bool "Preemption behaviour defined on boot"
-        depends on HAVE_PREEMPT_DYNAMIC
-        select PREEMPT
+        depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+        select PREEMPT_BUILD
         default y
         help
           This option allows to define the preemption model on the kernel
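For orientation only (this fragment is not part of the commit): given the select/depends relationships in the hunks above, a kernel that keeps the default "No Forced Preemption" choice with PREEMPT_DYNAMIC enabled would be expected to end up with roughly the following .config symbols:

# Hypothetical resulting .config fragment (assumes HAVE_PREEMPT_DYNAMIC=y
# and the default preemption choice):
CONFIG_PREEMPT_NONE=y             # user-visible choice: default model at boot
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT is not set
CONFIG_PREEMPT_DYNAMIC=y
CONFIG_PREEMPT_BUILD=y            # build-time symbol, selected by PREEMPT_DYNAMIC
CONFIG_PREEMPTION=y               # selected by PREEMPT_BUILD

On such a kernel the effective model can still be chosen at boot via the preempt= parameter (none, voluntary or full), which is the __setup() hook visible in the kernel/sched/core.c hunk further down.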

kernel/sched/autogroup.c

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ static inline void autogroup_destroy(struct kref *kref)
         ag->tg->rt_se = NULL;
         ag->tg->rt_rq = NULL;
 #endif
-        sched_offline_group(ag->tg);
+        sched_release_group(ag->tg);
         sched_destroy_group(ag->tg);
 }
 

kernel/sched/core.c

Lines changed: 41 additions & 12 deletions
@@ -3726,6 +3726,9 @@ void wake_up_if_idle(int cpu)
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
+        if (this_cpu == that_cpu)
+                return true;
+
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
@@ -6625,13 +6628,13 @@ __setup("preempt=", setup_preempt_mode);
 static void __init preempt_dynamic_init(void)
 {
         if (preempt_dynamic_mode == preempt_dynamic_undefined) {
-                if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) {
+                if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
                         sched_dynamic_update(preempt_dynamic_none);
-                } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) {
+                } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
                         sched_dynamic_update(preempt_dynamic_voluntary);
                 } else {
                         /* Default static call setting, nothing to do */
-                        WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR));
+                        WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
                         preempt_dynamic_mode = preempt_dynamic_full;
                         pr_info("Dynamic Preempt: full\n");
                 }
@@ -9716,6 +9719,22 @@ static void sched_free_group(struct task_group *tg)
         kmem_cache_free(task_group_cache, tg);
 }
 
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+        sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+        unregister_fair_sched_group(tg);
+        unregister_rt_sched_group(tg);
+        /*
+         * We have to wait for yet another RCU grace period to expire, as
+         * print_cfs_stats() might run concurrently.
+         */
+        call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -9759,25 +9778,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
 {
         /* Now it should be safe to free those cfs_rqs: */
-        sched_free_group(container_of(rhp, struct task_group, rcu));
+        sched_unregister_group(container_of(rhp, struct task_group, rcu));
 }
 
 void sched_destroy_group(struct task_group *tg)
 {
         /* Wait for possible concurrent references to cfs_rqs complete: */
-        call_rcu(&tg->rcu, sched_free_group_rcu);
+        call_rcu(&tg->rcu, sched_unregister_group_rcu);
 }
 
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
 {
         unsigned long flags;
 
-        /* End participation in shares distribution: */
-        unregister_fair_sched_group(tg);
-
+        /*
+         * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+         * sched_cfs_period_timer()).
+         *
+         * For this to be effective, we have to wait for all pending users of
+         * this task group to leave their RCU critical section to ensure no new
+         * user will see our dying task group any more. Specifically ensure
+         * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+         *
+         * We therefore defer calling unregister_fair_sched_group() to
+         * sched_unregister_group() which is guarantied to get called only after the
+         * current RCU grace period has expired.
+         */
         spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
@@ -9896,7 +9925,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
         struct task_group *tg = css_tg(css);
 
-        sched_offline_group(tg);
+        sched_release_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -9906,7 +9935,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
         /*
         * Relies on the RCU grace period between css_released() and this.
         */
-        sched_free_group(tg);
+        sched_unregister_group(tg);
 }
 
 /*
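The core.c changes above, together with the fair.c and rt.c hunks below, split task-group teardown into two RCU-separated stages. The comment-style sketch below summarizes the resulting call ordering; it is illustrative only, assembled from the functions shown in the hunks rather than quoted from the sources.

/*
 * cpu_cgroup_css_released()
 *   -> sched_release_group(tg)        // only unlink tg from the tg tree
 *
 * ... RCU grace period (relied upon between css_released() and css_free()) ...
 *
 * cpu_cgroup_css_free()
 *   -> sched_unregister_group(tg)     // unregister_fair/rt_sched_group(),
 *                                     // then call_rcu(sched_free_group_rcu)
 *
 * ... another RCU grace period ...
 *
 * sched_free_group_rcu()
 *   -> sched_free_group(tg)           // finally free the cfs_rq's etc.
 *
 * Walkers such as tg_unthrottle_up() can therefore no longer re-add decayed
 * cfs_rq's to a task group whose structures are about to be freed.
 */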

kernel/sched/fair.c

Lines changed: 2 additions & 2 deletions
@@ -11456,8 +11456,6 @@ void free_fair_sched_group(struct task_group *tg)
 {
         int i;
 
-        destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
@@ -11534,6 +11532,8 @@ void unregister_fair_sched_group(struct task_group *tg)
         struct rq *rq;
         int cpu;
 
+        destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
         for_each_possible_cpu(cpu) {
                 if (tg->se[cpu])
                         remove_entity_load_avg(tg->se[cpu]);

kernel/sched/rt.c

Lines changed: 9 additions & 3 deletions
@@ -137,13 +137,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
         return rt_rq->rq;
 }
 
-void free_rt_sched_group(struct task_group *tg)
+void unregister_rt_sched_group(struct task_group *tg)
 {
-        int i;
-
         if (tg->rt_se)
                 destroy_rt_bandwidth(&tg->rt_bandwidth);
 
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+        int i;
+
         for_each_possible_cpu(i) {
                 if (tg->rt_rq)
                         kfree(tg->rt_rq[i]);
@@ -250,6 +254,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
         return &rq->rt;
 }
 
+void unregister_rt_sched_group(struct task_group *tg) { }
+
 void free_rt_sched_group(struct task_group *tg) { }
 
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

kernel/sched/sched.h

Lines changed: 2 additions & 1 deletion
@@ -488,6 +488,7 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
+extern void unregister_rt_sched_group(struct task_group *tg);
 extern void free_rt_sched_group(struct task_group *tg);
 extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
@@ -503,7 +504,7 @@ extern struct task_group *sched_create_group(struct task_group *parent);
 extern void sched_online_group(struct task_group *tg,
                                struct task_group *parent);
 extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
 
 extern void sched_move_task(struct task_struct *tsk);
 
