
Commit 13c8da5

Merge branch 'sched/core' into core/mm
Pull the migrate disable mechanics which is a prerequisite for preemptible kmap_local().
2 parents a0e1699 + 74d862b commit 13c8da5

35 files changed: +1657 / -445 lines

Documentation/scheduler/sched-domains.rst

Lines changed: 11 additions & 15 deletions
@@ -65,21 +65,17 @@ of the SMP domain will span the entire machine, with each group having the
 cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
 might have just one domain covering its one NUMA level.
 
-The implementor should read comments in include/linux/sched.h:
-struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
-the specifics and what to tune.
+The implementor should read comments in include/linux/sched/sd_flags.h:
+SD_* to get an idea of the specifics and what to tune for the SD flags
+of a sched_domain.
 
-Architectures may retain the regular override the default SD_*_INIT flags
-while using the generic domain builder in kernel/sched/core.c if they wish to
-retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
-can be done by #define'ing ARCH_HASH_SCHED_TUNE.
-
-Alternatively, the architecture may completely override the generic domain
-builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
-arch_init_sched_domains function. This function will attach domains to all
-CPUs using cpu_attach_domain.
+Architectures may override the generic domain builder and the default SD flags
+for a given topology level by creating a sched_domain_topology_level array and
+calling set_sched_topology() with this array as the parameter.
 
 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
-which should catch most possible errors (described above). It also prints out
-the domain structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
+tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
+knob. This enables an error checking parse of the sched domains which should
+catch most possible errors (described above). It also prints out the domain
+structure in a visual format.
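For context, a minimal sketch of what the updated documentation describes: an architecture replacing the default topology table via set_sched_topology(). It is modeled on the default SMT/MC/DIE table in kernel/sched/topology.c; the my_arch_* names are illustrative and not part of this commit.

    /* Illustrative only: an architecture-provided topology table. */
    static struct sched_domain_topology_level my_arch_topology[] = {
    #ifdef CONFIG_SCHED_SMT
            { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
    #endif
    #ifdef CONFIG_SCHED_MC
            { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
    #endif
            { cpu_cpu_mask, SD_INIT_NAME(DIE) },
            { NULL, },
    };

    static void __init my_arch_init_topology(void)
    {
            /* Override the generic builder's default SMT/MC/DIE levels. */
            set_sched_topology(my_arch_topology);
    }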

arch/arm64/kernel/topology.c

Lines changed: 10 additions & 0 deletions
@@ -213,6 +213,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
 
 static int __init init_amu_fie(void)
 {
+        bool invariance_status = topology_scale_freq_invariant();
         cpumask_var_t valid_cpus;
         bool have_policy = false;
         int ret = 0;
@@ -255,6 +256,15 @@ static int __init init_amu_fie(void)
         if (!topology_scale_freq_invariant())
                 static_branch_disable(&amu_fie_key);
 
+        /*
+         * Task scheduler behavior depends on frequency invariance support,
+         * either cpufreq or counter driven. If the support status changes as
+         * a result of counter initialisation and use, retrigger the build of
+         * scheduling domains to ensure the information is propagated properly.
+         */
+        if (invariance_status != topology_scale_freq_invariant())
+                rebuild_sched_domains_energy();
+
 free_valid_mask:
         free_cpumask_var(valid_cpus);
 

fs/proc/array.c

Lines changed: 2 additions & 2 deletions
@@ -382,9 +382,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
         seq_printf(m, "Cpus_allowed:\t%*pb\n",
-                   cpumask_pr_args(task->cpus_ptr));
+                   cpumask_pr_args(&task->cpus_mask));
         seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-                   cpumask_pr_args(task->cpus_ptr));
+                   cpumask_pr_args(&task->cpus_mask));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)

include/linux/cpuhotplug.h

Lines changed: 1 addition & 0 deletions
@@ -152,6 +152,7 @@ enum cpuhp_state {
         CPUHP_AP_ONLINE,
         CPUHP_TEARDOWN_CPU,
         CPUHP_AP_ONLINE_IDLE,
+        CPUHP_AP_SCHED_WAIT_EMPTY,
         CPUHP_AP_SMPBOOT_THREADS,
         CPUHP_AP_X86_VDSO_VMA_ONLINE,
         CPUHP_AP_IRQ_AFFINITY_ONLINE,

include/linux/cpumask.h

Lines changed: 6 additions & 0 deletions
@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
         return cpumask_next_and(-1, src1p, src2p);
 }
 
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+        return cpumask_first(srcp);
+}
+
 #define for_each_cpu(cpu, mask) \
         for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask) \
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
                                const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
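A hedged usage sketch, not taken from this commit: cpumask_any_distribute() lets a caller spread successive picks across a mask instead of always taking the lowest-numbered CPU; on !SMP it degenerates to cpumask_first(), as the stub above shows. The pick_dest_cpu() helper is hypothetical.

    /* Hypothetical helper: pick a destination CPU out of @p's allowed mask. */
    static int pick_dest_cpu(struct task_struct *p)
    {
            int cpu = cpumask_any_distribute(&p->cpus_mask);

            return cpu < nr_cpu_ids ? cpu : -1;     /* -1: mask was empty */
    }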

include/linux/kernel.h

Lines changed: 14 additions & 7 deletions
@@ -204,6 +204,7 @@ extern int _cond_resched(void);
 extern void ___might_sleep(const char *file, int line, int preempt_offset);
 extern void __might_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_sleep(const char *file, int line, int preempt_offset);
+extern void __cant_migrate(const char *file, int line);
 
 /**
  * might_sleep - annotation for functions that can sleep
@@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
 # define cant_sleep() \
         do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
 # define sched_annotate_sleep() (current->task_state_change = 0)
+
+/**
+ * cant_migrate - annotation for functions that cannot migrate
+ *
+ * Will print a stack trace if executed in code which is migratable
+ */
+# define cant_migrate() \
+        do { \
+                if (IS_ENABLED(CONFIG_SMP)) \
+                        __cant_migrate(__FILE__, __LINE__); \
+        } while (0)
+
 /**
  * non_block_start - annotate the start of section where sleeping is prohibited
  *
@@ -251,20 +264,14 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
                                    int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
 # define cant_sleep() do { } while (0)
+# define cant_migrate() do { } while (0)
 # define sched_annotate_sleep() do { } while (0)
 # define non_block_start() do { } while (0)
 # define non_block_end() do { } while (0)
 #endif
 
 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
 
-#ifndef CONFIG_PREEMPT_RT
-# define cant_migrate() cant_sleep()
-#else
-  /* Placeholder for now */
-# define cant_migrate() do { } while (0)
-#endif
-
 /**
  * abs - return absolute value of an argument
  * @x: the value. If it is unsigned type, it is converted to signed type first.
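A hedged example of where the new cant_migrate() annotation fits (the foo_* names are made up, not part of this commit): a helper that touches per-CPU state and therefore must not migrate, but, unlike cant_sleep(), may still be preempted, which matters on PREEMPT_RT. Serialization against other tasks on the same CPU (e.g. a local_lock) is assumed and omitted here.

    struct foo_stats {
            u64 packets;
            u64 bytes;
    };
    static DEFINE_PER_CPU(struct foo_stats, foo_stats);

    /*
     * Hypothetical helper: caller must have migration disabled (e.g. via
     * migrate_disable()), so both updates hit the same CPU's statistics.
     */
    static void foo_account(u64 bytes)
    {
            struct foo_stats *stats;

            cant_migrate();                 /* complain if migration is possible */

            stats = this_cpu_ptr(&foo_stats);
            stats->packets++;
            stats->bytes += bytes;
    }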

include/linux/preempt.h

Lines changed: 60 additions & 23 deletions
@@ -322,34 +322,71 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 
 #endif
 
-/**
- * migrate_disable - Prevent migration of the current task
+#ifdef CONFIG_SMP
+
+/*
+ * Migrate-Disable and why it is undesired.
  *
- * Maps to preempt_disable() which also disables preemption. Use
- * migrate_disable() to annotate that the intent is to prevent migration,
- * but not necessarily preemption.
+ * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
  *
- * Can be invoked nested like preempt_disable() and needs the corresponding
- * number of migrate_enable() invocations.
- */
-static __always_inline void migrate_disable(void)
-{
-        preempt_disable();
-}
-
-/**
- * migrate_enable - Allow migration of the current task
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority tasks; which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing it's available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
+ *
+ *
+ * The reason we have it anyway.
  *
- * Counterpart to migrate_disable().
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migirate_disable() to restore this implicit
+ * assumption.
  *
- * As migrate_disable() can be invoked nested, only the outermost invocation
- * reenables migration.
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on abritrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depends on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
  *
- * Currently mapped to preempt_enable().
  */
-static __always_inline void migrate_enable(void)
-{
-        preempt_enable();
-}
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else
+
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+
+#endif /* CONFIG_SMP */
 
 #endif /* __LINUX_PREEMPT_H */
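Continuing the hypothetical foo_account() sketch from the include/linux/kernel.h hunk above: a caller would wrap the per-CPU access in migrate_disable()/migrate_enable() rather than preempt_disable()/preempt_enable(), staying preemptible as the new comment block explains. Illustrative only, and again any serialization against other tasks on the same CPU (e.g. a local_lock) is omitted.

    /* Hypothetical caller of foo_account() from the kernel.h example above. */
    static void foo_add_bytes(u64 bytes)
    {
            /*
             * Unlike preempt_disable(), this section stays preemptible;
             * it only guarantees the task does not migrate to another CPU,
             * so the per-CPU pointer taken inside foo_account() stays valid.
             */
            migrate_disable();
            foo_account(bytes);
            migrate_enable();
    }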

include/linux/sched.h

Lines changed: 5 additions & 0 deletions
@@ -714,6 +714,11 @@ struct task_struct {
         int                             nr_cpus_allowed;
         const cpumask_t                 *cpus_ptr;
         cpumask_t                       cpus_mask;
+        void                            *migration_pending;
+#ifdef CONFIG_SMP
+        unsigned short                  migration_disabled;
+#endif
+        unsigned short                  migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
         int                             rcu_read_lock_nesting;

include/linux/sched/hotplug.h

Lines changed: 2 additions & 0 deletions
@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty   NULL
 # define sched_cpu_dying        NULL
 #endif
 

include/linux/sched/mm.h

Lines changed: 5 additions & 0 deletions
@@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 
 extern void membarrier_exec_mmap(struct mm_struct *mm);
 
+extern void membarrier_update_current_mm(struct mm_struct *next_mm);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
 }
+static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */

include/linux/sched/topology.h

Lines changed: 8 additions & 0 deletions
@@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 
 #endif /* !CONFIG_SMP */
 
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+extern void rebuild_sched_domains_energy(void);
+#else
+static inline void rebuild_sched_domains_energy(void)
+{
+}
+#endif
+
 #ifndef arch_scale_cpu_capacity
 /**
  * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.

include/linux/stop_machine.h

Lines changed: 5 additions & 0 deletions
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
         struct list_head        list;   /* cpu_stopper->works */
         cpu_stop_fn_t           fn;
+        unsigned long           caller;
         void                    *arg;
         struct cpu_stop_done    *done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
 
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else /* CONFIG_SMP */
 
 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
         return false;
 }
 
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif /* CONFIG_SMP */
 
 /*

include/uapi/linux/sched/types.h

Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,8 @@ struct sched_param {
  * on a CPU with a capacity big enough to fit the specified value.
  * A task with a max utilization value smaller than 1024 is more likely
  * scheduled on a CPU with no more capacity than the specified value.
+ *
+ * A task utilization boundary can be reset by setting the attribute to -1.
  */
 struct sched_attr {
         __u32 size;
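Since the new comment documents uAPI behaviour, a hedged userspace sketch may help: resetting a task's utilization clamps by writing -1. The struct layout and flag values are copied from the kernel uAPI headers (glibc provides no sched_setattr() wrapper); reset_util_clamp() is a hypothetical helper and assumes a kernel that implements the -1 reset semantics described above.

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    /* Local mirror of the kernel's struct sched_attr (uAPI). */
    struct sched_attr {
            uint32_t size;
            uint32_t sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;
            uint32_t sched_priority;
            uint64_t sched_runtime;
            uint64_t sched_deadline;
            uint64_t sched_period;
            uint32_t sched_util_min;
            uint32_t sched_util_max;
    };

    #define SCHED_FLAG_KEEP_ALL           0x18    /* keep policy and params */
    #define SCHED_FLAG_UTIL_CLAMP_MIN     0x20
    #define SCHED_FLAG_UTIL_CLAMP_MAX     0x40

    static int reset_util_clamp(pid_t pid)
    {
            struct sched_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size           = sizeof(attr);
            attr.sched_flags    = SCHED_FLAG_KEEP_ALL |
                                  SCHED_FLAG_UTIL_CLAMP_MIN |
                                  SCHED_FLAG_UTIL_CLAMP_MAX;
            attr.sched_util_min = (uint32_t)-1;     /* -1: drop the min clamp */
            attr.sched_util_max = (uint32_t)-1;     /* -1: drop the max clamp */

            return syscall(SYS_sched_setattr, pid, &attr, 0);
    }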

kernel/cgroup/cpuset.c

Lines changed: 28 additions & 5 deletions
@@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  */
 static void rebuild_sched_domains_locked(void)
 {
+        struct cgroup_subsys_state *pos_css;
         struct sched_domain_attr *attr;
         cpumask_var_t *doms;
+        struct cpuset *cs;
         int ndoms;
 
         lockdep_assert_cpus_held();
         percpu_rwsem_assert_held(&cpuset_rwsem);
 
         /*
-         * We have raced with CPU hotplug. Don't do anything to avoid
+         * If we have raced with CPU hotplug, return early to avoid
          * passing doms with offlined cpu to partition_sched_domains().
-         * Anyways, hotplug work item will rebuild sched domains.
+         * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+         *
+         * With no CPUs in any subpartitions, top_cpuset's effective CPUs
+         * should be the same as the active CPUs, so checking only top_cpuset
+         * is enough to detect racing CPU offlines.
          */
         if (!top_cpuset.nr_subparts_cpus &&
             !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                 return;
 
-        if (top_cpuset.nr_subparts_cpus &&
-            !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
-                return;
+        /*
+         * With subpartition CPUs, however, the effective CPUs of a partition
+         * root should be only a subset of the active CPUs. Since a CPU in any
+         * partition root could be offlined, all must be checked.
+         */
+        if (top_cpuset.nr_subparts_cpus) {
+                rcu_read_lock();
+                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+                        if (!is_partition_root(cs)) {
+                                pos_css = css_rightmost_descendant(pos_css);
+                                continue;
+                        }
+                        if (!cpumask_subset(cs->effective_cpus,
+                                            cpu_active_mask)) {
+                                rcu_read_unlock();
+                                return;
+                        }
+                }
+                rcu_read_unlock();
+        }
 
         /* Generate domain masks and attrs */
         ndoms = generate_sched_domains(&doms, &attr);
