
Commit 0b3e9f3

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Remove NULL assignment of dattr_cur
  sched: Remove the last NULL entry from sched_feat_names
  sched: Make sched_feat_names const
  sched/rt: Fix SCHED_RR across cgroups
  sched: Move nr_cpus_allowed out of 'struct sched_rt_entity'
  sched: Make sure to not re-read variables after validation
  sched: Fix SD_OVERLAP
  sched: Don't try allocating memory from offline nodes
  sched/nohz: Fix rq->cpu_load calculations some more
  sched/x86: Use cpu_llc_shared_mask(cpu) for coregroup_mask
2 parents 99becf1 + 6a4c96e commit 0b3e9f3

8 files changed (+120, -59 lines)


arch/blackfin/kernel/process.c

Lines changed: 1 addition & 1 deletion
@@ -173,7 +173,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs)
 	unsigned long newsp;
 
 #ifdef __ARCH_SYNC_CORE_DCACHE
-	if (current->rt.nr_cpus_allowed == num_possible_cpus())
+	if (current->nr_cpus_allowed == num_possible_cpus())
 		set_cpus_allowed_ptr(current, cpumask_of(smp_processor_id()));
 #endif

arch/x86/kernel/smpboot.c

Lines changed: 1 addition & 9 deletions
@@ -410,15 +410,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 /* maps the cpu to the sched domain representing multi-core */
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	/*
-	 * For perf, we return last level cache shared map.
-	 * And for power savings, we return cpu_core_map
-	 */
-	if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
-		return cpu_core_mask(cpu);
-	else
-		return cpu_llc_shared_mask(cpu);
+	return cpu_llc_shared_mask(cpu);
 }
 
 static void impress_friends(void)

include/linux/init_task.h

Lines changed: 1 addition & 1 deletion
@@ -149,6 +149,7 @@ extern struct cred init_cred;
 	.normal_prio	= MAX_PRIO-20,				\
 	.policy		= SCHED_NORMAL,				\
 	.cpus_allowed	= CPU_MASK_ALL,				\
+	.nr_cpus_allowed= NR_CPUS,				\
 	.mm		= NULL,					\
 	.active_mm	= &init_mm,				\
 	.se		= {					\
@@ -157,7 +158,6 @@ extern struct cred init_cred;
 	.rt		= {					\
 		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list), \
 		.time_slice	= RR_TIMESLICE,			\
-		.nr_cpus_allowed = NR_CPUS,			\
 	},							\
 	.tasks	= LIST_HEAD_INIT(tsk.tasks),			\
 	INIT_PUSHABLE_TASKS(tsk)				\

include/linux/sched.h

Lines changed: 2 additions & 1 deletion
@@ -145,6 +145,7 @@ extern unsigned long this_cpu_load(void);
 
 
 extern void calc_global_load(unsigned long ticks);
+extern void update_cpu_load_nohz(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
@@ -1187,7 +1188,6 @@ struct sched_rt_entity {
 	struct list_head run_list;
 	unsigned long timeout;
 	unsigned int time_slice;
-	int nr_cpus_allowed;
 
 	struct sched_rt_entity *back;
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -1252,6 +1252,7 @@ struct task_struct {
 #endif
 
 	unsigned int policy;
+	int nr_cpus_allowed;
 	cpumask_t cpus_allowed;
 
 #ifdef CONFIG_PREEMPT_RCU
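
Note on the nr_cpus_allowed move: the count of allowed CPUs now lives directly on task_struct rather than inside the RT-specific sched_rt_entity, so code outside the RT class can read it without reaching through p->rt. A minimal illustration of the new access pattern (hypothetical helper, not part of this commit):

	/* illustrative only -- hypothetical helper, not in this commit */
	static inline bool task_is_pinned(const struct task_struct *p)
	{
		return p->nr_cpus_allowed == 1;	/* was p->rt.nr_cpus_allowed */
	}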

kernel/sched/core.c

Lines changed: 51 additions & 17 deletions
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
 #define SCHED_FEAT(name, enabled)	\
 	#name ,
 
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
 #include "features.h"
-	NULL
 };
 
 #undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 	sched_avg_update(this_rq);
 }
 
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
 /*
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
 void update_idle_cpu_load(struct rq *this_rq)
 {
-	unsigned long curr_jiffies = jiffies;
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
 	unsigned long load = this_rq->load.weight;
 	unsigned long pending_updates;
 
 	/*
-	 * Bloody broken means of dealing with nohz, but better than nothing..
-	 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
-	 * update and see 0 difference the one time and 2 the next, even though
-	 * we ticked at roughtly the same rate.
-	 *
-	 * Hence we only use this from nohz_idle_balance() and skip this
-	 * nonsense when called from the scheduler_tick() since that's
-	 * guaranteed a stable rate.
+	 * bail if there's load or we're actually up-to-date.
 	 */
 	if (load || curr_jiffies == this_rq->last_load_update_tick)
 		return;
@@ -2546,13 +2552,39 @@ void update_idle_cpu_load(struct rq *this_rq)
 	__update_cpu_load(this_rq, load, pending_updates);
 }
 
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
 /*
  * Called from scheduler_tick()
  */
 static void update_cpu_load_active(struct rq *this_rq)
 {
 	/*
-	 * See the mess in update_idle_cpu_load().
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
 	__update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
 	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
 /*
@@ -5997,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
 		cpumask_or(covered, covered, sg_span);
 
-		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
 		atomic_inc(&sg->sgp->ref);
 
-		if (cpumask_test_cpu(cpu, sg_span))
+		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+			    cpumask_first(sg_span) == cpu) {
+			WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
 			groups = sg;
+		}
 
 		if (!first)
 			first = sg;
@@ -6403,7 +6438,7 @@ static void sched_init_numa(void)
 		return;
 
 	for (j = 0; j < nr_node_ids; j++) {
-		struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+		struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
 		if (!mask)
 			return;
 
@@ -6691,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
 	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
-	dattr_cur = NULL;
 	err = build_sched_domains(doms_cur[0], NULL);
 	register_sched_domain_sysctl();
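
The new comment block above documents why per-tick jiffies deltas cannot be trusted under nohz. A standalone userspace sketch of the effect it describes (illustrative only, not kernel code): a reader whose sampling drifts slightly relative to the CPU updating the counter can observe per-tick deltas of {0, 2} instead of {1, 1}, even though both tick at the same nominal rate.

#include <stdio.h>

int main(void)
{
	/*
	 * Counter values a drifting reader might observe at four consecutive
	 * ticks: the update for the second tick lands just after the sample.
	 */
	unsigned long samples[] = { 100, 100, 102, 103 };
	int i;

	for (i = 1; i < 4; i++)
		printf("delta at tick %d = %lu\n", i, samples[i] - samples[i - 1]);

	/* prints 0, 2, 1: the average rate is right, individual deltas are not */
	return 0;
}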

kernel/sched/fair.c

Lines changed: 32 additions & 10 deletions
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
 
-	if (p->rt.nr_cpus_allowed == 1)
+	if (p->nr_cpus_allowed == 1)
 		return prev_cpu;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 unsigned long scale_rt_power(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	u64 total, available;
+	u64 total, available, age_stamp, avg;
 
-	total = sched_avg_period() + (rq->clock - rq->age_stamp);
+	/*
+	 * Since we're reading these variables without serialization make sure
+	 * we read them once before doing sanity checks on them.
+	 */
+	age_stamp = ACCESS_ONCE(rq->age_stamp);
+	avg = ACCESS_ONCE(rq->rt_avg);
+
+	total = sched_avg_period() + (rq->clock - age_stamp);
 
-	if (unlikely(total < rq->rt_avg)) {
+	if (unlikely(total < avg)) {
 		/* Ensures that power won't end up being negative */
 		available = 0;
 	} else {
-		available = total - rq->rt_avg;
+		available = total - avg;
 	}
 
 	if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3574,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
 
 	power = 0;
 
-	group = child->groups;
-	do {
-		power += group->sgp->power;
-		group = group->next;
-	} while (group != child->groups);
+	if (child->flags & SD_OVERLAP) {
+		/*
+		 * SD_OVERLAP domains cannot assume that child groups
+		 * span the current group.
+		 */
+
+		for_each_cpu(cpu, sched_group_cpus(sdg))
+			power += power_of(cpu);
+	} else {
+		/*
+		 * !SD_OVERLAP domains can assume that child groups
+		 * span the current group.
+		 */
+
+		group = child->groups;
+		do {
+			power += group->sgp->power;
+			group = group->next;
+		} while (group != child->groups);
+	}
 
 	sdg->sgp->power = power;
 }
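
The scale_rt_power() hunk is the "sched: Make sure to not re-read variables after validation" fix: rq->age_stamp and rq->rt_avg are read without serialization, so each is loaded exactly once (via ACCESS_ONCE) before the total < avg check; otherwise the comparison could be done against one value and the subtraction against a newer one, underflowing the result. A minimal sketch of the same pattern outside the scheduler (the struct, names, and READ_ONCE_U64 stand-in are made up for the example):

#include <stdint.h>

/* stand-in for the kernel's ACCESS_ONCE(): force a single volatile load */
#define READ_ONCE_U64(x)	(*(volatile uint64_t *)&(x))

struct budget {
	uint64_t total;
	uint64_t used;	/* updated concurrently elsewhere */
};

static uint64_t budget_available(struct budget *b)
{
	uint64_t used = READ_ONCE_U64(b->used);	/* read once ... */

	/*
	 * Checking b->used directly and then subtracting it again would let
	 * the value change in between, so both the comparison and the
	 * subtraction use the local copy.
	 */
	if (b->total < used)			/* ... validate the copy ... */
		return 0;
	return b->total - used;			/* ... and reuse the same copy */
}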
