
Commit 25144ea

Merge tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer fixes from Borislav Petkov:

 - Reset hrtimers correctly when a CPU hotplug state traversal happens
   "half-ways" and leaves hrtimers not (re-)initialized properly

 - Annotate accesses to a timer group's ignore flag to prevent KCSAN from
   raising data_race warnings

 - Make sure timer group initialization is visible to timer tree walkers and
   avoid a hypothetical race

 - Fix another race between CPU hotplug and idle entry/exit where timers on a
   fully idle system are getting ignored

 - Fix a case where an ignored signal is still being handled which it
   shouldn't be

* tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  hrtimers: Handle CPU state correctly on hotplug
  timers/migration: Annotate accesses to ignore flag
  timers/migration: Enforce group initialization visibility to tree walkers
  timers/migration: Fix another race between hotplug and idle entry/exit
  signal/posixtimers: Handle ignore/blocked sequences correctly
2 parents: b031457 + 2f8dea1

File tree

5 files changed: 95 additions & 20 deletions

  include/linux/hrtimer.h
  kernel/cpu.c
  kernel/signal.c
  kernel/time/hrtimer.c
  kernel/time/timer_migration.c


include/linux/hrtimer.h

Lines changed: 1 addition & 0 deletions
@@ -386,6 +386,7 @@ extern void __init hrtimers_init(void);
 extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
+int hrtimers_cpu_starting(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 int hrtimers_cpu_dying(unsigned int cpu);
 #else

kernel/cpu.c

Lines changed: 1 addition & 1 deletion
@@ -2179,7 +2179,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
         },
         [CPUHP_AP_HRTIMERS_DYING] = {
                 .name                   = "hrtimers:dying",
-                .startup.single         = NULL,
+                .startup.single         = hrtimers_cpu_starting,
                 .teardown.single        = hrtimers_cpu_dying,
         },
         [CPUHP_AP_TICK_DYING] = {
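
For context: the change above pairs the hrtimers:dying teardown with a startup callback, so the per-CPU hrtimer state is re-initialized on every bring-up instead of only once in the prepare phase. Below is a minimal sketch of how a symmetric startup/teardown pair is registered for a dynamic hotplug state via cpuhp_setup_state(); the "demo:online" state name and the demo_* callbacks are hypothetical and not part of this commit.

#include <linux/cpuhotplug.h>
#include <linux/init.h>

/* Hypothetical bring-up callback: runs on each CPU coming online. */
static int demo_cpu_online(unsigned int cpu)
{
        /* (Re-)initialize per-CPU state here. */
        return 0;
}

/* Hypothetical teardown callback: runs when the CPU goes down. */
static int demo_cpu_offline(unsigned int cpu)
{
        /* Tear down per-CPU state here. */
        return 0;
}

static int __init demo_hotplug_init(void)
{
        /*
         * Registering both callbacks keeps the state symmetric: every pass
         * through the state on the way up re-runs the startup, so nothing
         * left behind by a previous teardown (or a rolled-back bring-up)
         * survives into the next onlining.
         */
        int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
                                    demo_cpu_online, demo_cpu_offline);
        return ret < 0 ? ret : 0;
}
late_initcall(demo_hotplug_init);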

kernel/signal.c

Lines changed: 28 additions & 9 deletions
@@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
 
         if (!list_empty(&q->list)) {
                 /*
-                 * If task group is exiting with the signal already pending,
-                 * wait for __exit_signal() to do its job. Otherwise if
-                 * ignored, it's not supposed to be queued. Try to survive.
+                 * The signal was ignored and blocked. The timer
+                 * expiry queued it because blocked signals are
+                 * queued independent of the ignored state.
+                 *
+                 * The unblocking set SIGPENDING, but the signal
+                 * was not yet dequeued from the pending list.
+                 * So prepare_signal() sees unblocked and ignored,
+                 * which ends up here. Leave it queued like a
+                 * regular signal.
+                 *
+                 * The same happens when the task group is exiting
+                 * and the signal is already queued.
+                 * prepare_signal() treats SIGNAL_GROUP_EXIT as
+                 * ignored independent of its queued state. This
+                 * gets cleaned up in __exit_signal().
                  */
-                WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT));
                 goto out;
         }
 
@@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
                 goto out;
         }
 
-        /* This should never happen and leaks a reference count */
-        if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
-                hlist_del_init(&tmr->ignored_list);
-
         if (unlikely(!list_empty(&q->list))) {
                 /* This holds a reference count already */
                 result = TRACE_SIGNAL_ALREADY_PENDING;
                 goto out;
         }
 
-        posixtimer_sigqueue_getref(q);
+        /*
+         * If the signal is on the ignore list, it got blocked after it was
+         * ignored earlier. But nothing lifted the ignore. Move it back to
+         * the pending list to be consistent with the regular signal
+         * handling. This already holds a reference count.
+         *
+         * If it's not on the ignore list acquire a reference count.
+         */
+        if (likely(hlist_unhashed(&tmr->ignored_list)))
+                posixtimer_sigqueue_getref(q);
+        else
+                hlist_del_init(&tmr->ignored_list);
+
         posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
         result = TRACE_SIGNAL_DELIVERED;
 out:
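
The rewritten comments describe a userspace-visible ordering: a POSIX timer signal that is ignored, then blocked, expires, and is later unblocked while still ignored. Below is a rough userspace sketch of that ordering, assuming SIGRTMIN delivered via SIGEV_SIGNAL from a CLOCK_MONOTONIC timer. It only reproduces the sequence; actually hitting the kernel race additionally requires an expiry in the narrow window around the unblock.

#include <signal.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
                                .sigev_signo  = SIGRTMIN };
        struct itimerspec its = { .it_value.tv_nsec    = 1000 * 1000,   /* 1ms */
                                  .it_interval.tv_nsec = 1000 * 1000 };
        sigset_t set;
        timer_t tid;

        /* 1) Ignore the timer signal ... */
        signal(SIGRTMIN, SIG_IGN);

        /* 2) ... and block it as well. */
        sigemptyset(&set);
        sigaddset(&set, SIGRTMIN);
        sigprocmask(SIG_BLOCK, &set, NULL);

        /* 3) Expiries now queue the signal: blocked signals are queued
         *    independent of the ignored state. */
        timer_create(CLOCK_MONOTONIC, &sev, &tid);
        timer_settime(tid, 0, &its, NULL);
        usleep(10 * 1000);

        /* 4) Unblock while the signal is still ignored. The kernel now sees
         *    an unblocked, ignored, already queued timer signal, the case
         *    the rewritten comments describe. Ignored instances are simply
         *    discarded, so nothing is delivered here. */
        sigprocmask(SIG_UNBLOCK, &set, NULL);
        usleep(10 * 1000);

        timer_delete(tid);
        return 0;
}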

kernel/time/hrtimer.c

Lines changed: 10 additions & 1 deletion
@@ -2202,6 +2202,15 @@ int hrtimers_prepare_cpu(unsigned int cpu)
         }
 
         cpu_base->cpu = cpu;
+        hrtimer_cpu_base_init_expiry_lock(cpu_base);
+        return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+        /* Clear out any left over state from a CPU down operation */
         cpu_base->active_bases = 0;
         cpu_base->hres_active = 0;
         cpu_base->hang_detected = 0;
@@ -2210,7 +2219,6 @@ int hrtimers_prepare_cpu(unsigned int cpu)
         cpu_base->expires_next = KTIME_MAX;
         cpu_base->softirq_expires_next = KTIME_MAX;
         cpu_base->online = 1;
-        hrtimer_cpu_base_init_expiry_lock(cpu_base);
         return 0;
 }
 
@@ -2286,5 +2294,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 void __init hrtimers_init(void)
 {
         hrtimers_prepare_cpu(smp_processor_id());
+        hrtimers_cpu_starting(smp_processor_id());
         open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
 }
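
The fields cleared in the new hrtimers_cpu_starting() are consulted whenever an hrtimer is enqueued on that CPU, which is why stale values from a previous CPU-down must not survive into the next bring-up. Purely for context, a minimal sketch of such an hrtimer user follows; the demo_* names are hypothetical and not part of this commit.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;       /* hypothetical example timer */

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
        /* Expiry callback, runs in hardirq context on the owning CPU. */
        return HRTIMER_NORESTART;
}

static void demo_timer_arm(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_timer_fn;
        /* Enqueues on this CPU's hrtimer_cpu_base, 10ms from now. */
        hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}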

kernel/time/timer_migration.c

Lines changed: 55 additions & 9 deletions
@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
                         break;
 
                 child = group;
-                group = group->parent;
+                /*
+                 * Pairs with the store release on group connection
+                 * to make sure group initialization is visible.
+                 */
+                group = READ_ONCE(group->parent);
                 data->childmask = child->groupmask;
+                WARN_ON_ONCE(!data->childmask);
         } while (group);
 }
 
@@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
         while ((node = timerqueue_getnext(&group->events))) {
                 evt = container_of(node, struct tmigr_event, nextevt);
 
-                if (!evt->ignore) {
+                if (!READ_ONCE(evt->ignore)) {
                         WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
                         return evt;
                 }
@@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
          * lock is held while updating the ignore flag in idle path. So this
          * state change will not be lost.
          */
-        group->groupevt.ignore = true;
+        WRITE_ONCE(group->groupevt.ignore, true);
 
         return walk_done;
 }
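
The hunks above turn the lockless accesses to the ignore flag into READ_ONCE()/WRITE_ONCE() annotations. A condensed sketch of the pattern, using a hypothetical flag instead of the tmigr structures, shows why this silences KCSAN for an intentional data race without adding any ordering.

#include <linux/compiler.h>
#include <linux/types.h>

/* Hypothetical flag: updated under a lock on one path, read locklessly on another. */
static bool demo_ignore;

static void demo_set_ignore(void)
{
        /*
         * A plain "demo_ignore = true" would be reported by KCSAN as a data
         * race with the lockless reader below. WRITE_ONCE() documents the
         * access as intentionally racy and guarantees a single, untorn store.
         */
        WRITE_ONCE(demo_ignore, true);
}

static bool demo_test_ignore(void)
{
        /* Pairs with the writer; no ordering is implied, only access integrity. */
        return READ_ONCE(demo_ignore);
}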
@@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
         union tmigr_state childstate, groupstate;
         bool remote = data->remote;
         bool walk_done = false;
+        bool ignore;
         u64 nextexp;
 
         if (child) {
@@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
                 nextexp = child->next_expiry;
                 evt = &child->groupevt;
 
-                evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+                /*
+                 * This can race with concurrent idle exit (activate).
+                 * If the current writer wins, a useless remote expiration may
+                 * be scheduled. If the activate wins, the event is properly
+                 * ignored.
+                 */
+                ignore = (nextexp == KTIME_MAX) ? true : false;
+                WRITE_ONCE(evt->ignore, ignore);
         } else {
                 nextexp = data->nextexp;
 
                 first_childevt = evt = data->evt;
+                ignore = evt->ignore;
 
                 /*
                  * Walking the hierarchy is required in any case when a
@@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
          * first event information of the group is updated properly and
          * also handled properly, so skip this fast return path.
          */
-        if (evt->ignore && !remote && group->parent)
+        if (ignore && !remote && group->parent)
                 return true;
 
         raw_spin_lock(&group->lock);
@@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
          * queue when the expiry time changed only or when it could be ignored.
          */
         if (timerqueue_node_queued(&evt->nextevt)) {
-                if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+                if ((evt->nextevt.expires == nextexp) && !ignore) {
                         /* Make sure not to miss a new CPU event with the same expiry */
                         evt->cpu = first_childevt->cpu;
                         goto check_toplvl;
@@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
                 WRITE_ONCE(group->next_expiry, KTIME_MAX);
         }
 
-        if (evt->ignore) {
+        if (ignore) {
                 /*
                  * When the next child event could be ignored (nextexp is
                  * KTIME_MAX) and there was no remote timer handling before or
@@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
         s.seq = 0;
         atomic_set(&group->migr_state, s.state);
 
+        /*
+         * If this is a new top-level, prepare its groupmask in advance.
+         * This avoids accidents where yet another new top-level is
+         * created in the future and made visible before the current groupmask.
+         */
+        if (list_empty(&tmigr_level_list[lvl])) {
+                group->groupmask = BIT(0);
+                /*
+                 * The previous top level has prepared its groupmask already,
+                 * simply account it as the first child.
+                 */
+                if (lvl > 0)
+                        group->num_children = 1;
+        }
+
         timerqueue_init_head(&group->events);
         timerqueue_init(&group->groupevt.nextevt);
         group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
         raw_spin_lock_irq(&child->lock);
         raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
-        child->parent = parent;
-        child->groupmask = BIT(parent->num_children++);
+        if (activate) {
+                /*
+                 * @child is the old top and @parent the new one. In this
+                 * case groupmask is pre-initialized and @child already
+                 * accounted, along with its new sibling corresponding to the
+                 * CPU going up.
+                 */
+                WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+        } else {
+                /* Adding @child for the CPU going up to @parent. */
+                child->groupmask = BIT(parent->num_children++);
+        }
+
+        /*
+         * Make sure parent initialization is visible before publishing it to a
+         * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+         * address dependency that pairs with the READ_ONCE() in __walk_groups().
+         */
+        smp_store_release(&child->parent, parent);
 
         raw_spin_unlock(&parent->lock);
         raw_spin_unlock_irq(&child->lock);
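
The final hunk publishes the parent pointer with smp_store_release() so that a tree walker which observes it via READ_ONCE() also observes the fully initialized group. Below is a self-contained userspace analogue of that publish/consume pattern using C11 atomics; it uses an acquire load on the reader side, whereas the kernel code relies on the address dependency created by dereferencing the READ_ONCE() result, and the struct and field names are illustrative only.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_group {
        unsigned int groupmask;                 /* initialized before publication */
        _Atomic(struct demo_group *) parent;    /* published pointer */
};

static struct demo_group child = { .groupmask = 1 };

static void *connect_parent(void *arg)
{
        struct demo_group *parent = malloc(sizeof(*parent));

        parent->groupmask = 2;                  /* initialization ... */
        parent->parent = NULL;
        /* ... must be visible before the pointer is published (release). */
        atomic_store_explicit(&child.parent, parent, memory_order_release);
        return NULL;
}

static void *walk_groups(void *arg)
{
        struct demo_group *parent;

        /* Acquire pairs with the release store above, so the walker never
         * sees a published pointer to a half-initialized group. */
        while (!(parent = atomic_load_explicit(&child.parent, memory_order_acquire)))
                ;
        printf("parent groupmask: %u\n", parent->groupmask);    /* always 2 */
        return NULL;
}

int main(void)
{
        pthread_t reader, writer;

        pthread_create(&reader, NULL, walk_groups, NULL);
        pthread_create(&writer, NULL, connect_parent, NULL);
        pthread_join(reader, NULL);
        pthread_join(writer, NULL);
        return 0;
}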
