@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 			break;
 
 		child = group;
-		group = group->parent;
+		/*
+		 * Pairs with the store release on group connection
+		 * to make sure group initialization is visible.
+		 */
+		group = READ_ONCE(group->parent);
 		data->childmask = child->groupmask;
+		WARN_ON_ONCE(!data->childmask);
 	} while (group);
 }
 
@@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
 	while ((node = timerqueue_getnext(&group->events))) {
 		evt = container_of(node, struct tmigr_event, nextevt);
 
-		if (!evt->ignore) {
+		if (!READ_ONCE(evt->ignore)) {
 			WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
 			return evt;
 		}
@@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
 	 * lock is held while updating the ignore flag in idle path. So this
 	 * state change will not be lost.
 	 */
-	group->groupevt.ignore = true;
+	WRITE_ONCE(group->groupevt.ignore, true);
 
 	return walk_done;
 }
@@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	union tmigr_state childstate, groupstate;
 	bool remote = data->remote;
 	bool walk_done = false;
+	bool ignore;
 	u64 nextexp;
 
 	if (child) {
@@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		nextexp = child->next_expiry;
 		evt = &child->groupevt;
 
-		evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+		/*
+		 * This can race with concurrent idle exit (activate).
+		 * If the current writer wins, a useless remote expiration may
+		 * be scheduled. If the activate wins, the event is properly
+		 * ignored.
+		 */
+		ignore = (nextexp == KTIME_MAX) ? true : false;
+		WRITE_ONCE(evt->ignore, ignore);
 	} else {
 		nextexp = data->nextexp;
 
 		first_childevt = evt = data->evt;
+		ignore = evt->ignore;
 
 		/*
 		 * Walking the hierarchy is required in any case when a
@@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	 * first event information of the group is updated properly and
 	 * also handled properly, so skip this fast return path.
 	 */
-	if (evt->ignore && !remote && group->parent)
+	if (ignore && !remote && group->parent)
 		return true;
 
 	raw_spin_lock(&group->lock);
@@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	 * queue when the expiry time changed only or when it could be ignored.
 	 */
 	if (timerqueue_node_queued(&evt->nextevt)) {
-		if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+		if ((evt->nextevt.expires == nextexp) && !ignore) {
 			/* Make sure not to miss a new CPU event with the same expiry */
 			evt->cpu = first_childevt->cpu;
 			goto check_toplvl;
@@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		WRITE_ONCE(group->next_expiry, KTIME_MAX);
 	}
 
-	if (evt->ignore) {
+	if (ignore) {
 		/*
 		 * When the next child event could be ignored (nextexp is
 		 * KTIME_MAX) and there was no remote timer handling before or
@@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	s.seq = 0;
 	atomic_set(&group->migr_state, s.state);
 
+	/*
+	 * If this is a new top-level, prepare its groupmask in advance.
+	 * This avoids accidents where yet another new top-level is
+	 * created in the future and made visible before the current groupmask.
+	 */
+	if (list_empty(&tmigr_level_list[lvl])) {
+		group->groupmask = BIT(0);
+		/*
+		 * The previous top level has prepared its groupmask already,
+		 * simply account it as the first child.
+		 */
+		if (lvl > 0)
+			group->num_children = 1;
+	}
+
 	timerqueue_init_head(&group->events);
 	timerqueue_init(&group->groupevt.nextevt);
 	group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	raw_spin_lock_irq(&child->lock);
 	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
-	child->parent = parent;
-	child->groupmask = BIT(parent->num_children++);
+	if (activate) {
+		/*
+		 * @child is the old top and @parent the new one. In this
+		 * case groupmask is pre-initialized and @child already
+		 * accounted, along with its new sibling corresponding to the
+		 * CPU going up.
+		 */
+		WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+	} else {
+		/* Adding @child for the CPU going up to @parent. */
+		child->groupmask = BIT(parent->num_children++);
+	}
+
+	/*
+	 * Make sure parent initialization is visible before publishing it to a
+	 * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+	 * address dependency that pairs with the READ_ONCE() in __walk_groups().
+	 */
+	smp_store_release(&child->parent, parent);
 
 	raw_spin_unlock(&parent->lock);
 	raw_spin_unlock_irq(&child->lock);
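
For readers unfamiliar with the publication pattern the last hunk relies on, below is a minimal standalone userspace sketch (not kernel code) of the same idea: the writer fully initializes the new parent group and only then publishes the pointer with a store-release, while a lockless walker loads the pointer once per step and relies on the resulting ordering to observe an initialized group. The names (struct grp, connect(), walk()) and the mapping onto C11 stdatomic are illustrative assumptions; READ_ONCE() plus the address dependency is approximated here by a consume load.

#include <stdatomic.h>
#include <stdio.h>

struct grp {
	unsigned int groupmask;        /* plain data, written before publication */
	_Atomic(struct grp *) parent;  /* published last, with release ordering  */
};

/* Writer side: analogous ordering to tmigr_connect_child_parent(). */
static void connect(struct grp *child, struct grp *parent)
{
	parent->groupmask = 1u;        /* initialize the new parent first */
	/* Release store: the init above is visible to anyone who sees the pointer. */
	atomic_store_explicit(&child->parent, parent, memory_order_release);
}

/* Reader side: analogous to the READ_ONCE() walk in __walk_groups(). */
static void walk(struct grp *group)
{
	while (group) {
		/* One load per step; consume stands in for the address dependency. */
		struct grp *next = atomic_load_explicit(&group->parent,
							memory_order_consume);

		printf("groupmask=%#x\n", group->groupmask);
		group = next;
	}
}

int main(void)
{
	struct grp parent = { 0 };
	struct grp child  = { .groupmask = 1u };

	/* In the kernel the two sides run concurrently; here they run sequentially. */
	connect(&child, &parent);
	walk(&child);
	return 0;
}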