@@ -420,6 +420,8 @@ static struct list_head *tmigr_level_list __read_mostly;
 static unsigned int tmigr_hierarchy_levels __read_mostly;
 static unsigned int tmigr_crossnode_level __read_mostly;
 
+static struct tmigr_group *tmigr_root;
+
 static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
 
 #define TMIGR_NONE	0xFF
@@ -522,11 +524,9 @@ struct tmigr_walk {
 
 typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
 
-static void __walk_groups(up_f up, struct tmigr_walk *data,
-			  struct tmigr_cpu *tmc)
+static void __walk_groups_from(up_f up, struct tmigr_walk *data,
+			       struct tmigr_group *child, struct tmigr_group *group)
 {
-	struct tmigr_group *child = NULL, *group = tmc->tmgroup;
-
 	do {
 		WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
 
@@ -544,6 +544,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 	} while (group);
 }
 
+static void __walk_groups(up_f up, struct tmigr_walk *data,
+			  struct tmigr_cpu *tmc)
+{
+	__walk_groups_from(up, data, NULL, tmc->tmgroup);
+}
+
 static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
 {
 	lockdep_assert_held(&tmc->lock);
@@ -1498,21 +1504,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	s.seq = 0;
 	atomic_set(&group->migr_state, s.state);
 
-	/*
-	 * If this is a new top-level, prepare its groupmask in advance.
-	 * This avoids accidents where yet another new top-level is
-	 * created in the future and made visible before the current groupmask.
-	 */
-	if (list_empty(&tmigr_level_list[lvl])) {
-		group->groupmask = BIT(0);
-		/*
-		 * The previous top level has prepared its groupmask already,
-		 * simply account it as the first child.
-		 */
-		if (lvl > 0)
-			group->num_children = 1;
-	}
-
 	timerqueue_init_head(&group->events);
 	timerqueue_init(&group->groupevt.nextevt);
 	group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1567,22 +1558,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
 	return group;
 }
 
+static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+{
+	if (!group->parent && group != tmigr_root) {
+		/*
+		 * This is the new top-level, prepare its groupmask in advance
+		 * to avoid accidents where yet another new top-level is
+		 * created in the future and made visible before this groupmask.
+		 */
+		group->groupmask = BIT(0);
+		WARN_ON_ONCE(activate);
+
+		return true;
+	}
+
+	return false;
+
+}
+
 static void tmigr_connect_child_parent(struct tmigr_group *child,
 				       struct tmigr_group *parent,
 				       bool activate)
 {
-	struct tmigr_walk data;
+	if (tmigr_init_root(parent, activate)) {
+		/*
+		 * The previous top level had prepared its groupmask already,
+		 * simply account it in advance as the first child. If some groups
+		 * have been created between the old and new root due to node
+		 * mismatch, the new root's child will be initialized accordingly.
+		 */
+		parent->num_children = 1;
+	}
 
-	if (activate) {
+	/* Connecting old root to new root? */
+	if (!parent->parent && activate) {
 		/*
-		 * @child is the old top and @parent the new one. In this
-		 * case groupmask is pre-initialized and @child already
-		 * accounted, along with its new sibling corresponding to the
-		 * CPU going up.
+		 * @child is the old top, or in case of node mismatch, some
+		 * intermediate group between the old top and the new one in
+		 * @parent. In this case the @child must be pre-accounted above
+		 * as the first child. Its new inactive sibling corresponding
+		 * to the CPU going up has been accounted as the second child.
 		 */
-		WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+		WARN_ON_ONCE(parent->num_children != 2);
+		child->groupmask = BIT(0);
 	} else {
-		/* Adding @child for the CPU going up to @parent. */
+		/* Common case adding @child for the CPU going up to @parent. */
 		child->groupmask = BIT(parent->num_children++);
 	}
 
@@ -1594,56 +1614,28 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	smp_store_release(&child->parent, parent);
 
 	trace_tmigr_connect_child_parent(child);
-
-	if (!activate)
-		return;
-
-	/*
-	 * To prevent inconsistent states, active children need to be active in
-	 * the new parent as well. Inactive children are already marked inactive
-	 * in the parent group:
-	 *
-	 * * When new groups were created by tmigr_setup_groups() starting from
-	 *   the lowest level (and not higher then one level below the current
-	 *   top level), then they are not active. They will be set active when
-	 *   the new online CPU comes active.
-	 *
-	 * * But if a new group above the current top level is required, it is
-	 *   mandatory to propagate the active state of the already existing
-	 *   child to the new parent. So tmigr_connect_child_parent() is
-	 *   executed with the formerly top level group (child) and the newly
-	 *   created group (parent).
-	 *
-	 * * It is ensured that the child is active, as this setup path is
-	 *   executed in hotplug prepare callback. This is exectued by an
-	 *   already connected and !idle CPU. Even if all other CPUs go idle,
-	 *   the CPU executing the setup will be responsible up to current top
-	 *   level group. And the next time it goes inactive, it will release
-	 *   the new childmask and parent to subsequent walkers through this
-	 *   @child. Therefore propagate active state unconditionally.
-	 */
-	data.childmask = child->groupmask;
-
-	/*
-	 * There is only one new level per time (which is protected by
-	 * tmigr_mutex). When connecting the child and the parent and set the
-	 * child active when the parent is inactive, the parent needs to be the
-	 * uppermost level. Otherwise there went something wrong!
-	 */
-	WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
 }
 
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
+static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+			      struct tmigr_group *start, bool activate)
 {
 	struct tmigr_group *group, *child, **stack;
-	int i, top = 0, err = 0;
-	struct list_head *lvllist;
+	int i, top = 0, err = 0, start_lvl = 0;
+	bool root_mismatch = false;
 
 	stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
 	if (!stack)
 		return -ENOMEM;
 
-	for (i = 0; i < tmigr_hierarchy_levels; i++) {
+	if (start) {
+		stack[start->level] = start;
+		start_lvl = start->level + 1;
+	}
+
+	if (tmigr_root)
+		root_mismatch = tmigr_root->numa_node != node;
+
+	for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
 		group = tmigr_get_group(cpu, node, i);
 		if (IS_ERR(group)) {
 			err = PTR_ERR(group);
@@ -1656,23 +1648,25 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 
 		/*
 		 * When booting only less CPUs of a system than CPUs are
-		 * available, not all calculated hierarchy levels are required.
+		 * available, not all calculated hierarchy levels are required,
+		 * unless a node mismatch is detected.
 		 *
 		 * The loop is aborted as soon as the highest level, which might
 		 * be different from tmigr_hierarchy_levels, contains only a
-		 * single group.
+		 * single group, unless the nodes mismatch below tmigr_crossnode_level
 		 */
-		if (group->parent || list_is_singular(&tmigr_level_list[i]))
+		if (group->parent)
+			break;
+		if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+		    list_is_singular(&tmigr_level_list[i]))
 			break;
 	}
 
 	/* Assert single root without parent */
 	if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
 		return -EINVAL;
-	if (WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])))
-		return -EINVAL;
 
-	for (; i >= 0; i--) {
+	for (; i >= start_lvl; i--) {
 		group = stack[i];
 
 		if (err < 0) {
@@ -1692,61 +1686,90 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 			tmc->tmgroup = group;
 			tmc->groupmask = BIT(group->num_children++);
 
+			tmigr_init_root(group, activate);
+
 			trace_tmigr_connect_cpu_parent(tmc);
 
 			/* There are no children that need to be connected */
 			continue;
 		} else {
 			child = stack[i - 1];
-			/* Will be activated at online time */
-			tmigr_connect_child_parent(child, group, false);
+			tmigr_connect_child_parent(child, group, activate);
 		}
+	}
 
-		/* check if uppermost level was newly created */
-		if (top != i)
-			continue;
-
-		WARN_ON_ONCE(top == 0);
+	if (err < 0)
+		goto out;
 
-		lvllist = &tmigr_level_list[top];
+	if (activate) {
+		struct tmigr_walk data;
 
 		/*
-		 * Newly created root level should have accounted the upcoming
-		 * CPU's child group and pre-accounted the old root.
+		 * To prevent inconsistent states, active children need to be active in
+		 * the new parent as well. Inactive children are already marked inactive
+		 * in the parent group:
+		 *
+		 * * When new groups were created by tmigr_setup_groups() starting from
+		 *   the lowest level, then they are not active. They will be set active
+		 *   when the new online CPU comes active.
+		 *
+		 * * But if new groups above the current top level are required, it is
+		 *   mandatory to propagate the active state of the already existing
+		 *   child to the new parents. So tmigr_active_up() activates the
+		 *   new parents while walking up from the old root to the new.
+		 *
+		 * * It is ensured that @start is active, as this setup path is
+		 *   executed in hotplug prepare callback. This is executed by an
+		 *   already connected and !idle CPU. Even if all other CPUs go idle,
+		 *   the CPU executing the setup will be responsible up to current top
+		 *   level group. And the next time it goes inactive, it will release
+		 *   the new childmask and parent to subsequent walkers through this
+		 *   @child. Therefore propagate active state unconditionally.
 		 */
-		if (group->num_children == 2 && list_is_singular(lvllist)) {
-			/*
-			 * The target CPU must never do the prepare work, except
-			 * on early boot when the boot CPU is the target. Otherwise
-			 * it may spuriously activate the old top level group inside
-			 * the new one (nevertheless whether old top level group is
-			 * active or not) and/or release an uninitialized childmask.
-			 */
-			WARN_ON_ONCE(cpu == raw_smp_processor_id());
-
-			lvllist = &tmigr_level_list[top - 1];
-			list_for_each_entry(child, lvllist, list) {
-				if (child->parent)
-					continue;
+		WARN_ON_ONCE(!start->parent);
+		data.childmask = start->groupmask;
+		__walk_groups_from(tmigr_active_up, &data, start, start->parent);
+	}
 
-				tmigr_connect_child_parent(child, group, true);
-			}
+	/* Root update */
+	if (list_is_singular(&tmigr_level_list[top])) {
+		group = list_first_entry(&tmigr_level_list[top],
+					 typeof(*group), list);
+		WARN_ON_ONCE(group->parent);
+		if (tmigr_root) {
+			/* Old root should be the same or below */
+			WARN_ON_ONCE(tmigr_root->level > top);
 		}
+		tmigr_root = group;
 	}
-
+out:
 	kfree(stack);
 
 	return err;
 }
 
 static int tmigr_add_cpu(unsigned int cpu)
 {
+	struct tmigr_group *old_root = tmigr_root;
 	int node = cpu_to_node(cpu);
 	int ret;
 
-	mutex_lock(&tmigr_mutex);
-	ret = tmigr_setup_groups(cpu, node);
-	mutex_unlock(&tmigr_mutex);
+	guard(mutex)(&tmigr_mutex);
+
+	ret = tmigr_setup_groups(cpu, node, NULL, false);
+
+	/* Root has changed? Connect the old one to the new */
+	if (ret >= 0 && old_root && old_root != tmigr_root) {
+		/*
+		 * The target CPU must never do the prepare work, except
+		 * on early boot when the boot CPU is the target. Otherwise
+		 * it may spuriously activate the old top level group inside
+		 * the new one (nevertheless whether old top level group is
+		 * active or not) and/or release an uninitialized childmask.
+		 */
+		WARN_ON_ONCE(cpu == raw_smp_processor_id());
+		ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+	}
 
 	return ret;
 }