Skip to content

Commit 5eb579d

Browse files
Frederic Weisbecker authored and KAGA-KOKO committed
timers/migration: Fix imbalanced NUMA trees
When a CPU from a new node boots, the old root may happen to be connected to the new root even if their node mismatch, as depicted in the following scenario: 1) CPU 0 boots and creates the first group for node 0. [GRP0:0] node 0 | CPU 0 2) CPU 1 from node 1 boots and creates a new top that corresponds to node 1, but it also connects the old root from node 0 to the new root from node 1 by mistake. [GRP1:0] node 1 / \ / \ [GRP0:0] [GRP0:1] node 0 node 1 | | CPU 0 CPU 1 3) This eventually leads to an imbalanced tree where some node 0 CPUs migrate node 1 timers (and vice versa) way before reaching the crossnode groups, resulting in more frequent remote memory accesses than expected. [GRP2:0] NUMA_NO_NODE / \ [GRP1:0] [GRP1:1] node 1 node 0 / \ | / \ [...] [GRP0:0] [GRP0:1] node 0 node 1 | | CPU 0... CPU 1... A balanced tree should only contain groups having children that belong to the same node: [GRP2:0] NUMA_NO_NODE / \ [GRP1:0] [GRP1:0] node 0 node 1 / \ / \ / \ / \ [GRP0:0] [...] [...] [GRP0:1] node 0 node 1 | | CPU 0... CPU 1... In order to fix this, the hierarchy must be unfolded up to the crossnode level as soon as a node mismatch is detected. For example the stage 2 above should lead to this layout: [GRP2:0] NUMA_NO_NODE / \ [GRP1:0] [GRP1:1] node 0 node 1 / \ / \ [GRP0:0] [GRP0:1] node 0 node 1 | | CPU 0 CPU 1 This means that not only GRP1:0 must be created but also GRP1:1 and GRP2:0 in order to prepare a balanced tree for next CPUs to boot. Fixes: 7ee9887 ("timers: Implement the hierarchical pull model") Signed-off-by: Frederic Weisbecker <frederic@kernel.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://patch.msgid.link/20251024132536.39841-4-frederic@kernel.org
1 parent fa96203 commit 5eb579d

1 file changed

Lines changed: 127 additions & 104 deletions

File tree

kernel/time/timer_migration.c

Lines changed: 127 additions & 104 deletions
Original file line number · Diff line number · Diff line change
@@ -420,6 +420,8 @@ static struct list_head *tmigr_level_list __read_mostly;
420420
static unsigned int tmigr_hierarchy_levels __read_mostly;
421421
static unsigned int tmigr_crossnode_level __read_mostly;
422422

423+
static struct tmigr_group *tmigr_root;
424+
423425
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
424426

425427
#define TMIGR_NONE 0xFF
@@ -522,11 +524,9 @@ struct tmigr_walk {
522524

523525
typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
524526

525-
static void __walk_groups(up_f up, struct tmigr_walk *data,
526-
struct tmigr_cpu *tmc)
527+
static void __walk_groups_from(up_f up, struct tmigr_walk *data,
528+
struct tmigr_group *child, struct tmigr_group *group)
527529
{
528-
struct tmigr_group *child = NULL, *group = tmc->tmgroup;
529-
530530
do {
531531
WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
532532

@@ -544,6 +544,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
544544
} while (group);
545545
}
546546

547+
static void __walk_groups(up_f up, struct tmigr_walk *data,
548+
struct tmigr_cpu *tmc)
549+
{
550+
__walk_groups_from(up, data, NULL, tmc->tmgroup);
551+
}
552+
547553
static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
548554
{
549555
lockdep_assert_held(&tmc->lock);
@@ -1498,21 +1504,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
14981504
s.seq = 0;
14991505
atomic_set(&group->migr_state, s.state);
15001506

1501-
/*
1502-
* If this is a new top-level, prepare its groupmask in advance.
1503-
* This avoids accidents where yet another new top-level is
1504-
* created in the future and made visible before the current groupmask.
1505-
*/
1506-
if (list_empty(&tmigr_level_list[lvl])) {
1507-
group->groupmask = BIT(0);
1508-
/*
1509-
* The previous top level has prepared its groupmask already,
1510-
* simply account it as the first child.
1511-
*/
1512-
if (lvl > 0)
1513-
group->num_children = 1;
1514-
}
1515-
15161507
timerqueue_init_head(&group->events);
15171508
timerqueue_init(&group->groupevt.nextevt);
15181509
group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1567,22 +1558,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
15671558
return group;
15681559
}
15691560

1561+
static bool tmigr_init_root(struct tmigr_group *group, bool activate)
1562+
{
1563+
if (!group->parent && group != tmigr_root) {
1564+
/*
1565+
* This is the new top-level, prepare its groupmask in advance
1566+
* to avoid accidents where yet another new top-level is
1567+
* created in the future and made visible before this groupmask.
1568+
*/
1569+
group->groupmask = BIT(0);
1570+
WARN_ON_ONCE(activate);
1571+
1572+
return true;
1573+
}
1574+
1575+
return false;
1576+
1577+
}
1578+
15701579
static void tmigr_connect_child_parent(struct tmigr_group *child,
15711580
struct tmigr_group *parent,
15721581
bool activate)
15731582
{
1574-
struct tmigr_walk data;
1583+
if (tmigr_init_root(parent, activate)) {
1584+
/*
1585+
* The previous top level had prepared its groupmask already,
1586+
* simply account it in advance as the first child. If some groups
1587+
* have been created between the old and new root due to node
1588+
* mismatch, the new root's child will be intialized accordingly.
1589+
*/
1590+
parent->num_children = 1;
1591+
}
15751592

1576-
if (activate) {
1593+
/* Connecting old root to new root ? */
1594+
if (!parent->parent && activate) {
15771595
/*
1578-
* @child is the old top and @parent the new one. In this
1579-
* case groupmask is pre-initialized and @child already
1580-
* accounted, along with its new sibling corresponding to the
1581-
* CPU going up.
1596+
* @child is the old top, or in case of node mismatch, some
1597+
* intermediate group between the old top and the new one in
1598+
* @parent. In this case the @child must be pre-accounted above
1599+
* as the first child. Its new inactive sibling corresponding
1600+
* to the CPU going up has been accounted as the second child.
15821601
*/
1583-
WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
1602+
WARN_ON_ONCE(parent->num_children != 2);
1603+
child->groupmask = BIT(0);
15841604
} else {
1585-
/* Adding @child for the CPU going up to @parent. */
1605+
/* Common case adding @child for the CPU going up to @parent. */
15861606
child->groupmask = BIT(parent->num_children++);
15871607
}
15881608

@@ -1594,56 +1614,28 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
15941614
smp_store_release(&child->parent, parent);
15951615

15961616
trace_tmigr_connect_child_parent(child);
1597-
1598-
if (!activate)
1599-
return;
1600-
1601-
/*
1602-
* To prevent inconsistent states, active children need to be active in
1603-
* the new parent as well. Inactive children are already marked inactive
1604-
* in the parent group:
1605-
*
1606-
* * When new groups were created by tmigr_setup_groups() starting from
1607-
* the lowest level (and not higher then one level below the current
1608-
* top level), then they are not active. They will be set active when
1609-
* the new online CPU comes active.
1610-
*
1611-
* * But if a new group above the current top level is required, it is
1612-
* mandatory to propagate the active state of the already existing
1613-
* child to the new parent. So tmigr_connect_child_parent() is
1614-
* executed with the formerly top level group (child) and the newly
1615-
* created group (parent).
1616-
*
1617-
* * It is ensured that the child is active, as this setup path is
1618-
* executed in hotplug prepare callback. This is exectued by an
1619-
* already connected and !idle CPU. Even if all other CPUs go idle,
1620-
* the CPU executing the setup will be responsible up to current top
1621-
* level group. And the next time it goes inactive, it will release
1622-
* the new childmask and parent to subsequent walkers through this
1623-
* @child. Therefore propagate active state unconditionally.
1624-
*/
1625-
data.childmask = child->groupmask;
1626-
1627-
/*
1628-
* There is only one new level per time (which is protected by
1629-
* tmigr_mutex). When connecting the child and the parent and set the
1630-
* child active when the parent is inactive, the parent needs to be the
1631-
* uppermost level. Otherwise there went something wrong!
1632-
*/
1633-
WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
16341617
}
16351618

1636-
static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
1619+
static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
1620+
struct tmigr_group *start, bool activate)
16371621
{
16381622
struct tmigr_group *group, *child, **stack;
1639-
int i, top = 0, err = 0;
1640-
struct list_head *lvllist;
1623+
int i, top = 0, err = 0, start_lvl = 0;
1624+
bool root_mismatch = false;
16411625

16421626
stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
16431627
if (!stack)
16441628
return -ENOMEM;
16451629

1646-
for (i = 0; i < tmigr_hierarchy_levels; i++) {
1630+
if (start) {
1631+
stack[start->level] = start;
1632+
start_lvl = start->level + 1;
1633+
}
1634+
1635+
if (tmigr_root)
1636+
root_mismatch = tmigr_root->numa_node != node;
1637+
1638+
for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
16471639
group = tmigr_get_group(cpu, node, i);
16481640
if (IS_ERR(group)) {
16491641
err = PTR_ERR(group);
@@ -1656,23 +1648,25 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
16561648

16571649
/*
16581650
* When booting only less CPUs of a system than CPUs are
1659-
* available, not all calculated hierarchy levels are required.
1651+
* available, not all calculated hierarchy levels are required,
1652+
* unless a node mismatch is detected.
16601653
*
16611654
* The loop is aborted as soon as the highest level, which might
16621655
* be different from tmigr_hierarchy_levels, contains only a
1663-
* single group.
1656+
* single group, unless the nodes mismatch below tmigr_crossnode_level
16641657
*/
1665-
if (group->parent || list_is_singular(&tmigr_level_list[i]))
1658+
if (group->parent)
1659+
break;
1660+
if ((!root_mismatch || i >= tmigr_crossnode_level) &&
1661+
list_is_singular(&tmigr_level_list[i]))
16661662
break;
16671663
}
16681664

16691665
/* Assert single root without parent */
16701666
if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
16711667
return -EINVAL;
1672-
if (WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])))
1673-
return -EINVAL;
16741668

1675-
for (; i >= 0; i--) {
1669+
for (; i >= start_lvl; i--) {
16761670
group = stack[i];
16771671

16781672
if (err < 0) {
@@ -1692,61 +1686,90 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
16921686
tmc->tmgroup = group;
16931687
tmc->groupmask = BIT(group->num_children++);
16941688

1689+
tmigr_init_root(group, activate);
1690+
16951691
trace_tmigr_connect_cpu_parent(tmc);
16961692

16971693
/* There are no children that need to be connected */
16981694
continue;
16991695
} else {
17001696
child = stack[i - 1];
1701-
/* Will be activated at online time */
1702-
tmigr_connect_child_parent(child, group, false);
1697+
tmigr_connect_child_parent(child, group, activate);
17031698
}
1699+
}
17041700

1705-
/* check if uppermost level was newly created */
1706-
if (top != i)
1707-
continue;
1708-
1709-
WARN_ON_ONCE(top == 0);
1701+
if (err < 0)
1702+
goto out;
17101703

1711-
lvllist = &tmigr_level_list[top];
1704+
if (activate) {
1705+
struct tmigr_walk data;
17121706

17131707
/*
1714-
* Newly created root level should have accounted the upcoming
1715-
* CPU's child group and pre-accounted the old root.
1708+
* To prevent inconsistent states, active children need to be active in
1709+
* the new parent as well. Inactive children are already marked inactive
1710+
* in the parent group:
1711+
*
1712+
* * When new groups were created by tmigr_setup_groups() starting from
1713+
* the lowest level, then they are not active. They will be set active
1714+
* when the new online CPU comes active.
1715+
*
1716+
* * But if new groups above the current top level are required, it is
1717+
* mandatory to propagate the active state of the already existing
1718+
* child to the new parents. So tmigr_active_up() activates the
1719+
* new parents while walking up from the old root to the new.
1720+
*
1721+
* * It is ensured that @start is active, as this setup path is
1722+
* executed in hotplug prepare callback. This is executed by an
1723+
* already connected and !idle CPU. Even if all other CPUs go idle,
1724+
* the CPU executing the setup will be responsible up to current top
1725+
* level group. And the next time it goes inactive, it will release
1726+
* the new childmask and parent to subsequent walkers through this
1727+
* @child. Therefore propagate active state unconditionally.
17161728
*/
1717-
if (group->num_children == 2 && list_is_singular(lvllist)) {
1718-
/*
1719-
* The target CPU must never do the prepare work, except
1720-
* on early boot when the boot CPU is the target. Otherwise
1721-
* it may spuriously activate the old top level group inside
1722-
* the new one (nevertheless whether old top level group is
1723-
* active or not) and/or release an uninitialized childmask.
1724-
*/
1725-
WARN_ON_ONCE(cpu == raw_smp_processor_id());
1726-
1727-
lvllist = &tmigr_level_list[top - 1];
1728-
list_for_each_entry(child, lvllist, list) {
1729-
if (child->parent)
1730-
continue;
1729+
WARN_ON_ONCE(!start->parent);
1730+
data.childmask = start->groupmask;
1731+
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
1732+
}
17311733

1732-
tmigr_connect_child_parent(child, group, true);
1733-
}
1734+
/* Root update */
1735+
if (list_is_singular(&tmigr_level_list[top])) {
1736+
group = list_first_entry(&tmigr_level_list[top],
1737+
typeof(*group), list);
1738+
WARN_ON_ONCE(group->parent);
1739+
if (tmigr_root) {
1740+
/* Old root should be the same or below */
1741+
WARN_ON_ONCE(tmigr_root->level > top);
17341742
}
1743+
tmigr_root = group;
17351744
}
1736-
1745+
out:
17371746
kfree(stack);
17381747

17391748
return err;
17401749
}
17411750

17421751
static int tmigr_add_cpu(unsigned int cpu)
17431752
{
1753+
struct tmigr_group *old_root = tmigr_root;
17441754
int node = cpu_to_node(cpu);
17451755
int ret;
17461756

1747-
mutex_lock(&tmigr_mutex);
1748-
ret = tmigr_setup_groups(cpu, node);
1749-
mutex_unlock(&tmigr_mutex);
1757+
guard(mutex)(&tmigr_mutex);
1758+
1759+
ret = tmigr_setup_groups(cpu, node, NULL, false);
1760+
1761+
/* Root has changed? Connect the old one to the new */
1762+
if (ret >= 0 && old_root && old_root != tmigr_root) {
1763+
/*
1764+
* The target CPU must never do the prepare work, except
1765+
* on early boot when the boot CPU is the target. Otherwise
1766+
* it may spuriously activate the old top level group inside
1767+
* the new one (nevertheless whether old top level group is
1768+
* active or not) and/or release an uninitialized childmask.
1769+
*/
1770+
WARN_ON_ONCE(cpu == raw_smp_processor_id());
1771+
ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
1772+
}
17501773

17511774
return ret;
17521775
}

0 commit comments

Comments (0)