Skip to content

Commit 72c6303

Browse files
Waiman Long authored and Tejun Heo committed
cgroup/cpuset: Take isolated CPUs out of workqueue unbound cpumask
To make CPUs in isolated cpuset partition closer in isolation to the boot time isolated CPUs specified in the "isolcpus" boot command line option, we need to take those CPUs out of the workqueue unbound cpumask so that work functions from the unbound workqueues won't run on those CPUs. Otherwise, they will interfere with the user tasks running on those isolated CPUs. With the introduction of the workqueue_unbound_exclude_cpumask() helper function in an earlier commit, those isolated CPUs can now be taken out of the workqueue unbound cpumask. This patch also updates cgroup-v2.rst to mention that isolated CPUs will be excluded from the unbound workqueue cpumask, as well as updating test_cpuset_prs.sh to verify the correctness of the new *cpuset.cpus.isolated file, if available via the cgroup_debug option. Signed-off-by: Waiman Long <longman@redhat.com> Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 11e5f40 commit 72c6303

3 files changed

Lines changed: 166 additions & 34 deletions

File tree

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2358,11 +2358,11 @@ Cpuset Interface Files
23582358
partition or scheduling domain. The set of exclusive CPUs is
23592359
determined by the value of its "cpuset.cpus.exclusive.effective".
23602360

2361-
When set to "isolated", the CPUs in that partition will
2362-
be in an isolated state without any load balancing from the
2363-
scheduler. Tasks placed in such a partition with multiple
2364-
CPUs should be carefully distributed and bound to each of the
2365-
individual CPUs for optimal performance.
2361+
When set to "isolated", the CPUs in that partition will be in
2362+
an isolated state without any load balancing from the scheduler
2363+
and excluded from the unbound workqueues. Tasks placed in such
2364+
a partition with multiple CPUs should be carefully distributed
2365+
and bound to each of the individual CPUs for optimal performance.
23662366

23672367
A partition root ("root" or "isolated") can be in one of the
23682368
two possible states - valid or invalid. An invalid partition

kernel/cgroup/cpuset.c

Lines changed: 96 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <linux/cpu.h>
2626
#include <linux/cpumask.h>
2727
#include <linux/cpuset.h>
28+
#include <linux/delay.h>
2829
#include <linux/init.h>
2930
#include <linux/interrupt.h>
3031
#include <linux/kernel.h>
@@ -43,6 +44,7 @@
4344
#include <linux/sched/isolation.h>
4445
#include <linux/cgroup.h>
4546
#include <linux/wait.h>
47+
#include <linux/workqueue.h>
4648

4749
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
4850
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -1444,38 +1446,47 @@ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *x
14441446
* @new_prs: new partition_root_state
14451447
* @parent: parent cpuset
14461448
* @xcpus: exclusive CPUs to be added
1449+
* Return: true if isolated_cpus modified, false otherwise
14471450
*
14481451
* Remote partition if parent == NULL
14491452
*/
1450-
static void partition_xcpus_add(int new_prs, struct cpuset *parent,
1453+
static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
14511454
struct cpumask *xcpus)
14521455
{
1456+
bool isolcpus_updated;
1457+
14531458
WARN_ON_ONCE(new_prs < 0);
14541459
lockdep_assert_held(&callback_lock);
14551460
if (!parent)
14561461
parent = &top_cpuset;
14571462

1463+
14581464
if (parent == &top_cpuset)
14591465
cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
14601466

1461-
if (new_prs != parent->partition_root_state)
1467+
isolcpus_updated = (new_prs != parent->partition_root_state);
1468+
if (isolcpus_updated)
14621469
partition_xcpus_newstate(parent->partition_root_state, new_prs,
14631470
xcpus);
14641471

14651472
cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
1473+
return isolcpus_updated;
14661474
}
14671475

14681476
/*
14691477
* partition_xcpus_del - Remove exclusive CPUs from partition
14701478
* @old_prs: old partition_root_state
14711479
* @parent: parent cpuset
14721480
* @xcpus: exclusive CPUs to be removed
1481+
* Return: true if isolated_cpus modified, false otherwise
14731482
*
14741483
* Remote partition if parent == NULL
14751484
*/
1476-
static void partition_xcpus_del(int old_prs, struct cpuset *parent,
1485+
static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
14771486
struct cpumask *xcpus)
14781487
{
1488+
bool isolcpus_updated;
1489+
14791490
WARN_ON_ONCE(old_prs < 0);
14801491
lockdep_assert_held(&callback_lock);
14811492
if (!parent)
@@ -1484,12 +1495,27 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent,
14841495
if (parent == &top_cpuset)
14851496
cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
14861497

1487-
if (old_prs != parent->partition_root_state)
1498+
isolcpus_updated = (old_prs != parent->partition_root_state);
1499+
if (isolcpus_updated)
14881500
partition_xcpus_newstate(old_prs, parent->partition_root_state,
14891501
xcpus);
14901502

14911503
cpumask_and(xcpus, xcpus, cpu_active_mask);
14921504
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
1505+
return isolcpus_updated;
1506+
}
1507+
1508+
static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
1509+
{
1510+
int ret;
1511+
1512+
lockdep_assert_cpus_held();
1513+
1514+
if (!isolcpus_updated)
1515+
return;
1516+
1517+
ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
1518+
WARN_ON_ONCE(ret < 0);
14931519
}
14941520

14951521
/*
@@ -1540,6 +1566,8 @@ static inline bool is_local_partition(struct cpuset *cs)
15401566
static int remote_partition_enable(struct cpuset *cs, int new_prs,
15411567
struct tmpmasks *tmp)
15421568
{
1569+
bool isolcpus_updated;
1570+
15431571
/*
15441572
* The user must have sysadmin privilege.
15451573
*/
@@ -1561,7 +1589,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
15611589
return 0;
15621590

15631591
spin_lock_irq(&callback_lock);
1564-
partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
1592+
isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
15651593
list_add(&cs->remote_sibling, &remote_children);
15661594
if (cs->use_parent_ecpus) {
15671595
struct cpuset *parent = parent_cs(cs);
@@ -1570,13 +1598,13 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
15701598
parent->child_ecpus_count--;
15711599
}
15721600
spin_unlock_irq(&callback_lock);
1601+
update_unbound_workqueue_cpumask(isolcpus_updated);
15731602

15741603
/*
15751604
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
15761605
*/
15771606
update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
15781607
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1579-
15801608
return 1;
15811609
}
15821610

@@ -1591,18 +1619,22 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
15911619
*/
15921620
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
15931621
{
1622+
bool isolcpus_updated;
1623+
15941624
compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
15951625
WARN_ON_ONCE(!is_remote_partition(cs));
15961626
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
15971627

15981628
spin_lock_irq(&callback_lock);
15991629
list_del_init(&cs->remote_sibling);
1600-
partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus);
1630+
isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
1631+
NULL, tmp->new_cpus);
16011632
cs->partition_root_state = -cs->partition_root_state;
16021633
if (!cs->prs_err)
16031634
cs->prs_err = PERR_INVCPUS;
16041635
reset_partition_data(cs);
16051636
spin_unlock_irq(&callback_lock);
1637+
update_unbound_workqueue_cpumask(isolcpus_updated);
16061638

16071639
/*
16081640
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1625,6 +1657,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
16251657
{
16261658
bool adding, deleting;
16271659
int prs = cs->partition_root_state;
1660+
int isolcpus_updated = 0;
16281661

16291662
if (WARN_ON_ONCE(!is_remote_partition(cs)))
16301663
return;
@@ -1649,10 +1682,11 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
16491682

16501683
spin_lock_irq(&callback_lock);
16511684
if (adding)
1652-
partition_xcpus_add(prs, NULL, tmp->addmask);
1685+
isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
16531686
if (deleting)
1654-
partition_xcpus_del(prs, NULL, tmp->delmask);
1687+
isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
16551688
spin_unlock_irq(&callback_lock);
1689+
update_unbound_workqueue_cpumask(isolcpus_updated);
16561690

16571691
/*
16581692
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1774,6 +1808,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
17741808
int part_error = PERR_NONE; /* Partition error? */
17751809
int subparts_delta = 0;
17761810
struct cpumask *xcpus; /* cs effective_xcpus */
1811+
int isolcpus_updated = 0;
17771812
bool nocpu;
17781813

17791814
lockdep_assert_held(&cpuset_mutex);
@@ -2010,15 +2045,18 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
20102045
* and vice versa.
20112046
*/
20122047
if (adding)
2013-
partition_xcpus_del(old_prs, parent, tmp->addmask);
2048+
isolcpus_updated += partition_xcpus_del(old_prs, parent,
2049+
tmp->addmask);
20142050
if (deleting)
2015-
partition_xcpus_add(new_prs, parent, tmp->delmask);
2051+
isolcpus_updated += partition_xcpus_add(new_prs, parent,
2052+
tmp->delmask);
20162053

20172054
if (is_partition_valid(parent)) {
20182055
parent->nr_subparts += subparts_delta;
20192056
WARN_ON_ONCE(parent->nr_subparts < 0);
20202057
}
20212058
spin_unlock_irq(&callback_lock);
2059+
update_unbound_workqueue_cpumask(isolcpus_updated);
20222060

20232061
if ((old_prs != new_prs) && (cmd == partcmd_update))
20242062
update_partition_exclusive(cs, new_prs);
@@ -3082,6 +3120,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
30823120
else if (new_xcpus_state)
30833121
partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
30843122
spin_unlock_irq(&callback_lock);
3123+
update_unbound_workqueue_cpumask(new_xcpus_state);
30853124

30863125
/* Force update if switching back to member */
30873126
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
@@ -4370,6 +4409,30 @@ void cpuset_force_rebuild(void)
43704409
force_rebuild = true;
43714410
}
43724411

4412+
/*
4413+
* Attempt to acquire a cpus_read_lock while a hotplug operation may be in
4414+
* progress.
4415+
* Return: true if successful, false otherwise
4416+
*
4417+
* To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
4418+
* cpus_read_trylock() is used here to acquire the lock.
4419+
*/
4420+
static bool cpuset_hotplug_cpus_read_trylock(void)
4421+
{
4422+
int retries = 0;
4423+
4424+
while (!cpus_read_trylock()) {
4425+
/*
4426+
* CPU hotplug still in progress. Retry 5 times
4427+
* with a 10ms wait before bailing out.
4428+
*/
4429+
if (++retries > 5)
4430+
return false;
4431+
msleep(10);
4432+
}
4433+
return true;
4434+
}
4435+
43734436
/**
43744437
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
43754438
* @cs: cpuset in interest
@@ -4386,6 +4449,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
43864449
bool cpus_updated;
43874450
bool mems_updated;
43884451
bool remote;
4452+
int partcmd = -1;
43894453
struct cpuset *parent;
43904454
retry:
43914455
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -4417,11 +4481,13 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
44174481
compute_partition_effective_cpumask(cs, &new_cpus);
44184482

44194483
if (remote && cpumask_empty(&new_cpus) &&
4420-
partition_is_populated(cs, NULL)) {
4484+
partition_is_populated(cs, NULL) &&
4485+
cpuset_hotplug_cpus_read_trylock()) {
44214486
remote_partition_disable(cs, tmp);
44224487
compute_effective_cpumask(&new_cpus, cs, parent);
44234488
remote = false;
44244489
cpuset_force_rebuild();
4490+
cpus_read_unlock();
44254491
}
44264492

44274493
/*
@@ -4432,18 +4498,28 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
44324498
* partitions.
44334499
*/
44344500
if (is_local_partition(cs) && (!is_partition_valid(parent) ||
4435-
tasks_nocpu_error(parent, cs, &new_cpus))) {
4436-
update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
4437-
compute_effective_cpumask(&new_cpus, cs, parent);
4438-
cpuset_force_rebuild();
4439-
}
4501+
tasks_nocpu_error(parent, cs, &new_cpus)))
4502+
partcmd = partcmd_invalidate;
44404503
/*
44414504
* On the other hand, an invalid partition root may be transitioned
44424505
* back to a regular one.
44434506
*/
4444-
else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
4445-
update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp);
4446-
if (is_partition_valid(cs)) {
4507+
else if (is_partition_valid(parent) && is_partition_invalid(cs))
4508+
partcmd = partcmd_update;
4509+
4510+
/*
4511+
* cpus_read_lock needs to be held before calling
4512+
* update_parent_effective_cpumask(). To avoid circular lock
4513+
* dependency between cpuset_mutex and cpus_read_lock,
4514+
* cpus_read_trylock() is used here to acquire the lock.
4515+
*/
4516+
if (partcmd >= 0) {
4517+
if (!cpuset_hotplug_cpus_read_trylock())
4518+
goto update_tasks;
4519+
4520+
update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
4521+
cpus_read_unlock();
4522+
if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
44474523
compute_partition_effective_cpumask(cs, &new_cpus);
44484524
cpuset_force_rebuild();
44494525
}

0 commit comments

Comments (0)