2525#include <linux/cpu.h>
2626#include <linux/cpumask.h>
2727#include <linux/cpuset.h>
28+ #include <linux/delay.h>
2829#include <linux/init.h>
2930#include <linux/interrupt.h>
3031#include <linux/kernel.h>
4344#include <linux/sched/isolation.h>
4445#include <linux/cgroup.h>
4546#include <linux/wait.h>
47+ #include <linux/workqueue.h>
4648
4749DEFINE_STATIC_KEY_FALSE (cpusets_pre_enable_key );
4850DEFINE_STATIC_KEY_FALSE (cpusets_enabled_key );
@@ -1444,38 +1446,47 @@ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *x
14441446 * @new_prs: new partition_root_state
14451447 * @parent: parent cpuset
14461448 * @xcpus: exclusive CPUs to be added
1449+ * Return: true if isolated_cpus modified, false otherwise
14471450 *
14481451 * Remote partition if parent == NULL
14491452 */
1450- static void partition_xcpus_add (int new_prs , struct cpuset * parent ,
1453+ static bool partition_xcpus_add (int new_prs , struct cpuset * parent ,
14511454 struct cpumask * xcpus )
14521455{
1456+ bool isolcpus_updated ;
1457+
14531458 WARN_ON_ONCE (new_prs < 0 );
14541459 lockdep_assert_held (& callback_lock );
14551460 if (!parent )
14561461 parent = & top_cpuset ;
14571462
1463+
14581464 if (parent == & top_cpuset )
14591465 cpumask_or (subpartitions_cpus , subpartitions_cpus , xcpus );
14601466
1461- if (new_prs != parent -> partition_root_state )
1467+ isolcpus_updated = (new_prs != parent -> partition_root_state );
1468+ if (isolcpus_updated )
14621469 partition_xcpus_newstate (parent -> partition_root_state , new_prs ,
14631470 xcpus );
14641471
14651472 cpumask_andnot (parent -> effective_cpus , parent -> effective_cpus , xcpus );
1473+ return isolcpus_updated ;
14661474}
14671475
14681476/*
14691477 * partition_xcpus_del - Remove exclusive CPUs from partition
14701478 * @old_prs: old partition_root_state
14711479 * @parent: parent cpuset
14721480 * @xcpus: exclusive CPUs to be removed
1481+ * Return: true if isolated_cpus modified, false otherwise
14731482 *
14741483 * Remote partition if parent == NULL
14751484 */
1476- static void partition_xcpus_del (int old_prs , struct cpuset * parent ,
1485+ static bool partition_xcpus_del (int old_prs , struct cpuset * parent ,
14771486 struct cpumask * xcpus )
14781487{
1488+ bool isolcpus_updated ;
1489+
14791490 WARN_ON_ONCE (old_prs < 0 );
14801491 lockdep_assert_held (& callback_lock );
14811492 if (!parent )
@@ -1484,12 +1495,27 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent,
14841495 if (parent == & top_cpuset )
14851496 cpumask_andnot (subpartitions_cpus , subpartitions_cpus , xcpus );
14861497
1487- if (old_prs != parent -> partition_root_state )
1498+ isolcpus_updated = (old_prs != parent -> partition_root_state );
1499+ if (isolcpus_updated )
14881500 partition_xcpus_newstate (old_prs , parent -> partition_root_state ,
14891501 xcpus );
14901502
14911503 cpumask_and (xcpus , xcpus , cpu_active_mask );
14921504 cpumask_or (parent -> effective_cpus , parent -> effective_cpus , xcpus );
1505+ return isolcpus_updated ;
1506+ }
1507+
1508+ static void update_unbound_workqueue_cpumask (bool isolcpus_updated )
1509+ {
1510+ int ret ;
1511+
1512+ lockdep_assert_cpus_held ();
1513+
1514+ if (!isolcpus_updated )
1515+ return ;
1516+
1517+ ret = workqueue_unbound_exclude_cpumask (isolated_cpus );
1518+ WARN_ON_ONCE (ret < 0 );
14931519}
14941520
14951521/*
@@ -1540,6 +1566,8 @@ static inline bool is_local_partition(struct cpuset *cs)
15401566static int remote_partition_enable (struct cpuset * cs , int new_prs ,
15411567 struct tmpmasks * tmp )
15421568{
1569+ bool isolcpus_updated ;
1570+
15431571 /*
15441572 * The user must have sysadmin privilege.
15451573 */
@@ -1561,7 +1589,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
15611589 return 0 ;
15621590
15631591 spin_lock_irq (& callback_lock );
1564- partition_xcpus_add (new_prs , NULL , tmp -> new_cpus );
1592+ isolcpus_updated = partition_xcpus_add (new_prs , NULL , tmp -> new_cpus );
15651593 list_add (& cs -> remote_sibling , & remote_children );
15661594 if (cs -> use_parent_ecpus ) {
15671595 struct cpuset * parent = parent_cs (cs );
@@ -1570,13 +1598,13 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
15701598 parent -> child_ecpus_count -- ;
15711599 }
15721600 spin_unlock_irq (& callback_lock );
1601+ update_unbound_workqueue_cpumask (isolcpus_updated );
15731602
15741603 /*
15751604 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
15761605 */
15771606 update_tasks_cpumask (& top_cpuset , tmp -> new_cpus );
15781607 update_sibling_cpumasks (& top_cpuset , NULL , tmp );
1579-
15801608 return 1 ;
15811609}
15821610
@@ -1591,18 +1619,22 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
15911619 */
15921620static void remote_partition_disable (struct cpuset * cs , struct tmpmasks * tmp )
15931621{
1622+ bool isolcpus_updated ;
1623+
15941624 compute_effective_exclusive_cpumask (cs , tmp -> new_cpus );
15951625 WARN_ON_ONCE (!is_remote_partition (cs ));
15961626 WARN_ON_ONCE (!cpumask_subset (tmp -> new_cpus , subpartitions_cpus ));
15971627
15981628 spin_lock_irq (& callback_lock );
15991629 list_del_init (& cs -> remote_sibling );
1600- partition_xcpus_del (cs -> partition_root_state , NULL , tmp -> new_cpus );
1630+ isolcpus_updated = partition_xcpus_del (cs -> partition_root_state ,
1631+ NULL , tmp -> new_cpus );
16011632 cs -> partition_root_state = - cs -> partition_root_state ;
16021633 if (!cs -> prs_err )
16031634 cs -> prs_err = PERR_INVCPUS ;
16041635 reset_partition_data (cs );
16051636 spin_unlock_irq (& callback_lock );
1637+ update_unbound_workqueue_cpumask (isolcpus_updated );
16061638
16071639 /*
16081640 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1625,6 +1657,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
16251657{
16261658 bool adding , deleting ;
16271659 int prs = cs -> partition_root_state ;
1660+ int isolcpus_updated = 0 ;
16281661
16291662 if (WARN_ON_ONCE (!is_remote_partition (cs )))
16301663 return ;
@@ -1649,10 +1682,11 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
16491682
16501683 spin_lock_irq (& callback_lock );
16511684 if (adding )
1652- partition_xcpus_add (prs , NULL , tmp -> addmask );
1685+ isolcpus_updated += partition_xcpus_add (prs , NULL , tmp -> addmask );
16531686 if (deleting )
1654- partition_xcpus_del (prs , NULL , tmp -> delmask );
1687+ isolcpus_updated += partition_xcpus_del (prs , NULL , tmp -> delmask );
16551688 spin_unlock_irq (& callback_lock );
1689+ update_unbound_workqueue_cpumask (isolcpus_updated );
16561690
16571691 /*
16581692 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1774,6 +1808,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
17741808 int part_error = PERR_NONE ; /* Partition error? */
17751809 int subparts_delta = 0 ;
17761810 struct cpumask * xcpus ; /* cs effective_xcpus */
1811+ int isolcpus_updated = 0 ;
17771812 bool nocpu ;
17781813
17791814 lockdep_assert_held (& cpuset_mutex );
@@ -2010,15 +2045,18 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
20102045 * and vice versa.
20112046 */
20122047 if (adding )
2013- partition_xcpus_del (old_prs , parent , tmp -> addmask );
2048+ isolcpus_updated += partition_xcpus_del (old_prs , parent ,
2049+ tmp -> addmask );
20142050 if (deleting )
2015- partition_xcpus_add (new_prs , parent , tmp -> delmask );
2051+ isolcpus_updated += partition_xcpus_add (new_prs , parent ,
2052+ tmp -> delmask );
20162053
20172054 if (is_partition_valid (parent )) {
20182055 parent -> nr_subparts += subparts_delta ;
20192056 WARN_ON_ONCE (parent -> nr_subparts < 0 );
20202057 }
20212058 spin_unlock_irq (& callback_lock );
2059+ update_unbound_workqueue_cpumask (isolcpus_updated );
20222060
20232061 if ((old_prs != new_prs ) && (cmd == partcmd_update ))
20242062 update_partition_exclusive (cs , new_prs );
@@ -3082,6 +3120,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
30823120 else if (new_xcpus_state )
30833121 partition_xcpus_newstate (old_prs , new_prs , cs -> effective_xcpus );
30843122 spin_unlock_irq (& callback_lock );
3123+ update_unbound_workqueue_cpumask (new_xcpus_state );
30853124
30863125 /* Force update if switching back to member */
30873126 update_cpumasks_hier (cs , & tmpmask , !new_prs ? HIER_CHECKALL : 0 );
@@ -4370,6 +4409,30 @@ void cpuset_force_rebuild(void)
43704409 force_rebuild = true;
43714410}
43724411
4412+ /*
4413+ * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
4414+ * progress.
4415+ * Return: true if successful, false otherwise
4416+ *
4417+ * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
4418+ * cpus_read_trylock() is used here to acquire the lock.
4419+ */
4420+ static bool cpuset_hotplug_cpus_read_trylock (void )
4421+ {
4422+ int retries = 0 ;
4423+
4424+ while (!cpus_read_trylock ()) {
4425+ /*
4426+ * CPU hotplug still in progress. Retry 5 times
4427+ * with a 10ms wait before bailing out.
4428+ */
4429+ if (++ retries > 5 )
4430+ return false;
4431+ msleep (10 );
4432+ }
4433+ return true;
4434+ }
4435+
43734436/**
43744437 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
43754438 * @cs: cpuset in interest
@@ -4386,6 +4449,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
43864449 bool cpus_updated ;
43874450 bool mems_updated ;
43884451 bool remote ;
4452+ int partcmd = -1 ;
43894453 struct cpuset * parent ;
43904454retry :
43914455 wait_event (cpuset_attach_wq , cs -> attach_in_progress == 0 );
@@ -4417,11 +4481,13 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
44174481 compute_partition_effective_cpumask (cs , & new_cpus );
44184482
44194483 if (remote && cpumask_empty (& new_cpus ) &&
4420- partition_is_populated (cs , NULL )) {
4484+ partition_is_populated (cs , NULL ) &&
4485+ cpuset_hotplug_cpus_read_trylock ()) {
44214486 remote_partition_disable (cs , tmp );
44224487 compute_effective_cpumask (& new_cpus , cs , parent );
44234488 remote = false;
44244489 cpuset_force_rebuild ();
4490+ cpus_read_unlock ();
44254491 }
44264492
44274493 /*
@@ -4432,18 +4498,28 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
44324498 * partitions.
44334499 */
44344500 if (is_local_partition (cs ) && (!is_partition_valid (parent ) ||
4435- tasks_nocpu_error (parent , cs , & new_cpus ))) {
4436- update_parent_effective_cpumask (cs , partcmd_invalidate , NULL , tmp );
4437- compute_effective_cpumask (& new_cpus , cs , parent );
4438- cpuset_force_rebuild ();
4439- }
4501+ tasks_nocpu_error (parent , cs , & new_cpus )))
4502+ partcmd = partcmd_invalidate ;
44404503 /*
44414504 * On the other hand, an invalid partition root may be transitioned
44424505 * back to a regular one.
44434506 */
4444- else if (is_partition_valid (parent ) && is_partition_invalid (cs )) {
4445- update_parent_effective_cpumask (cs , partcmd_update , NULL , tmp );
4446- if (is_partition_valid (cs )) {
4507+ else if (is_partition_valid (parent ) && is_partition_invalid (cs ))
4508+ partcmd = partcmd_update ;
4509+
4510+ /*
4511+ * cpus_read_lock needs to be held before calling
4512+ * update_parent_effective_cpumask(). To avoid circular lock
4513+ * dependency between cpuset_mutex and cpus_read_lock,
4514+ * cpus_read_trylock() is used here to acquire the lock.
4515+ */
4516+ if (partcmd >= 0 ) {
4517+ if (!cpuset_hotplug_cpus_read_trylock ())
4518+ goto update_tasks ;
4519+
4520+ update_parent_effective_cpumask (cs , partcmd , NULL , tmp );
4521+ cpus_read_unlock ();
4522+ if ((partcmd == partcmd_invalidate ) || is_partition_valid (cs )) {
44474523 compute_partition_effective_cpumask (cs , & new_cpus );
44484524 cpuset_force_rebuild ();
44494525 }
0 commit comments