@@ -201,6 +201,14 @@ struct cpuset {
 	struct list_head remote_sibling;
 };
 
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
+ */
+struct cpuset_remove_tasks_struct {
+	struct work_struct work;
+	struct cpuset *cs;
+};
+
 /*
  * Exclusive CPUs distributed out to sub-partitions of top_cpuset
  */
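Aside: the struct added above follows the kernel's standard deferred-work idiom: embed a work_struct next to the data the handler needs, then recover the wrapper with container_of() inside the handler. A minimal, self-contained sketch of the idiom, with hypothetical demo_* names that are not part of this patch:

#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_async_ctx {
	struct work_struct work;	/* must be embedded, not a pointer */
	int payload;			/* whatever the handler needs */
};

static void demo_workfn(struct work_struct *work)
{
	/* Recover the wrapper from the embedded work_struct. */
	struct demo_async_ctx *ctx =
		container_of(work, struct demo_async_ctx, work);

	pr_info("deferred payload: %d\n", ctx->payload);
	kfree(ctx);			/* the handler owns and frees the wrapper */
}

static int demo_defer(int payload)
{
	struct demo_async_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return -ENOMEM;
	ctx->payload = payload;
	INIT_WORK(&ctx->work, demo_workfn);
	schedule_work(&ctx->work);	/* queue on the system workqueue */
	return 0;
}

cpuset_migrate_tasks_workfn(), added later in this patch, is exactly this shape.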
@@ -449,12 +457,6 @@ static DEFINE_SPINLOCK(callback_lock);
 
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 
 static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +542,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
 	rcu_read_lock();
 	cs = task_cs(tsk);
 
-	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+	while (!cpumask_intersects(cs->effective_cpus, pmask))
 		cs = parent_cs(cs);
-		if (unlikely(!cs)) {
-			/*
-			 * The top cpuset doesn't have any online cpu as a
-			 * consequence of a race between cpuset_hotplug_work
-			 * and cpu hotplug notifier.  But we know the top
-			 * cpuset's effective_cpus is on its way to be
-			 * identical to cpu_online_mask.
-			 */
-			goto out_unlock;
-		}
-	}
-	cpumask_and(pmask, pmask, cs->effective_cpus);
 
-out_unlock:
+	cpumask_and(pmask, pmask, cs->effective_cpus);
 	rcu_read_unlock();
 }
 
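The dropped NULL check relied on a race that no longer exists: hotplug processing now runs synchronously while the CPU-hotplug lock is held, so top_cpuset's effective_cpus can no longer lag behind cpu_online_mask, and the ancestor walk always terminates before parent_cs() returns NULL. Schematically (the WARN_ON_ONCE is a hypothetical annotation, not in the patch):

	/*
	 * Invariant behind the simplified loop: walking parent_cs()
	 * from any cpuset reaches top_cpuset, whose effective_cpus
	 * now always intersects pmask, so cs can never become NULL.
	 */
	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
		cs = parent_cs(cs);
		WARN_ON_ONCE(!cs);
	}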
@@ -1217,7 +1207,7 @@ static void rebuild_sched_domains_locked(void)
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
 	 * passing doms with offlined cpu to partition_sched_domains().
-	 * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+	 * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
 	 *
 	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
 	 * should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1250,17 @@ static void rebuild_sched_domains_locked(void)
 }
 #endif /* CONFIG_SMP */
 
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
-	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
 	mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+	cpus_read_lock();
+	rebuild_sched_domains_cpuslocked();
 	cpus_read_unlock();
 }
 
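The split mirrors the kernel's *_cpuslocked naming convention: the new static helper assumes the caller already holds the CPU-hotplug read lock, while the public wrapper acquires and releases it. A hedged sketch of the convention, using hypothetical demo_* names (the real helper above depends on the same ordering, cpus lock before cpuset_mutex, although this patch does not add an assertion to it):

#include <linux/cpu.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);

/*
 * Caller must hold cpus_read_lock(). Lock order is the CPU-hotplug
 * lock first, then demo_mutex, never the reverse.
 */
static void demo_rebuild_cpuslocked(void)
{
	lockdep_assert_cpus_held();
	mutex_lock(&demo_mutex);
	/* ... rebuild state that depends on the online CPU set ... */
	mutex_unlock(&demo_mutex);
}

/* Public entry point for callers that hold no locks. */
void demo_rebuild(void)
{
	cpus_read_lock();
	demo_rebuild_cpuslocked();
	cpus_read_unlock();
}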
@@ -2079,14 +2074,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 
 	/*
 	 * For partcmd_update without newmask, it is being called from
-	 * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
-	 * Update the load balance flag and scheduling domain if
-	 * cpus_read_trylock() is successful.
+	 * cpuset_handle_hotplug(). Update the load balance flag and
+	 * scheduling domain accordingly.
 	 */
-	if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+	if ((cmd == partcmd_update) && !newmask)
 		update_partition_sd_lb(cs, old_prs);
-		cpus_read_unlock();
-	}
 
 	notify_partition_change(cs, old_prs);
 	return 0;
@@ -3599,8 +3591,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	 * proceeding, so that we don't end up keep removing tasks added
 	 * after execution capability is restored.
 	 *
-	 * cpuset_hotplug_work calls back into cgroup core via
-	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+	 * cpuset_handle_hotplug may call back into cgroup core asynchronously
+	 * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
 	 * operation like this one can lead to a deadlock through kernfs
 	 * active_ref protection.  Let's break the protection.  Losing the
 	 * protection is okay as we check whether @cs is online after
@@ -3609,7 +3601,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	 */
 	css_get(&cs->css);
 	kernfs_break_active_protection(of->kn);
-	flush_work(&cpuset_hotplug_work);
 
 	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
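With the work item gone there is nothing left to flush, and only the kernfs side of the dance remains. Roughly, the break/unbreak pairing in cpuset_write_resmask() works as below; this is a condensed sketch of the surrounding function, not a verbatim quote:

	/*
	 * Pin the css so the cpuset cannot be freed, then drop the
	 * kernfs active reference that this write handler holds; that
	 * breaks the kernfs -> cpuset_mutex dependency chain.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);

	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))	/* recheck: protection was dropped */
		goto out_unlock;
	/* ... apply the cpus/mems update ... */
out_unlock:
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();

	/* Restore protection and drop the pin, in reverse order. */
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);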
@@ -4354,6 +4345,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 	}
 }
 
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+	struct cpuset_remove_tasks_struct *s;
+
+	s = container_of(work, struct cpuset_remove_tasks_struct, work);
+	remove_tasks_in_empty_cpuset(s->cs);
+	css_put(&s->cs->css);
+	kfree(s);
+}
+
 static void
 hotplug_update_tasks_legacy(struct cpuset *cs,
 			    struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4384,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	/*
 	 * Move tasks to the nearest ancestor with execution resources,
 	 * This is full cgroup operation which will also call back into
-	 * cpuset. Should be done outside any lock.
+	 * cpuset. Execute it asynchronously using workqueue.
 	 */
-	if (is_empty) {
-		mutex_unlock(&cpuset_mutex);
-		remove_tasks_in_empty_cpuset(cs);
-		mutex_lock(&cpuset_mutex);
+	if (is_empty && cs->css.cgroup->nr_populated_csets &&
+	    css_tryget_online(&cs->css)) {
+		struct cpuset_remove_tasks_struct *s;
+
+		s = kzalloc(sizeof(*s), GFP_KERNEL);
+		if (WARN_ON_ONCE(!s)) {
+			css_put(&cs->css);
+			return;
+		}
+
+		s->cs = cs;
+		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+		schedule_work(&s->work);
 	}
 }
 
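The reference counting across this handoff is the subtle part: css_tryget_online() pins the cpuset on behalf of the work item, and exactly one css_put() runs on every path. The same added code, annotated (the comments are editorial, not in the patch):

	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {	/* +1 ref, owned by the work */
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
			css_put(&cs->css);	/* -1 ref: work never queued */
			return;
		}
		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
		/* cpuset_migrate_tasks_workfn() drops the ref and frees s */
	}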
@@ -4421,30 +4431,6 @@ void cpuset_force_rebuild(void)
 	force_rebuild = true;
 }
 
-/*
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
- * progress.
- * Return: true if successful, false otherwise
- *
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
-static bool cpuset_hotplug_cpus_read_trylock(void)
-{
-	int retries = 0;
-
-	while (!cpus_read_trylock()) {
-		/*
-		 * CPU hotplug still in progress. Retry 5 times
-		 * with a 10ms wait before bailing out.
-		 */
-		if (++retries > 5)
-			return false;
-		msleep(10);
-	}
-	return true;
-}
-
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -4493,13 +4479,11 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	compute_partition_effective_cpumask(cs, &new_cpus);
 
 	if (remote && cpumask_empty(&new_cpus) &&
-	    partition_is_populated(cs, NULL) &&
-	    cpuset_hotplug_cpus_read_trylock()) {
+	    partition_is_populated(cs, NULL)) {
 		remote_partition_disable(cs, tmp);
 		compute_effective_cpumask(&new_cpus, cs, parent);
 		remote = false;
 		cpuset_force_rebuild();
-		cpus_read_unlock();
 	}
 
 	/*
@@ -4519,18 +4503,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	else if (is_partition_valid(parent) && is_partition_invalid(cs))
 		partcmd = partcmd_update;
 
-	/*
-	 * cpus_read_lock needs to be held before calling
-	 * update_parent_effective_cpumask(). To avoid circular lock
-	 * dependency between cpuset_mutex and cpus_read_lock,
-	 * cpus_read_trylock() is used here to acquire the lock.
-	 */
 	if (partcmd >= 0) {
-		if (!cpuset_hotplug_cpus_read_trylock())
-			goto update_tasks;
-
 		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
-		cpus_read_unlock();
 		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
 			compute_partition_effective_cpumask(cs, &new_cpus);
 			cpuset_force_rebuild();
@@ -4558,8 +4532,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 }
 
 /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- * @work: unused
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
  * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -4573,8 +4546,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
  *
  * Note that CPU offlining during suspend is ignored.  We don't modify
  * cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
  */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -4585,6 +4560,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
 		ptmp = &tmp;
 
+	lockdep_assert_cpus_held();
 	mutex_lock(&cpuset_mutex);
 
 	/* fetch the available cpus/mems and find out which changed how */
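lockdep_assert_cpus_held() is a no-op unless lockdep is enabled, in which case it complains if the caller holds neither cpus_read_lock() nor the hotplug writer lock. CPU hotplug callers satisfy it because they run inside the hotplug critical section; if memory serves, mem_hotplug_begin() also takes cpus_read_lock(), which would cover the memory notifier path as well, though that is worth verifying against the tree. A minimal sketch with hypothetical demo_* names:

#include <linux/cpu.h>

static void demo_handle_hotplug(void)
{
	/*
	 * Compiles away without lockdep; with it, splats if the
	 * calling context holds neither the read nor the write side
	 * of the CPU-hotplug lock.
	 */
	lockdep_assert_cpus_held();
	/* ... update state derived from cpu_active_mask ... */
}

static void demo_notifier(void)
{
	cpus_read_lock();	/* satisfy the assertion explicitly */
	demo_handle_hotplug();
	cpus_read_unlock();
}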
@@ -4666,7 +4642,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	/* rebuild sched domains if cpus_allowed has changed */
 	if (cpus_updated || force_rebuild) {
 		force_rebuild = false;
-		rebuild_sched_domains();
+		rebuild_sched_domains_cpuslocked();
 	}
 
 	free_cpumasks(NULL, ptmp);
@@ -4679,12 +4655,7 @@ void cpuset_update_active_cpus(void)
 	 * inside cgroup synchronization.  Bounce actual hotplug processing
 	 * to a work item to avoid reverse locking order.
 	 */
-	schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
-	flush_work(&cpuset_hotplug_work);
+	cpuset_handle_hotplug();
 }
 
 /*
@@ -4695,7 +4666,7 @@ void cpuset_wait_for_hotplug(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	schedule_work(&cpuset_hotplug_work);
+	cpuset_handle_hotplug();
 	return NOTIFY_OK;
 }
 