@@ -179,6 +179,7 @@ struct worker_pool {
179179
180180 struct worker * manager ; /* L: purely informational */
181181 struct list_head workers ; /* A: attached workers */
182+ struct list_head dying_workers ; /* A: workers about to die */
182183 struct completion * detach_completion ; /* all workers detached */
183184
184185 struct ida worker_ida ; /* worker IDs for task name */
@@ -1906,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)
19061907 list_del (& worker -> node );
19071908 worker -> pool = NULL ;
19081909
1909- if (list_empty (& pool -> workers ))
1910+ if (list_empty (& pool -> workers ) && list_empty ( & pool -> dying_workers ) )
19101911 detach_completion = pool -> detach_completion ;
19111912 mutex_unlock (& wq_pool_attach_mutex );
19121913
@@ -1995,21 +1996,44 @@ static void rebind_worker(struct worker *worker, struct worker_pool *pool)
19951996 WARN_ON_ONCE (set_cpus_allowed_ptr (worker -> task , pool -> attrs -> cpumask ) < 0 );
19961997}
19971998
1999+ static void wake_dying_workers (struct list_head * cull_list )
2000+ {
2001+ struct worker * worker , * tmp ;
2002+
2003+ list_for_each_entry_safe (worker , tmp , cull_list , entry ) {
2004+ list_del_init (& worker -> entry );
2005+ unbind_worker (worker );
2006+ /*
2007+ * If the worker was somehow already running, then it had to be
2008+ * in pool->idle_list when set_worker_dying() happened or we
2009+ * wouldn't have gotten here.
2010+ *
2011+ * Thus, the worker must either have observed the WORKER_DIE
2012+ * flag, or have set its state to TASK_IDLE. Either way, the
2013+ * below will be observed by the worker and is safe to do
2014+ * outside of pool->lock.
2015+ */
2016+ wake_up_process (worker -> task );
2017+ }
2018+ }
2019+
19982020/**
1999- * destroy_worker - destroy a workqueue worker
2021+ * set_worker_dying - Tag a worker for destruction
20002022 * @worker: worker to be destroyed
2023+ * @list: transfer worker away from its pool->idle_list and into list
20012024 *
2002- * Destroy @worker and adjust @pool stats accordingly. The worker should
2003- * be idle.
2025+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
2026+ * should be idle.
20042027 *
20052028 * CONTEXT:
20062029 * raw_spin_lock_irq(pool->lock).
20072030 */
2008- static void destroy_worker (struct worker * worker )
2031+ static void set_worker_dying (struct worker * worker , struct list_head * list )
20092032{
20102033 struct worker_pool * pool = worker -> pool ;
20112034
20122035 lockdep_assert_held (& pool -> lock );
2036+ lockdep_assert_held (& wq_pool_attach_mutex );
20132037
20142038 /* sanity check frenzy */
20152039 if (WARN_ON (worker -> current_work ) ||
@@ -2020,9 +2044,10 @@ static void destroy_worker(struct worker *worker)
20202044 pool -> nr_workers -- ;
20212045 pool -> nr_idle -- ;
20222046
2023- list_del_init (& worker -> entry );
20242047 worker -> flags |= WORKER_DIE ;
2025- wake_up_process (worker -> task );
2048+
2049+ list_move (& worker -> entry , list );
2050+ list_move (& worker -> node , & pool -> dying_workers );
20262051}
20272052
20282053/**
@@ -2069,11 +2094,24 @@ static void idle_worker_timeout(struct timer_list *t)
20692094 *
20702095 * This goes through a pool's idle workers and gets rid of those that have been
20712096 * idle for at least IDLE_WORKER_TIMEOUT seconds.
2097+ *
2098+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
2099+ * culled, so this also resets worker affinity. This requires a sleepable
2100+ * context, hence the split between timer callback and work item.
20722101 */
20732102static void idle_cull_fn (struct work_struct * work )
20742103{
20752104 struct worker_pool * pool = container_of (work , struct worker_pool , idle_cull_work );
2105+ struct list_head cull_list ;
20762106
2107+ INIT_LIST_HEAD (& cull_list );
2108+ /*
2109+ * Grabbing wq_pool_attach_mutex here ensures an already-running worker
2110+ * cannot proceed beyond worker_detach_from_pool() in its self-destruct
2111+ * path. This is required as a previously-preempted worker could run after
2112+ * set_worker_dying() has happened but before wake_dying_workers() did.
2113+ */
2114+ mutex_lock (& wq_pool_attach_mutex );
20772115 raw_spin_lock_irq (& pool -> lock );
20782116
20792117 while (too_many_workers (pool )) {
@@ -2088,10 +2126,12 @@ static void idle_cull_fn(struct work_struct *work)
20882126 break ;
20892127 }
20902128
2091- destroy_worker (worker );
2129+ set_worker_dying (worker , & cull_list );
20922130 }
20932131
20942132 raw_spin_unlock_irq (& pool -> lock );
2133+ wake_dying_workers (& cull_list );
2134+ mutex_unlock (& wq_pool_attach_mutex );
20952135}
20962136
20972137static void send_mayday (struct work_struct * work )
@@ -2455,12 +2495,12 @@ static int worker_thread(void *__worker)
24552495 /* am I supposed to die? */
24562496 if (unlikely (worker -> flags & WORKER_DIE )) {
24572497 raw_spin_unlock_irq (& pool -> lock );
2458- WARN_ON_ONCE (!list_empty (& worker -> entry ));
24592498 set_pf_worker (false);
24602499
24612500 set_task_comm (worker -> task , "kworker/dying" );
24622501 ida_free (& pool -> worker_ida , worker -> id );
24632502 worker_detach_from_pool (worker );
2503+ WARN_ON_ONCE (!list_empty (& worker -> entry ));
24642504 kfree (worker );
24652505 return 0 ;
24662506 }
@@ -3534,6 +3574,7 @@ static int init_worker_pool(struct worker_pool *pool)
35343574 timer_setup (& pool -> mayday_timer , pool_mayday_timeout , 0 );
35353575
35363576 INIT_LIST_HEAD (& pool -> workers );
3577+ INIT_LIST_HEAD (& pool -> dying_workers );
35373578
35383579 ida_init (& pool -> worker_ida );
35393580 INIT_HLIST_NODE (& pool -> hash_node );
@@ -3622,8 +3663,11 @@ static void rcu_free_pool(struct rcu_head *rcu)
36223663static void put_unbound_pool (struct worker_pool * pool )
36233664{
36243665 DECLARE_COMPLETION_ONSTACK (detach_completion );
3666+ struct list_head cull_list ;
36253667 struct worker * worker ;
36263668
3669+ INIT_LIST_HEAD (& cull_list );
3670+
36273671 lockdep_assert_held (& wq_pool_mutex );
36283672
36293673 if (-- pool -> refcnt )
@@ -3656,21 +3700,25 @@ static void put_unbound_pool(struct worker_pool *pool)
36563700 rcuwait_wait_event (& manager_wait ,
36573701 !(pool -> flags & POOL_MANAGER_ACTIVE ),
36583702 TASK_UNINTERRUPTIBLE );
3703+
3704+ mutex_lock (& wq_pool_attach_mutex );
36593705 raw_spin_lock_irq (& pool -> lock );
36603706 if (!(pool -> flags & POOL_MANAGER_ACTIVE )) {
36613707 pool -> flags |= POOL_MANAGER_ACTIVE ;
36623708 break ;
36633709 }
36643710 raw_spin_unlock_irq (& pool -> lock );
3711+ mutex_unlock (& wq_pool_attach_mutex );
36653712 }
36663713
36673714 while ((worker = first_idle_worker (pool )))
3668- destroy_worker (worker );
3715+ set_worker_dying (worker , & cull_list );
36693716 WARN_ON (pool -> nr_workers || pool -> nr_idle );
36703717 raw_spin_unlock_irq (& pool -> lock );
36713718
3672- mutex_lock (& wq_pool_attach_mutex );
3673- if (!list_empty (& pool -> workers ))
3719+ wake_dying_workers (& cull_list );
3720+
3721+ if (!list_empty (& pool -> workers ) || !list_empty (& pool -> dying_workers ))
36743722 pool -> detach_completion = & detach_completion ;
36753723 mutex_unlock (& wq_pool_attach_mutex );
36763724
0 commit comments