@@ -284,6 +284,16 @@ struct wq_flusher {
284284
285285struct wq_device ;
286286
/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 */
struct wq_node_nr_active {
	atomic_t	nr;		/* per-node nr_active count */
};
296+
287297/*
288298 * The externally visible workqueue. It relays the issued work items to
289299 * the appropriate worker_pool through its pool_workqueues.
@@ -330,6 +340,7 @@ struct workqueue_struct {
330340 /* hot fields used during command issue, aligned to cacheline */
331341 unsigned int flags ____cacheline_aligned ; /* WQ: WQ_* flags */
332342 struct pool_workqueue __percpu __rcu * * cpu_pwq ; /* I: per-cpu pwqs */
343+ struct wq_node_nr_active * node_nr_active []; /* I: per-node nr_active */
333344};
334345
335346static struct kmem_cache * pwq_cache ;
@@ -1425,6 +1436,31 @@ work_func_t wq_worker_last_func(struct task_struct *task)
14251436 return worker -> last_func ;
14261437}
14271438
1439+ /**
1440+ * wq_node_nr_active - Determine wq_node_nr_active to use
1441+ * @wq: workqueue of interest
1442+ * @node: NUMA node, can be %NUMA_NO_NODE
1443+ *
1444+ * Determine wq_node_nr_active to use for @wq on @node. Returns:
1445+ *
1446+ * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
1447+ *
1448+ * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
1449+ *
1450+ * - Otherwise, node_nr_active[@node].
1451+ */
1452+ static struct wq_node_nr_active * wq_node_nr_active (struct workqueue_struct * wq ,
1453+ int node )
1454+ {
1455+ if (!(wq -> flags & WQ_UNBOUND ))
1456+ return NULL ;
1457+
1458+ if (node == NUMA_NO_NODE )
1459+ node = nr_node_ids ;
1460+
1461+ return wq -> node_nr_active [node ];
1462+ }
1463+
14281464/**
14291465 * get_pwq - get an extra reference on the specified pool_workqueue
14301466 * @pwq: pool_workqueue to get
@@ -1506,12 +1542,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
15061542 struct work_struct * work )
15071543{
15081544 struct worker_pool * pool = pwq -> pool ;
1545+ struct wq_node_nr_active * nna ;
15091546
15101547 lockdep_assert_held (& pool -> lock );
15111548
15121549 if (!(* work_data_bits (work ) & WORK_STRUCT_INACTIVE ))
15131550 return false;
15141551
1552+ nna = wq_node_nr_active (pwq -> wq , pool -> node );
1553+ if (nna )
1554+ atomic_inc (& nna -> nr );
1555+
15151556 pwq -> nr_active ++ ;
15161557 __pwq_activate_work (pwq , work );
15171558 return true;
@@ -1528,14 +1569,18 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
15281569{
15291570 struct workqueue_struct * wq = pwq -> wq ;
15301571 struct worker_pool * pool = pwq -> pool ;
1572+ struct wq_node_nr_active * nna = wq_node_nr_active (wq , pool -> node );
15311573 bool obtained ;
15321574
15331575 lockdep_assert_held (& pool -> lock );
15341576
15351577 obtained = pwq -> nr_active < READ_ONCE (wq -> max_active );
15361578
1537- if (obtained )
1579+ if (obtained ) {
15381580 pwq -> nr_active ++ ;
1581+ if (nna )
1582+ atomic_inc (& nna -> nr );
1583+ }
15391584 return obtained ;
15401585}
15411586
@@ -1572,10 +1617,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
15721617static void pwq_dec_nr_active (struct pool_workqueue * pwq )
15731618{
15741619 struct worker_pool * pool = pwq -> pool ;
1620+ struct wq_node_nr_active * nna = wq_node_nr_active (pwq -> wq , pool -> node );
15751621
15761622 lockdep_assert_held (& pool -> lock );
15771623
1624+ /*
1625+ * @pwq->nr_active should be decremented for both percpu and unbound
1626+ * workqueues.
1627+ */
15781628 pwq -> nr_active -- ;
1629+
1630+ /*
1631+ * For a percpu workqueue, it's simple. Just need to kick the first
1632+ * inactive work item on @pwq itself.
1633+ */
1634+ if (!nna ) {
1635+ pwq_activate_first_inactive (pwq );
1636+ return ;
1637+ }
1638+
1639+ atomic_dec (& nna -> nr );
15791640 pwq_activate_first_inactive (pwq );
15801641}
15811642
@@ -4039,11 +4100,63 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
40394100}
40404101#endif
40414102
4103+ static void free_node_nr_active (struct wq_node_nr_active * * nna_ar )
4104+ {
4105+ int node ;
4106+
4107+ for_each_node (node ) {
4108+ kfree (nna_ar [node ]);
4109+ nna_ar [node ] = NULL ;
4110+ }
4111+
4112+ kfree (nna_ar [nr_node_ids ]);
4113+ nna_ar [nr_node_ids ] = NULL ;
4114+ }
4115+
/* Reset a wq_node_nr_active's counter before it is put into use. */
static void init_node_nr_active(struct wq_node_nr_active *nna)
{
	atomic_set(&nna->nr, 0);
}
4120+
4121+ /*
4122+ * Each node's nr_active counter will be accessed mostly from its own node and
4123+ * should be allocated in the node.
4124+ */
4125+ static int alloc_node_nr_active (struct wq_node_nr_active * * nna_ar )
4126+ {
4127+ struct wq_node_nr_active * nna ;
4128+ int node ;
4129+
4130+ for_each_node (node ) {
4131+ nna = kzalloc_node (sizeof (* nna ), GFP_KERNEL , node );
4132+ if (!nna )
4133+ goto err_free ;
4134+ init_node_nr_active (nna );
4135+ nna_ar [node ] = nna ;
4136+ }
4137+
4138+ /* [nr_node_ids] is used as the fallback */
4139+ nna = kzalloc_node (sizeof (* nna ), GFP_KERNEL , NUMA_NO_NODE );
4140+ if (!nna )
4141+ goto err_free ;
4142+ init_node_nr_active (nna );
4143+ nna_ar [nr_node_ids ] = nna ;
4144+
4145+ return 0 ;
4146+
4147+ err_free :
4148+ free_node_nr_active (nna_ar );
4149+ return - ENOMEM ;
4150+ }
4151+
40424152static void rcu_free_wq (struct rcu_head * rcu )
40434153{
40444154 struct workqueue_struct * wq =
40454155 container_of (rcu , struct workqueue_struct , rcu );
40464156
4157+ if (wq -> flags & WQ_UNBOUND )
4158+ free_node_nr_active (wq -> node_nr_active );
4159+
40474160 wq_free_lockdep (wq );
40484161 free_percpu (wq -> cpu_pwq );
40494162 free_workqueue_attrs (wq -> unbound_attrs );
@@ -4785,7 +4898,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
47854898{
47864899 va_list args ;
47874900 struct workqueue_struct * wq ;
4788- int len ;
4901+ size_t wq_size ;
4902+ int name_len ;
47894903
47904904 /*
47914905 * Unbound && max_active == 1 used to imply ordered, which is no longer
@@ -4801,7 +4915,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
48014915 flags |= WQ_UNBOUND ;
48024916
48034917 /* allocate wq and format name */
4804- wq = kzalloc (sizeof (* wq ), GFP_KERNEL );
4918+ if (flags & WQ_UNBOUND )
4919+ wq_size = struct_size (wq , node_nr_active , nr_node_ids + 1 );
4920+ else
4921+ wq_size = sizeof (* wq );
4922+
4923+ wq = kzalloc (wq_size , GFP_KERNEL );
48054924 if (!wq )
48064925 return NULL ;
48074926
@@ -4812,11 +4931,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
48124931 }
48134932
48144933 va_start (args , max_active );
4815- len = vsnprintf (wq -> name , sizeof (wq -> name ), fmt , args );
4934+ name_len = vsnprintf (wq -> name , sizeof (wq -> name ), fmt , args );
48164935 va_end (args );
48174936
4818- if (len >= WQ_NAME_LEN )
4819- pr_warn_once ("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n" , wq -> name );
4937+ if (name_len >= WQ_NAME_LEN )
4938+ pr_warn_once ("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n" ,
4939+ wq -> name );
48204940
48214941 max_active = max_active ?: WQ_DFL_ACTIVE ;
48224942 max_active = wq_clamp_max_active (max_active , flags , wq -> name );
@@ -4835,8 +4955,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
48354955 wq_init_lockdep (wq );
48364956 INIT_LIST_HEAD (& wq -> list );
48374957
4958+ if (flags & WQ_UNBOUND ) {
4959+ if (alloc_node_nr_active (wq -> node_nr_active ) < 0 )
4960+ goto err_unreg_lockdep ;
4961+ }
4962+
48384963 if (alloc_and_link_pwqs (wq ) < 0 )
4839- goto err_unreg_lockdep ;
4964+ goto err_free_node_nr_active ;
48404965
48414966 if (wq_online && init_rescuer (wq ) < 0 )
48424967 goto err_destroy ;
@@ -4861,6 +4986,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
48614986
48624987 return wq ;
48634988
4989+ err_free_node_nr_active :
4990+ if (wq -> flags & WQ_UNBOUND )
4991+ free_node_nr_active (wq -> node_nr_active );
48644992err_unreg_lockdep :
48654993 wq_unregister_lockdep (wq );
48664994 wq_free_lockdep (wq );
0 commit comments