Skip to content

Commit cd546fa

Browse files
committed
Merge tag 'wq-for-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq
Pull workqueue updates from Tejun Heo:

 "Mostly changes from Petr to improve warning and error reporting.

  Workqueue now reports more of the relevant failures with better
  context which should help debugging"

* tag 'wq-for-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  - workqueue: Introduce show_freezable_workqueues
  - workqueue: Print backtraces from CPUs with hung CPU bound workqueues
  - workqueue: Warn when a rescuer could not be created
  - workqueue: Interrupted create_worker() is not a repeated event
  - workqueue: Warn when a new worker could not be created
  - workqueue: Fix hung time report of worker pools
  - workqueue: Simplify a pr_warn() call in wq_select_unbound_cpu()
  - MAINTAINERS: Add workqueue_internal.h to the WORKQUEUE entry
2 parents 89d77f7 + 704bc66 commit cd546fa

4 files changed

Lines changed: 124 additions & 14 deletions

File tree

MAINTAINERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22743,6 +22743,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git
2274322743
F: Documentation/core-api/workqueue.rst
2274422744
F: include/linux/workqueue.h
2274522745
F: kernel/workqueue.c
22746+
F: kernel/workqueue_internal.h
2274622747

2274722748
WWAN DRIVERS
2274822749
M: Loic Poulain <loic.poulain@linaro.org>

include/linux/workqueue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,7 @@ extern unsigned int work_busy(struct work_struct *work);
472472
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
473473
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
474474
extern void show_all_workqueues(void);
475+
extern void show_freezable_workqueues(void);
475476
extern void show_one_workqueue(struct workqueue_struct *wq);
476477
extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);
477478

kernel/power/process.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ static int try_to_freeze_tasks(bool user_only)
9393
todo - wq_busy, wq_busy);
9494

9595
if (wq_busy)
96-
show_all_workqueues();
96+
show_freezable_workqueues();
9797

9898
if (!wakeup || pm_debug_messages_on) {
9999
read_lock(&tasklist_lock);

kernel/workqueue.c

Lines changed: 121 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include <linux/moduleparam.h>
5050
#include <linux/uaccess.h>
5151
#include <linux/sched/isolation.h>
52+
#include <linux/sched/debug.h>
5253
#include <linux/nmi.h>
5354
#include <linux/kvm_para.h>
5455

@@ -141,6 +142,8 @@ enum {
141142
* WR: wq->mutex protected for writes. RCU protected for reads.
142143
*
143144
* MD: wq_mayday_lock protected.
145+
*
146+
* WD: Used internally by the watchdog.
144147
*/
145148

146149
/* struct worker is defined in workqueue_internal.h */
@@ -153,6 +156,7 @@ struct worker_pool {
153156
unsigned int flags; /* X: flags */
154157

155158
unsigned long watchdog_ts; /* L: watchdog timestamp */
159+
bool cpu_stall; /* WD: stalled cpu bound pool */
156160

157161
/*
158162
* The counter is incremented in a process context on the associated CPU
@@ -1392,15 +1396,13 @@ static bool is_chained_work(struct workqueue_struct *wq)
13921396
*/
13931397
static int wq_select_unbound_cpu(int cpu)
13941398
{
1395-
static bool printed_dbg_warning;
13961399
int new_cpu;
13971400

13981401
if (likely(!wq_debug_force_rr_cpu)) {
13991402
if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
14001403
return cpu;
1401-
} else if (!printed_dbg_warning) {
1402-
pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
1403-
printed_dbg_warning = true;
1404+
} else {
1405+
pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
14041406
}
14051407

14061408
if (cpumask_empty(wq_unbound_cpumask))
@@ -1938,12 +1940,17 @@ static struct worker *create_worker(struct worker_pool *pool)
19381940

19391941
/* ID is needed to determine kthread name */
19401942
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
1941-
if (id < 0)
1943+
if (id < 0) {
1944+
pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
1945+
ERR_PTR(id));
19421946
return NULL;
1947+
}
19431948

19441949
worker = alloc_worker(pool->node);
1945-
if (!worker)
1950+
if (!worker) {
1951+
pr_err_once("workqueue: Failed to allocate a worker\n");
19461952
goto fail;
1953+
}
19471954

19481955
worker->id = id;
19491956

@@ -1955,8 +1962,16 @@ static struct worker *create_worker(struct worker_pool *pool)
19551962

19561963
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
19571964
"kworker/%s", id_buf);
1958-
if (IS_ERR(worker->task))
1965+
if (IS_ERR(worker->task)) {
1966+
if (PTR_ERR(worker->task) == -EINTR) {
1967+
pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
1968+
id_buf);
1969+
} else {
1970+
pr_err_once("workqueue: Failed to create a worker thread: %pe",
1971+
worker->task);
1972+
}
19591973
goto fail;
1974+
}
19601975

19611976
set_user_nice(worker->task, pool->attrs->nice);
19621977
kthread_bind_mask(worker->task, pool->attrs->cpumask);
@@ -4380,13 +4395,18 @@ static int init_rescuer(struct workqueue_struct *wq)
43804395
return 0;
43814396

43824397
rescuer = alloc_worker(NUMA_NO_NODE);
4383-
if (!rescuer)
4398+
if (!rescuer) {
4399+
pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
4400+
wq->name);
43844401
return -ENOMEM;
4402+
}
43854403

43864404
rescuer->rescue_wq = wq;
43874405
rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
43884406
if (IS_ERR(rescuer->task)) {
43894407
ret = PTR_ERR(rescuer->task);
4408+
pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
4409+
wq->name, ERR_PTR(ret));
43904410
kfree(rescuer);
43914411
return ret;
43924412
}
@@ -5002,10 +5022,16 @@ static void show_one_worker_pool(struct worker_pool *pool)
50025022
struct worker *worker;
50035023
bool first = true;
50045024
unsigned long flags;
5025+
unsigned long hung = 0;
50055026

50065027
raw_spin_lock_irqsave(&pool->lock, flags);
50075028
if (pool->nr_workers == pool->nr_idle)
50085029
goto next_pool;
5030+
5031+
/* How long the first pending work is waiting for a worker. */
5032+
if (!list_empty(&pool->worklist))
5033+
hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
5034+
50095035
/*
50105036
* Defer printing to avoid deadlocks in console drivers that
50115037
* queue work while holding locks also taken in their write
@@ -5014,9 +5040,7 @@ static void show_one_worker_pool(struct worker_pool *pool)
50145040
printk_deferred_enter();
50155041
pr_info("pool %d:", pool->id);
50165042
pr_cont_pool_info(pool);
5017-
pr_cont(" hung=%us workers=%d",
5018-
jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
5019-
pool->nr_workers);
5043+
pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
50205044
if (pool->manager)
50215045
pr_cont(" manager: %d",
50225046
task_pid_nr(pool->manager->task));
@@ -5041,8 +5065,7 @@ static void show_one_worker_pool(struct worker_pool *pool)
50415065
/**
50425066
* show_all_workqueues - dump workqueue state
50435067
*
5044-
* Called from a sysrq handler or try_to_freeze_tasks() and prints out
5045-
* all busy workqueues and pools.
5068+
* Called from a sysrq handler and prints out all busy workqueues and pools.
50465069
*/
50475070
void show_all_workqueues(void)
50485071
{
@@ -5063,6 +5086,29 @@ void show_all_workqueues(void)
50635086
rcu_read_unlock();
50645087
}
50655088

5089+
/**
5090+
* show_freezable_workqueues - dump freezable workqueue state
5091+
*
5092+
* Called from try_to_freeze_tasks() and prints out all freezable workqueues
5093+
* still busy.
*
* Only workqueues with WQ_FREEZABLE set in their flags are passed to
* show_one_workqueue(); all others are skipped. The "still busy" filtering
* is presumably done inside show_one_workqueue(), which is not defined in
* this function - confirm there.
5094+
*/
5095+
void show_freezable_workqueues(void)
5096+
{
5097+
struct workqueue_struct *wq;
5098+
5099+
/* The global workqueues list is walked with the _rcu iterator below. */
rcu_read_lock();
5100+
5101+
/* Header line so the per-workqueue output that follows can be identified. */
pr_info("Showing freezable workqueues that are still busy:\n");
5102+
5103+
list_for_each_entry_rcu(wq, &workqueues, list) {
5104+
/* Workqueues without WQ_FREEZABLE are not reported here. */
if (!(wq->flags & WQ_FREEZABLE))
5105+
continue;
5106+
show_one_workqueue(wq);
5107+
}
5108+
5109+
rcu_read_unlock();
5110+
}
5111+
50665112
/* used to show worker information through /proc/PID/{comm,stat,status} */
50675113
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
50685114
{
@@ -5962,6 +6008,57 @@ static struct timer_list wq_watchdog_timer;
59626008
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
59636009
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
59646010

6011+
/*
6012+
* Show workers that might prevent the processing of pending work items.
6013+
* The only candidates are CPU-bound workers in the running state.
6014+
* Pending work items should be handled by another idle worker
6015+
* in all other situations.
*
* @pool: worker pool whose busy workers are inspected.
*
* Walks @pool->busy_hash with @pool->lock held and, for every worker whose
* task is reported as running by task_is_running(), prints the pool id
* followed by sched_show_task() output for that task.
6016+
*/
6017+
static void show_cpu_pool_hog(struct worker_pool *pool)
6018+
{
6019+
struct worker *worker;
6020+
unsigned long flags;
6021+
int bkt;
6022+
6023+
/* Hold pool->lock for the whole walk; the IRQ state is saved in flags. */
raw_spin_lock_irqsave(&pool->lock, flags);
6024+
6025+
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
6026+
/* Busy workers that are not in the running state are not reported. */
if (task_is_running(worker->task)) {
6027+
/*
6028+
* Defer printing to avoid deadlocks in console
6029+
* drivers that queue work while holding locks
6030+
* also taken in their write paths.
6031+
*/
6032+
printk_deferred_enter();
6033+
6034+
/* The "pool N:" line is emitted once per reported worker, not once per pool. */
pr_info("pool %d:\n", pool->id);
6035+
sched_show_task(worker->task);
6036+
6037+
printk_deferred_exit();
6038+
}
6039+
}
6040+
6041+
raw_spin_unlock_irqrestore(&pool->lock, flags);
6042+
}
6043+
6044+
/*
* Print backtraces of running workers for every pool that currently has
* its cpu_stall flag set. In this diff the flag is cleared and then set by
* wq_watchdog_timer_fn() for stalled pools with pool->cpu >= 0, and this
* function is called from there when at least one such pool was found.
*/
static void show_cpu_pools_hogs(void)
6045+
{
6046+
struct worker_pool *pool;
6047+
int pi;
6048+
6049+
/* Header line preceding the per-pool backtraces. */
pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
6050+
6051+
/* for_each_pool() is iterated inside an RCU read-side section. */
rcu_read_lock();
6052+
6053+
for_each_pool(pool, pi) {
6054+
/* Only pools flagged as stalled are dumped. */
if (pool->cpu_stall)
6055+
show_cpu_pool_hog(pool);
6056+
6057+
}
6058+
6059+
rcu_read_unlock();
6060+
}
6061+
59656062
static void wq_watchdog_reset_touched(void)
59666063
{
59676064
int cpu;
@@ -5975,6 +6072,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
59756072
{
59766073
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
59776074
bool lockup_detected = false;
6075+
bool cpu_pool_stall = false;
59786076
unsigned long now = jiffies;
59796077
struct worker_pool *pool;
59806078
int pi;
@@ -5987,6 +6085,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
59876085
for_each_pool(pool, pi) {
59886086
unsigned long pool_ts, touched, ts;
59896087

6088+
pool->cpu_stall = false;
59906089
if (list_empty(&pool->worklist))
59916090
continue;
59926091

@@ -6011,18 +6110,27 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
60116110
/* did we stall? */
60126111
if (time_after(now, ts + thresh)) {
60136112
lockup_detected = true;
6113+
if (pool->cpu >= 0) {
6114+
pool->cpu_stall = true;
6115+
cpu_pool_stall = true;
6116+
}
60146117
pr_emerg("BUG: workqueue lockup - pool");
60156118
pr_cont_pool_info(pool);
60166119
pr_cont(" stuck for %us!\n",
60176120
jiffies_to_msecs(now - pool_ts) / 1000);
60186121
}
6122+
6123+
60196124
}
60206125

60216126
rcu_read_unlock();
60226127

60236128
if (lockup_detected)
60246129
show_all_workqueues();
60256130

6131+
if (cpu_pool_stall)
6132+
show_cpu_pools_hogs();
6133+
60266134
wq_watchdog_reset_touched();
60276135
mod_timer(&wq_watchdog_timer, jiffies + thresh);
60286136
}

0 commit comments

Comments
 (0)