Commit cd2440d

pmladek authored and htejun committed
workqueue: Print backtraces from CPUs with hung CPU bound workqueues
The workqueue watchdog reports a lockup when there has been no progress in a worker pool for a long time, where progress means that a pending work item starts being processed.

Worker pools for unbound workqueues always wake an idle worker and try to process the work immediately; the last idle worker has to create a new worker first. A stall can happen only when a new worker could not be created, in which case an error should get printed. The other possibility is excessive load, in which case the workers are victims of a global system problem.

Worker pools for CPU-bound workqueues are designed for lightweight work items that do not need much CPU time. Their work items are processed one by one on a single worker; a new worker is used only when a work item sleeps. This creates one additional scenario: a stall can happen when a CPU-bound workqueue is used for CPU-intensive work. More precisely, the stall is detected when a CPU-bound worker has been in the TASK_RUNNING state for too long. In this case, it is useful to see the backtrace from the problematic worker.

Information about how long a worker has been in the running state is not available, but CPU-bound worker pools do not have many workers in the running state by definition, and only a few pools are typically blocked. It should therefore be acceptable to print backtraces from all workers in the TASK_RUNNING state in the stalled worker pools; the number of false positives should be very low.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
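As an illustration of the scenario described above, here is a minimal, hypothetical reproducer: a work item that burns CPU on a per-CPU (CPU-bound) workqueue for longer than the watchdog threshold (30s by default). The module and the 60-second busy-wait are invented for this sketch; queue_work_on(), system_wq, DECLARE_WORK(), and cancel_work_sync() are standard kernel APIs.

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

/* Busy-wait in TASK_RUNNING well past wq_watchdog_thresh. */
static void hog_fn(struct work_struct *work)
{
	mdelay(60 * 1000);	/* never sleeps, so no extra worker is created */
}

static DECLARE_WORK(hog_work, hog_fn);

static int __init hog_init(void)
{
	/* system_wq is per-CPU (CPU bound), unlike system_unbound_wq. */
	queue_work_on(0, system_wq, &hog_work);
	return 0;
}

static void __exit hog_exit(void)
{
	cancel_work_sync(&hog_work);
}

module_init(hog_init);
module_exit(hog_exit);
MODULE_LICENSE("GPL");

With this patch applied, the watchdog would report the lockup and, because the stalled pool is CPU bound, also dump the backtrace of the busy worker, pointing straight at hog_fn().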
1 parent 4c0736a commit cd2440d

1 file changed: kernel/workqueue.c

Lines changed: 66 additions & 0 deletions
@@ -49,6 +49,7 @@
 #include <linux/moduleparam.h>
 #include <linux/uaccess.h>
 #include <linux/sched/isolation.h>
+#include <linux/sched/debug.h>
 #include <linux/nmi.h>
 #include <linux/kvm_para.h>
 
@@ -141,6 +142,8 @@ enum {
  * WR: wq->mutex protected for writes. RCU protected for reads.
  *
  * MD: wq_mayday_lock protected.
+ *
+ * WD: Used internally by the watchdog.
  */
 
 /* struct worker is defined in workqueue_internal.h */
@@ -153,6 +156,7 @@ struct worker_pool {
 	unsigned int		flags;		/* X: flags */
 
 	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
+	bool			cpu_stall;	/* WD: stalled cpu bound pool */
 
 	/*
 	 * The counter is incremented in a process context on the associated CPU
@@ -5976,6 +5980,57 @@ static struct timer_list wq_watchdog_timer;
 static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
 
+/*
+ * Show workers that might prevent the processing of pending work items.
+ * The only candidates are CPU-bound workers in the running state.
+ * Pending work items should be handled by another idle worker
+ * in all other situations.
+ */
+static void show_cpu_pool_hog(struct worker_pool *pool)
+{
+	struct worker *worker;
+	unsigned long flags;
+	int bkt;
+
+	raw_spin_lock_irqsave(&pool->lock, flags);
+
+	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+		if (task_is_running(worker->task)) {
+			/*
+			 * Defer printing to avoid deadlocks in console
+			 * drivers that queue work while holding locks
+			 * also taken in their write paths.
+			 */
+			printk_deferred_enter();
+
+			pr_info("pool %d:\n", pool->id);
+			sched_show_task(worker->task);
+
+			printk_deferred_exit();
+		}
+	}
+
+	raw_spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void show_cpu_pools_hogs(void)
+{
+	struct worker_pool *pool;
+	int pi;
+
+	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+
+	rcu_read_lock();
+
+	for_each_pool(pool, pi) {
+		if (pool->cpu_stall)
+			show_cpu_pool_hog(pool);
+
+	}
+
+	rcu_read_unlock();
+}
+
 static void wq_watchdog_reset_touched(void)
 {
 	int cpu;
@@ -5989,6 +6044,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 {
 	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
 	bool lockup_detected = false;
+	bool cpu_pool_stall = false;
 	unsigned long now = jiffies;
 	struct worker_pool *pool;
 	int pi;
@@ -6001,6 +6057,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 	for_each_pool(pool, pi) {
 		unsigned long pool_ts, touched, ts;
 
+		pool->cpu_stall = false;
 		if (list_empty(&pool->worklist))
 			continue;
 
@@ -6025,18 +6082,27 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 		/* did we stall? */
 		if (time_after(now, ts + thresh)) {
 			lockup_detected = true;
+			if (pool->cpu >= 0) {
+				pool->cpu_stall = true;
+				cpu_pool_stall = true;
+			}
 			pr_emerg("BUG: workqueue lockup - pool");
 			pr_cont_pool_info(pool);
 			pr_cont(" stuck for %us!\n",
 				jiffies_to_msecs(now - pool_ts) / 1000);
 		}
+
+
 	}
 
 	rcu_read_unlock();
 
 	if (lockup_detected)
 		show_all_workqueues();
 
+	if (cpu_pool_stall)
+		show_cpu_pools_hogs();
+
 	wq_watchdog_reset_touched();
 	mod_timer(&wq_watchdog_timer, jiffies + thresh);
 }
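Taken together, a stall on a CPU-bound pool would now produce console output along these lines. This is illustrative only: the pool and CPU numbers, task name, and call-trace entries are invented, and the pool-info fields are sketched from pr_cont_pool_info(); only the "BUG: ..." line, the "Showing backtraces ..." line, and the "pool %d:" line come from format strings visible in the patch.

BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 31s!
Showing backtraces of running workers in stalled CPU-bound worker pools:
pool 0:
task:kworker/0:1     state:R  running task     ...
Call Trace:
 hog_fn+0x.../0x...
 process_one_work+0x.../0x...
 ...

The "BUG: workqueue lockup" line is pre-existing behavior; everything after it is the new output from show_cpu_pools_hogs() and sched_show_task().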
