Skip to content

Commit 9311e6c

Browse files
committed
cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
cgroup_task_dead() is called from finish_task_switch() which runs with preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The function needs to acquire css_set_lock which is a regular spinlock that can sleep on RT kernels, leading to "sleeping function called from invalid context" warnings.

css_set_lock is too large in scope to convert to a raw_spinlock. However, the unlinking operations don't need to run synchronously - they just need to complete after the task is done running.

On PREEMPT_RT, defer the work through irq_work. While the work doesn't need to happen immediately, it can't be delayed indefinitely either, because the dead task pins the cgroup and its task_struct can itself remain pinned indefinitely. Use the lazy version of irq_work to allow batching and lower impact while ensuring timely completion.

v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add explanation for why the work can't be delayed indefinitely (Sebastian Andrzej Siewior).

Fixes: d245698 ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent be04e96 commit 9311e6c

2 files changed

Lines changed: 58 additions & 2 deletions

File tree

include/linux/sched.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1324,7 +1324,10 @@ struct task_struct {
13241324
struct css_set __rcu *cgroups;
13251325
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
13261326
struct list_head cg_list;
1327-
#endif
1327+
#ifdef CONFIG_PREEMPT_RT
1328+
struct llist_node cg_dead_lnode;
1329+
#endif /* CONFIG_PREEMPT_RT */
1330+
#endif /* CONFIG_CGROUPS */
13281331
#ifdef CONFIG_X86_CPU_RESCTRL
13291332
u32 closid;
13301333
u32 rmid;

kernel/cgroup/cgroup.c

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css);
290290
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
291291
struct cgroup *cgrp, struct cftype cfts[],
292292
bool is_add);
293+
static void cgroup_rt_init(void);
293294

294295
#ifdef CONFIG_DEBUG_CGROUP_REF
295296
#define CGROUP_REF_FN_ATTRS noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
63606361
BUG_ON(ss_rstat_init(NULL));
63616362

63626363
get_user_ns(init_cgroup_ns.user_ns);
6364+
cgroup_rt_init();
63636365

63646366
cgroup_lock();
63656367

@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk)
69906992
} while_each_subsys_mask();
69916993
}
69926994

6993-
void cgroup_task_dead(struct task_struct *tsk)
6995+
static void do_cgroup_task_dead(struct task_struct *tsk)
69946996
{
69956997
struct css_set *cset;
69966998
unsigned long flags;
@@ -7016,6 +7018,57 @@ void cgroup_task_dead(struct task_struct *tsk)
70167018
spin_unlock_irqrestore(&css_set_lock, flags);
70177019
}
70187020

7021+
#ifdef CONFIG_PREEMPT_RT
7022+
/*
7023+
* cgroup_task_dead() is called from finish_task_switch() which doesn't allow
7024+
* scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
7025+
* this led to a sleeping-in-invalid-context warning. css_set_lock is too
7026+
* big to become a raw_spinlock. The task_dead path doesn't need to run
7027+
* synchronously but can't be delayed indefinitely either as the dead task pins
7028+
* the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
7029+
* irq_work to allow batching while ensuring timely completion.
7030+
*/
7031+
static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
7032+
static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
7033+
7034+
static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
7035+
{
7036+
struct llist_node *lnode;
7037+
struct task_struct *task, *next;
7038+
7039+
lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
7040+
llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
7041+
do_cgroup_task_dead(task);
7042+
put_task_struct(task);
7043+
}
7044+
}
7045+
7046+
static void __init cgroup_rt_init(void)
7047+
{
7048+
int cpu;
7049+
7050+
for_each_possible_cpu(cpu) {
7051+
init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
7052+
per_cpu(cgrp_dead_tasks_iwork, cpu) =
7053+
IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
7054+
}
7055+
}
7056+
7057+
void cgroup_task_dead(struct task_struct *task)
7058+
{
7059+
get_task_struct(task);
7060+
llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
7061+
irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
7062+
}
7063+
#else /* CONFIG_PREEMPT_RT */
7064+
static void __init cgroup_rt_init(void) {}
7065+
7066+
void cgroup_task_dead(struct task_struct *task)
7067+
{
7068+
do_cgroup_task_dead(task);
7069+
}
7070+
#endif /* CONFIG_PREEMPT_RT */
7071+
70197072
void cgroup_task_release(struct task_struct *task)
70207073
{
70217074
struct cgroup_subsys *ss;

0 commit comments

Comments
 (0)