Skip to content

Commit d245698

Browse files
committed
cgroup: Defer task cgroup unlink until after the task is done switching out
When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task from its cgroup. From the cgroup's perspective, the task is now gone. If this makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks that notify controllers the cgroup is going offline resource-wise.

However, the exiting task can still run, perform memory operations, and schedule until the final context switch in finish_task_switch(). This creates a confusing situation where controllers are told a cgroup is offline while resource activities are still happening in it. While this hasn't broken existing controllers, it has caused direct confusion for sched_ext schedulers.

Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls the subsystem exit callbacks and continues to be called from do_exit(). The css_set cleanup is moved to the new cgroup_task_dead() which is called from finish_task_switch() after the final context switch, so that the cgroup only appears empty after the task is truly done running.

This also reorders operations so that subsys->exit() is now called before unlinking from the cgroup, which shouldn't break anything.

Cc: Dan Schatzberg <dschatzberg@meta.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 260fbcb commit d245698

3 files changed

Lines changed: 18 additions & 9 deletions

File tree

include/linux/cgroup.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
138138
extern void cgroup_post_fork(struct task_struct *p,
139139
struct kernel_clone_args *kargs);
140140
void cgroup_task_exit(struct task_struct *p);
141+
void cgroup_task_dead(struct task_struct *p);
141142
void cgroup_task_release(struct task_struct *p);
142143
void cgroup_task_free(struct task_struct *p);
143144

@@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
681682
static inline void cgroup_post_fork(struct task_struct *p,
682683
struct kernel_clone_args *kargs) {}
683684
static inline void cgroup_task_exit(struct task_struct *p) {}
685+
static inline void cgroup_task_dead(struct task_struct *p) {}
684686
static inline void cgroup_task_release(struct task_struct *p) {}
685687
static inline void cgroup_task_free(struct task_struct *p) {}
686688

kernel/cgroup/cgroup.c

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -944,7 +944,7 @@ static void css_set_move_task(struct task_struct *task,
944944
/*
945945
* We are synchronized through cgroup_threadgroup_rwsem
946946
* against PF_EXITING setting such that we can't race
947-
* against cgroup_task_exit()/cgroup_task_free() dropping
947+
* against cgroup_task_dead()/cgroup_task_free() dropping
948948
* the css_set.
949949
*/
950950
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -6982,10 +6982,20 @@ void cgroup_post_fork(struct task_struct *child,
69826982
void cgroup_task_exit(struct task_struct *tsk)
69836983
{
69846984
struct cgroup_subsys *ss;
6985-
struct css_set *cset;
69866985
int i;
69876986

6988-
spin_lock_irq(&css_set_lock);
6987+
/* see cgroup_post_fork() for details */
6988+
do_each_subsys_mask(ss, i, have_exit_callback) {
6989+
ss->exit(tsk);
6990+
} while_each_subsys_mask();
6991+
}
6992+
6993+
void cgroup_task_dead(struct task_struct *tsk)
6994+
{
6995+
struct css_set *cset;
6996+
unsigned long flags;
6997+
6998+
spin_lock_irqsave(&css_set_lock, flags);
69896999

69907000
WARN_ON_ONCE(list_empty(&tsk->cg_list));
69917001
cset = task_css_set(tsk);
@@ -7003,12 +7013,7 @@ void cgroup_task_exit(struct task_struct *tsk)
70037013
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
70047014
cgroup_update_frozen(task_dfl_cgroup(tsk));
70057015

7006-
spin_unlock_irq(&css_set_lock);
7007-
7008-
/* see cgroup_post_fork() for details */
7009-
do_each_subsys_mask(ss, i, have_exit_callback) {
7010-
ss->exit(tsk);
7011-
} while_each_subsys_mask();
7016+
spin_unlock_irqrestore(&css_set_lock, flags);
70127017
}
70137018

70147019
void cgroup_task_release(struct task_struct *task)

kernel/sched/core.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5222,6 +5222,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
52225222
if (prev->sched_class->task_dead)
52235223
prev->sched_class->task_dead(prev);
52245224

5225+
cgroup_task_dead(prev);
5226+
52255227
/* Task is done with its stack. */
52265228
put_task_stack(prev);
52275229

0 commit comments

Comments
 (0)