Skip to content

Commit d40dc30

Browse files
committed
Merge patch series "pidfs: handle multi-threaded exec and premature thread-group leader exit"
Christian Brauner <brauner@kernel.org> says: This is another attempt at trying to make pidfd polling for multi-threaded exec and premature thread-group leader exit consistent. A quick recap of these two cases: (1) During a multi-threaded exec by a subthread, i.e., non-thread-group leader thread, all other threads in the thread-group including the thread-group leader are killed and the struct pid of the thread-group leader will be taken over by the subthread that called exec. IOW, two tasks change their TIDs. (2) A premature thread-group leader exit means that the thread-group leader exited before all of the other subthreads in the thread-group have exited. Both cases lead to inconsistencies for pidfd polling with PIDFD_THREAD. Any caller that holds a PIDFD_THREAD pidfd to the current thread-group leader may or may not see an exit notification on the file descriptor depending on when poll is performed. If the poll is performed before the exec of the subthread has concluded an exit notification is generated for the old thread-group leader. If the poll is performed after the exec of the subthread has concluded no exit notification is generated for the old thread-group leader. The correct behavior would be to simply not generate an exit notification on the struct pid of a subhthread exec because the struct pid is taken over by the subthread and thus remains alive. But this is difficult to handle because a thread-group may exit premature as mentioned in (2). In that case an exit notification is reliably generated but the subthreads may continue to run for an indeterminate amount of time and thus also may exec at some point. This tiny series tries to address this problem. If that works correctly then no exit notifications are generated for a PIDFD_THREAD pidfd for a thread-group leader until all subthreads have been reaped. If a subthread should exec before no exit notification will be generated until that task exits or it creates subthreads and repeates the cycle. * patches from https://lore.kernel.org/r/20250320-work-pidfs-thread_group-v4-0-da678ce805bf@kernel.org: selftests/pidfd: third test for multi-threaded exec polling selftests/pidfd: second test for multi-threaded exec polling selftests/pidfd: first test for multi-threaded exec polling pidfs: improve multi-threaded exec and premature thread-group leader exit polling Link: https://lore.kernel.org/r/20250320-work-pidfs-thread_group-v4-0-da678ce805bf@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 68db272 + 4d5483a commit d40dc30

4 files changed

Lines changed: 225 additions & 30 deletions

File tree

fs/pidfs.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -210,20 +210,21 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
210210
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
211211
{
212212
struct pid *pid = pidfd_pid(file);
213-
bool thread = file->f_flags & PIDFD_THREAD;
214213
struct task_struct *task;
215214
__poll_t poll_flags = 0;
216215

217216
poll_wait(file, &pid->wait_pidfd, pts);
218217
/*
219-
* Depending on PIDFD_THREAD, inform pollers when the thread
220-
* or the whole thread-group exits.
218+
* Don't wake waiters if the thread-group leader exited
219+
* prematurely. They either get notified when the last subthread
220+
* exits or not at all if one of the remaining subthreads execs
221+
* and assumes the struct pid of the old thread-group leader.
221222
*/
222223
guard(rcu)();
223224
task = pid_task(pid, PIDTYPE_PID);
224225
if (!task)
225226
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
226-
else if (task->exit_state && (thread || thread_group_empty(task)))
227+
else if (task->exit_state && !delay_group_leader(task))
227228
poll_flags = EPOLLIN | EPOLLRDNORM;
228229

229230
return poll_flags;

kernel/exit.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -743,10 +743,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
743743

744744
tsk->exit_state = EXIT_ZOMBIE;
745745
/*
746-
* sub-thread or delay_group_leader(), wake up the
747-
* PIDFD_THREAD waiters.
746+
* Ignore thread-group leaders that exited before all
747+
* subthreads did.
748748
*/
749-
if (!thread_group_empty(tsk))
749+
if (!delay_group_leader(tsk))
750750
do_notify_pidfd(tsk);
751751

752752
if (unlikely(tsk->ptrace)) {

kernel/signal.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2180,8 +2180,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
21802180
WARN_ON_ONCE(!tsk->ptrace &&
21812181
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
21822182
/*
2183-
* tsk is a group leader and has no threads, wake up the
2184-
* non-PIDFD_THREAD waiters.
2183+
* Notify for thread-group leaders without subthreads.
21852184
*/
21862185
if (thread_group_empty(tsk))
21872186
do_notify_pidfd(tsk);

tools/testing/selftests/pidfd/pidfd_info_test.c

Lines changed: 216 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ static void *pidfd_info_pause_thread(void *arg)
236236

237237
TEST_F(pidfd_info, thread_group)
238238
{
239-
pid_t pid_leader, pid_thread;
239+
pid_t pid_leader, pid_poller, pid_thread;
240240
pthread_t thread;
241241
int nevents, pidfd_leader, pidfd_thread, pidfd_leader_thread, ret;
242242
int ipc_sockets[2];
@@ -262,6 +262,35 @@ TEST_F(pidfd_info, thread_group)
262262
syscall(__NR_exit, EXIT_SUCCESS);
263263
}
264264

265+
/*
266+
* Opening a PIDFD_THREAD aka thread-specific pidfd based on a
267+
* thread-group leader must succeed.
268+
*/
269+
pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD);
270+
ASSERT_GE(pidfd_leader_thread, 0);
271+
272+
pid_poller = fork();
273+
ASSERT_GE(pid_poller, 0);
274+
if (pid_poller == 0) {
275+
/*
276+
* We can't poll and wait for the old thread-group
277+
* leader to exit using a thread-specific pidfd. The
278+
* thread-group leader exited prematurely and
279+
* notification is delayed until all subthreads have
280+
* exited.
281+
*/
282+
fds.events = POLLIN;
283+
fds.fd = pidfd_leader_thread;
284+
nevents = poll(&fds, 1, 10000 /* wait 5 seconds */);
285+
if (nevents != 0)
286+
_exit(EXIT_FAILURE);
287+
if (fds.revents & POLLIN)
288+
_exit(EXIT_FAILURE);
289+
if (fds.revents & POLLHUP)
290+
_exit(EXIT_FAILURE);
291+
_exit(EXIT_SUCCESS);
292+
}
293+
265294
/* Retrieve the tid of the thread. */
266295
EXPECT_EQ(close(ipc_sockets[1]), 0);
267296
ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread));
@@ -275,12 +304,7 @@ TEST_F(pidfd_info, thread_group)
275304
pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD);
276305
ASSERT_GE(pidfd_thread, 0);
277306

278-
/*
279-
* Opening a PIDFD_THREAD aka thread-specific pidfd based on a
280-
* thread-group leader must succeed.
281-
*/
282-
pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD);
283-
ASSERT_GE(pidfd_leader_thread, 0);
307+
ASSERT_EQ(wait_for_pid(pid_poller), 0);
284308

285309
/*
286310
* Note that pidfd_leader is a thread-group pidfd, so polling on it
@@ -389,7 +413,7 @@ static void *pidfd_info_thread_exec(void *arg)
389413

390414
TEST_F(pidfd_info, thread_group_exec)
391415
{
392-
pid_t pid_leader, pid_thread;
416+
pid_t pid_leader, pid_poller, pid_thread;
393417
pthread_t thread;
394418
int nevents, pidfd_leader, pidfd_leader_thread, pidfd_thread, ret;
395419
int ipc_sockets[2];
@@ -415,6 +439,37 @@ TEST_F(pidfd_info, thread_group_exec)
415439
syscall(__NR_exit, EXIT_SUCCESS);
416440
}
417441

442+
/* Open a thread-specific pidfd for the thread-group leader. */
443+
pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD);
444+
ASSERT_GE(pidfd_leader_thread, 0);
445+
446+
pid_poller = fork();
447+
ASSERT_GE(pid_poller, 0);
448+
if (pid_poller == 0) {
449+
/*
450+
* We can't poll and wait for the old thread-group
451+
* leader to exit using a thread-specific pidfd. The
452+
* thread-group leader exited prematurely and
453+
* notification is delayed until all subthreads have
454+
* exited.
455+
*
456+
* When the thread has execed it will taken over the old
457+
* thread-group leaders struct pid. Calling poll after
458+
* the thread execed will thus block again because a new
459+
* thread-group has started.
460+
*/
461+
fds.events = POLLIN;
462+
fds.fd = pidfd_leader_thread;
463+
nevents = poll(&fds, 1, 10000 /* wait 5 seconds */);
464+
if (nevents != 0)
465+
_exit(EXIT_FAILURE);
466+
if (fds.revents & POLLIN)
467+
_exit(EXIT_FAILURE);
468+
if (fds.revents & POLLHUP)
469+
_exit(EXIT_FAILURE);
470+
_exit(EXIT_SUCCESS);
471+
}
472+
418473
/* Retrieve the tid of the thread. */
419474
EXPECT_EQ(close(ipc_sockets[1]), 0);
420475
ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread));
@@ -423,32 +478,158 @@ TEST_F(pidfd_info, thread_group_exec)
423478
pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD);
424479
ASSERT_GE(pidfd_thread, 0);
425480

426-
/* Open a thread-specific pidfd for the thread-group leader. */
427-
pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD);
428-
ASSERT_GE(pidfd_leader_thread, 0);
481+
/* Now that we've opened a thread-specific pidfd the thread can exec. */
482+
ASSERT_EQ(write_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread));
483+
EXPECT_EQ(close(ipc_sockets[0]), 0);
484+
485+
ASSERT_EQ(wait_for_pid(pid_poller), 0);
486+
487+
/* Wait until the kernel has SIGKILLed the thread. */
488+
fds.events = POLLHUP;
489+
fds.fd = pidfd_thread;
490+
nevents = poll(&fds, 1, -1);
491+
ASSERT_EQ(nevents, 1);
492+
/* The thread has been reaped. */
493+
ASSERT_TRUE(!!(fds.revents & POLLHUP));
494+
495+
/* Retrieve thread-specific exit info from pidfd. */
496+
ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0);
497+
ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
498+
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
499+
/*
500+
* While the kernel will have SIGKILLed the whole thread-group
501+
* during exec it will cause the individual threads to exit
502+
* cleanly.
503+
*/
504+
ASSERT_TRUE(WIFEXITED(info.exit_code));
505+
ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
506+
507+
/*
508+
* The thread-group leader is still alive, the thread has taken
509+
* over its struct pid and thus its pid number.
510+
*/
511+
info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT;
512+
ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0);
513+
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS));
514+
ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT));
515+
ASSERT_EQ(info.pid, pid_leader);
516+
517+
/* Take down the thread-group leader. */
518+
EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0);
429519

430520
/*
431-
* We can poll and wait for the old thread-group leader to exit
432-
* using a thread-specific pidfd.
433-
*
434-
* This only works until the thread has execed. When the thread
435-
* has execed it will have taken over the old thread-group
436-
* leaders struct pid. Calling poll after the thread execed will
437-
* thus block again because a new thread-group has started (Yes,
438-
* it's fscked.).
521+
* Afte the exec we're dealing with an empty thread-group so now
522+
* we must see an exit notification on the thread-specific pidfd
523+
* for the thread-group leader as there's no subthread that can
524+
* revive the struct pid.
439525
*/
440526
fds.events = POLLIN;
441527
fds.fd = pidfd_leader_thread;
442528
nevents = poll(&fds, 1, -1);
443529
ASSERT_EQ(nevents, 1);
444-
/* The thread-group leader has exited. */
445530
ASSERT_TRUE(!!(fds.revents & POLLIN));
446-
/* The thread-group leader hasn't been reaped. */
447531
ASSERT_FALSE(!!(fds.revents & POLLHUP));
448532

533+
EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0);
534+
535+
/* Retrieve exit information for the thread-group leader. */
536+
info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT;
537+
ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0);
538+
ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
539+
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
540+
541+
EXPECT_EQ(close(pidfd_leader), 0);
542+
EXPECT_EQ(close(pidfd_thread), 0);
543+
}
544+
545+
static void *pidfd_info_thread_exec_sane(void *arg)
546+
{
547+
pid_t pid_thread = gettid();
548+
int ipc_socket = *(int *)arg;
549+
550+
/* Inform the grand-parent what the tid of this thread is. */
551+
if (write_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread))
552+
return NULL;
553+
554+
if (read_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread))
555+
return NULL;
556+
557+
close(ipc_socket);
558+
559+
sys_execveat(AT_FDCWD, "pidfd_exec_helper", NULL, NULL, 0);
560+
return NULL;
561+
}
562+
563+
TEST_F(pidfd_info, thread_group_exec_thread)
564+
{
565+
pid_t pid_leader, pid_poller, pid_thread;
566+
pthread_t thread;
567+
int nevents, pidfd_leader, pidfd_leader_thread, pidfd_thread, ret;
568+
int ipc_sockets[2];
569+
struct pollfd fds = {};
570+
struct pidfd_info info = {
571+
.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT,
572+
};
573+
574+
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
575+
EXPECT_EQ(ret, 0);
576+
577+
pid_leader = create_child(&pidfd_leader, 0);
578+
EXPECT_GE(pid_leader, 0);
579+
580+
if (pid_leader == 0) {
581+
close(ipc_sockets[0]);
582+
583+
/* The thread will outlive the thread-group leader. */
584+
if (pthread_create(&thread, NULL, pidfd_info_thread_exec_sane, &ipc_sockets[1]))
585+
syscall(__NR_exit, EXIT_FAILURE);
586+
587+
/*
588+
* Pause the thread-group leader. It will be killed once
589+
* the subthread execs.
590+
*/
591+
pause();
592+
syscall(__NR_exit, EXIT_SUCCESS);
593+
}
594+
595+
/* Retrieve the tid of the thread. */
596+
EXPECT_EQ(close(ipc_sockets[1]), 0);
597+
ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread));
598+
599+
/* Opening a thread as a PIDFD_THREAD must succeed. */
600+
pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD);
601+
ASSERT_GE(pidfd_thread, 0);
602+
603+
/* Open a thread-specific pidfd for the thread-group leader. */
604+
pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD);
605+
ASSERT_GE(pidfd_leader_thread, 0);
606+
607+
pid_poller = fork();
608+
ASSERT_GE(pid_poller, 0);
609+
if (pid_poller == 0) {
610+
/*
611+
* The subthread will now exec. The struct pid of the old
612+
* thread-group leader will be assumed by the subthread which
613+
* becomes the new thread-group leader. So no exit notification
614+
* must be generated. Wait for 5 seconds and call it a success
615+
* if no notification has been received.
616+
*/
617+
fds.events = POLLIN;
618+
fds.fd = pidfd_leader_thread;
619+
nevents = poll(&fds, 1, 10000 /* wait 5 seconds */);
620+
if (nevents != 0)
621+
_exit(EXIT_FAILURE);
622+
if (fds.revents & POLLIN)
623+
_exit(EXIT_FAILURE);
624+
if (fds.revents & POLLHUP)
625+
_exit(EXIT_FAILURE);
626+
_exit(EXIT_SUCCESS);
627+
}
628+
449629
/* Now that we've opened a thread-specific pidfd the thread can exec. */
450630
ASSERT_EQ(write_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread));
451631
EXPECT_EQ(close(ipc_sockets[0]), 0);
632+
ASSERT_EQ(wait_for_pid(pid_poller), 0);
452633

453634
/* Wait until the kernel has SIGKILLed the thread. */
454635
fds.events = POLLHUP;
@@ -482,6 +663,20 @@ TEST_F(pidfd_info, thread_group_exec)
482663

483664
/* Take down the thread-group leader. */
484665
EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0);
666+
667+
/*
668+
* Afte the exec we're dealing with an empty thread-group so now
669+
* we must see an exit notification on the thread-specific pidfd
670+
* for the thread-group leader as there's no subthread that can
671+
* revive the struct pid.
672+
*/
673+
fds.events = POLLIN;
674+
fds.fd = pidfd_leader_thread;
675+
nevents = poll(&fds, 1, -1);
676+
ASSERT_EQ(nevents, 1);
677+
ASSERT_TRUE(!!(fds.revents & POLLIN));
678+
ASSERT_FALSE(!!(fds.revents & POLLHUP));
679+
485680
EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0);
486681

487682
/* Retrieve exit information for the thread-group leader. */

0 commit comments

Comments
 (0)