Skip to content

Commit 64bef69

Browse files
oleg-nesterovbrauner
authored andcommitted
pidfd: implement PIDFD_THREAD flag for pidfd_open()
With this flag: - pidfd_open() doesn't require that the target task must be a thread-group leader - pidfd_poll() succeeds when the task exits and becomes a zombie (iow, passes exit_notify()), even if it is a leader and thread-group is not empty. This means that the behaviour of pidfd_poll(PIDFD_THREAD, pid-of-group-leader) is not well defined if it races with exec() from its sub-thread; pidfd_poll() can succeed or not depending on whether pidfd_task_exited() is called before or after exchange_tids(). Perhaps we can improve this behaviour later, pidfd_poll() can probably take sig->group_exec_task into account. But this doesn't really differ from the case when the leader exits before other threads (so pidfd_poll() succeeds) and then another thread execs and pidfd_poll() will block again. thread_group_exited() is no longer used, perhaps it can die. Co-developed-by: Tycho Andersen <tycho@tycho.pizza> Signed-off-by: Oleg Nesterov <oleg@redhat.com> Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com Tested-by: Tycho Andersen <tandersen@netflix.com> Reviewed-by: Tycho Andersen <tandersen@netflix.com> Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent 21e2520 commit 64bef69

7 files changed

Lines changed: 54 additions & 23 deletions

File tree

fs/exec.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)
11431143

11441144
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
11451145
leader->exit_state = EXIT_DEAD;
1146-
1146+
/*
1147+
* leader and tsk exhanged their pids, the old pid dies,
1148+
* wake up the PIDFD_THREAD waiters.
1149+
*/
1150+
do_notify_pidfd(leader);
11471151
/*
11481152
* We are going to release_task()->ptrace_unlink() silently,
11491153
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees

include/linux/pid.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;
7070

7171
struct file;
7272

73-
extern struct pid *pidfd_pid(const struct file *file);
73+
struct pid *pidfd_pid(const struct file *file);
7474
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
7575
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
7676
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
77+
void do_notify_pidfd(struct task_struct *task);
7778

7879
static inline struct pid *get_pid(struct pid *pid)
7980
{

include/uapi/linux/pidfd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <linux/fcntl.h>
88

99
/* Flags for pidfd_open(). */
10-
#define PIDFD_NONBLOCK O_NONBLOCK
10+
#define PIDFD_NONBLOCK O_NONBLOCK
11+
#define PIDFD_THREAD O_EXCL
1112

1213
#endif /* _UAPI_LINUX_PIDFD_H */

kernel/exit.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
739739
kill_orphaned_pgrp(tsk->group_leader, NULL);
740740

741741
tsk->exit_state = EXIT_ZOMBIE;
742+
/*
743+
* sub-thread or delay_group_leader(), wake up the
744+
* PIDFD_THREAD waiters.
745+
*/
746+
if (!thread_group_empty(tsk))
747+
do_notify_pidfd(tsk);
748+
742749
if (unlikely(tsk->ptrace)) {
743750
int sig = thread_group_leader(tsk) &&
744751
thread_group_empty(tsk) &&

kernel/fork.c

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
#include <linux/user_events.h>
102102
#include <linux/iommu.h>
103103
#include <linux/rseq.h>
104+
#include <uapi/linux/pidfd.h>
104105

105106
#include <asm/pgalloc.h>
106107
#include <linux/uaccess.h>
@@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
20502051

20512052
seq_put_decimal_ll(m, "Pid:\t", nr);
20522053

2054+
/* TODO: report PIDFD_THREAD */
2055+
20532056
#ifdef CONFIG_PID_NS
20542057
seq_put_decimal_ll(m, "\nNSpid:\t", nr);
20552058
if (nr > 0) {
@@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
20682071
}
20692072
#endif
20702073

2074+
static bool pidfd_task_exited(struct pid *pid, bool thread)
2075+
{
2076+
struct task_struct *task;
2077+
bool exited;
2078+
2079+
rcu_read_lock();
2080+
task = pid_task(pid, PIDTYPE_PID);
2081+
exited = !task ||
2082+
(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
2083+
rcu_read_unlock();
2084+
2085+
return exited;
2086+
}
2087+
20712088
/*
20722089
* Poll support for process exit notification.
20732090
*/
20742091
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
20752092
{
20762093
struct pid *pid = file->private_data;
2094+
bool thread = file->f_flags & PIDFD_THREAD;
20772095
__poll_t poll_flags = 0;
20782096

20792097
poll_wait(file, &pid->wait_pidfd, pts);
2080-
20812098
/*
2082-
* Inform pollers only when the whole thread group exits.
2083-
* If the thread group leader exits before all other threads in the
2084-
* group, then poll(2) should block, similar to the wait(2) family.
2099+
* Depending on PIDFD_THREAD, inform pollers when the thread
2100+
* or the whole thread-group exits.
20852101
*/
2086-
if (thread_group_exited(pid))
2102+
if (pidfd_task_exited(pid, thread))
20872103
poll_flags = EPOLLIN | EPOLLRDNORM;
20882104

20892105
return poll_flags;
@@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
21412157
return PTR_ERR(pidfd_file);
21422158
}
21432159
get_pid(pid); /* held by pidfd_file now */
2160+
/*
2161+
* anon_inode_getfile() ignores everything outside of the
2162+
* O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
2163+
*/
2164+
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
21442165
*ret = pidfd_file;
21452166
return pidfd;
21462167
}
@@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
21542175
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
21552176
* caller's file descriptor table. The pidfd is reserved but not installed yet.
21562177
*
2157-
* The helper verifies that @pid is used as a thread group leader.
2178+
* The helper verifies that @pid is still in use, without PIDFD_THREAD the
2179+
* task identified by @pid must be a thread-group leader.
21582180
*
21592181
* If this function returns successfully the caller is responsible to either
21602182
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
21732195
*/
21742196
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
21752197
{
2176-
if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
2198+
bool thread = flags & PIDFD_THREAD;
2199+
2200+
if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
21772201
return -EINVAL;
21782202

21792203
return __pidfd_prepare(pid, flags, ret);

kernel/pid.c

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
552552
* Return the task associated with @pidfd. The function takes a reference on
553553
* the returned task. The caller is responsible for releasing that reference.
554554
*
555-
* Currently, the process identified by @pidfd is always a thread-group leader.
556-
* This restriction currently exists for all aspects of pidfds including pidfd
557-
* creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
558-
* (only supports thread group leaders).
559-
*
560555
* Return: On success, the task_struct associated with the pidfd.
561556
* On error, a negative errno number will be returned.
562557
*/
@@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
615610
* @flags: flags to pass
616611
*
617612
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
618-
* the process identified by @pid. Currently, the process identified by
619-
* @pid must be a thread-group leader. This restriction currently exists
620-
* for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
621-
* be used with CLONE_THREAD) and pidfd polling (only supports thread group
622-
* leaders).
613+
* the task identified by @pid. Without PIDFD_THREAD flag the target task
614+
* must be a thread-group leader.
623615
*
624616
* Return: On success, a cloexec pidfd is returned.
625617
* On error, a negative errno number will be returned.
@@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
629621
int fd;
630622
struct pid *p;
631623

632-
if (flags & ~PIDFD_NONBLOCK)
624+
if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
633625
return -EINVAL;
634626

635627
if (pid <= 0)

kernel/signal.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2019,7 +2019,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
20192019
return ret;
20202020
}
20212021

2022-
static void do_notify_pidfd(struct task_struct *task)
2022+
void do_notify_pidfd(struct task_struct *task)
20232023
{
20242024
struct pid *pid;
20252025

@@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
20512051
WARN_ON_ONCE(!tsk->ptrace &&
20522052
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
20532053
/*
2054-
* tsk is a group leader and has no threads, wake up the pidfd waiters.
2054+
* tsk is a group leader and has no threads, wake up the
2055+
* non-PIDFD_THREAD waiters.
20552056
*/
20562057
if (thread_group_empty(tsk))
20572058
do_notify_pidfd(tsk);
@@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
39263927
prepare_kill_siginfo(sig, &kinfo);
39273928
}
39283929

3930+
/* TODO: respect PIDFD_THREAD */
39293931
ret = kill_pid_info(sig, &kinfo, pid);
39303932

39313933
err:

0 commit comments

Comments
 (0)