Skip to content

Commit 6092c50

Browse files
committed
Merge patch series "pidfs: provide information after task has been reaped"
Christian Brauner <brauner@kernel.org> says: Various tools need access to information about a process/task even after it has already been reaped. For example, systemd's journal logs and uses such information as the cgroup id and exit status to deal with processes that have been sent via SCM_PIDFD or SCM_PEERPIDFD. By the time the pidfd is received the process might have already been reaped. This series aims to provide information by extending the PIDFD_GET_INFO ioctl to retrieve the exit code and cgroup id. There might be other stuff that we would want in the future. Pidfd polling allows waiting on either task exit or for a task to have been reaped. The contract for PIDFD_INFO_EXIT is simply that EPOLLHUP must be observed before exit information can be retrieved, i.e., exit information is only provided once the task has been reaped. Note that if a thread-group leader exits before other threads in the thread-group then exit information will only be available once the thread-group is empty. This aligns with wait() as well, where reaping of a thread-group leader that exited before the thread-group was empty is delayed until the thread-group is empty. With PIDFD_INFO_EXIT autoreaping might actually become usable because it means a parent can ignore SIGCHLD or set SA_NOCLDWAIT and simply use pidfd polling and PIDFD_INFO_EXIT to get status information for its children. The kernel will autocleanup right away instead of delaying. This includes expansive selftests including for thread-group behavior and multi-threaded exec by a non-thread-group leader thread.
* patches from https://lore.kernel.org/r/20250305-work-pidfs-kill_on_last_close-v3-0-c8c3d8361705@kernel.org: selftests/pidfd: add seventh PIDFD_INFO_EXIT selftest selftests/pidfd: add sixth PIDFD_INFO_EXIT selftest selftests/pidfd: add fifth PIDFD_INFO_EXIT selftest selftests/pidfd: add fourth PIDFD_INFO_EXIT selftest selftests/pidfd: add third PIDFD_INFO_EXIT selftest selftests/pidfd: add second PIDFD_INFO_EXIT selftest selftests/pidfd: add first PIDFD_INFO_EXIT selftest selftests/pidfd: expand common pidfd header pidfs/selftests: ensure correct headers for ioctl handling selftests/pidfd: fix header inclusion pidfs: allow to retrieve exit information pidfs: record exit code and cgroupid at exit pidfs: use private inode slab cache pidfs: move setting flags into pidfs_alloc_file() pidfd: rely on automatic cleanup in __pidfd_prepare() pidfs: switch to copy_struct_to_user() Link: https://lore.kernel.org/r/20250305-work-pidfs-kill_on_last_close-v3-0-c8c3d8361705@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents b1e809e + 56f235d commit 6092c50

15 files changed

Lines changed: 783 additions & 105 deletions

File tree

fs/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ struct stashed_operations {
325325
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
326326
struct path *path);
327327
void stashed_dentry_prune(struct dentry *dentry);
328+
struct dentry *stashed_dentry_get(struct dentry **stashed);
328329
/**
329330
* path_mounted - check whether path is mounted
330331
* @path: path to check

fs/libfs.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
21132113
}
21142114
EXPORT_SYMBOL(simple_inode_init_ts);
21152115

2116-
static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
2116+
struct dentry *stashed_dentry_get(struct dentry **stashed)
21172117
{
21182118
struct dentry *dentry;
21192119

@@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
22152215
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
22162216

22172217
/* See if dentry can be reused. */
2218-
path->dentry = get_stashed_dentry(stashed);
2218+
path->dentry = stashed_dentry_get(stashed);
22192219
if (path->dentry) {
22202220
sops->put_data(data);
22212221
goto out_path;

fs/pidfs.c

Lines changed: 163 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,28 @@
2424
#include "internal.h"
2525
#include "mount.h"
2626

27+
static struct kmem_cache *pidfs_cachep __ro_after_init;
28+
29+
/*
30+
* Stashes information that userspace needs to access even after the
31+
* process has been reaped.
32+
*/
33+
struct pidfs_exit_info {
34+
__u64 cgroupid;
35+
__s32 exit_code;
36+
};
37+
38+
struct pidfs_inode {
39+
struct pidfs_exit_info __pei;
40+
struct pidfs_exit_info *exit_info;
41+
struct inode vfs_inode;
42+
};
43+
44+
static inline struct pidfs_inode *pidfs_i(struct inode *inode)
45+
{
46+
return container_of(inode, struct pidfs_inode, vfs_inode);
47+
}
48+
2749
static struct rb_root pidfs_ino_tree = RB_ROOT;
2850

2951
#if BITS_PER_LONG == 32
@@ -207,17 +229,28 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
207229
return poll_flags;
208230
}
209231

210-
static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
232+
static inline bool pid_in_current_pidns(const struct pid *pid)
233+
{
234+
const struct pid_namespace *ns = task_active_pid_ns(current);
235+
236+
if (ns->level <= pid->level)
237+
return pid->numbers[ns->level].ns == ns;
238+
239+
return false;
240+
}
241+
242+
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
211243
{
212244
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
245+
struct inode *inode = file_inode(file);
246+
struct pid *pid = pidfd_pid(file);
213247
size_t usize = _IOC_SIZE(cmd);
214248
struct pidfd_info kinfo = {};
249+
struct pidfs_exit_info *exit_info;
215250
struct user_namespace *user_ns;
251+
struct task_struct *task;
216252
const struct cred *c;
217253
__u64 mask;
218-
#ifdef CONFIG_CGROUPS
219-
struct cgroup *cgrp;
220-
#endif
221254

222255
if (!uinfo)
223256
return -EINVAL;
@@ -227,6 +260,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
227260
if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
228261
return -EFAULT;
229262

263+
/*
264+
* Restrict information retrieval to tasks within the caller's pid
265+
* namespace hierarchy.
266+
*/
267+
if (!pid_in_current_pidns(pid))
268+
return -ESRCH;
269+
270+
if (mask & PIDFD_INFO_EXIT) {
271+
exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
272+
if (exit_info) {
273+
kinfo.mask |= PIDFD_INFO_EXIT;
274+
#ifdef CONFIG_CGROUPS
275+
kinfo.cgroupid = exit_info->cgroupid;
276+
kinfo.mask |= PIDFD_INFO_CGROUPID;
277+
#endif
278+
kinfo.exit_code = exit_info->exit_code;
279+
}
280+
}
281+
282+
task = get_pid_task(pid, PIDTYPE_PID);
283+
if (!task) {
284+
/*
285+
* If the task has already been reaped, only exit
286+
* information is available
287+
*/
288+
if (!(mask & PIDFD_INFO_EXIT))
289+
return -ESRCH;
290+
291+
goto copy_out;
292+
}
293+
230294
c = get_task_cred(task);
231295
if (!c)
232296
return -ESRCH;
@@ -246,11 +310,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
246310
put_cred(c);
247311

248312
#ifdef CONFIG_CGROUPS
249-
rcu_read_lock();
250-
cgrp = task_dfl_cgroup(task);
251-
kinfo.cgroupid = cgroup_id(cgrp);
252-
kinfo.mask |= PIDFD_INFO_CGROUPID;
253-
rcu_read_unlock();
313+
if (!kinfo.cgroupid) {
314+
struct cgroup *cgrp;
315+
316+
rcu_read_lock();
317+
cgrp = task_dfl_cgroup(task);
318+
kinfo.cgroupid = cgroup_id(cgrp);
319+
kinfo.mask |= PIDFD_INFO_CGROUPID;
320+
rcu_read_unlock();
321+
}
254322
#endif
255323

256324
/*
@@ -270,16 +338,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
270338
if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
271339
return -ESRCH;
272340

341+
copy_out:
273342
/*
274343
* If userspace and the kernel have the same struct size it can just
275344
* be copied. If userspace provides an older struct, only the bits that
276345
* userspace knows about will be copied. If userspace provides a new
277346
* struct, only the bits that the kernel knows about will be copied.
278347
*/
279-
if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
280-
return -EFAULT;
281-
282-
return 0;
348+
return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
283349
}
284350

285351
static bool pidfs_ioctl_valid(unsigned int cmd)
@@ -307,7 +373,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
307373
{
308374
struct task_struct *task __free(put_task) = NULL;
309375
struct nsproxy *nsp __free(put_nsproxy) = NULL;
310-
struct pid *pid = pidfd_pid(file);
311376
struct ns_common *ns_common = NULL;
312377
struct pid_namespace *pid_ns;
313378

@@ -322,13 +387,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
322387
return put_user(file_inode(file)->i_generation, argp);
323388
}
324389

325-
task = get_pid_task(pid, PIDTYPE_PID);
326-
if (!task)
327-
return -ESRCH;
328-
329390
/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
330391
if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
331-
return pidfd_info(task, cmd, arg);
392+
return pidfd_info(file, cmd, arg);
393+
394+
task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
395+
if (!task)
396+
return -ESRCH;
332397

333398
if (arg)
334399
return -EINVAL;
@@ -440,6 +505,49 @@ struct pid *pidfd_pid(const struct file *file)
440505
return file_inode(file)->i_private;
441506
}
442507

508+
/*
509+
* We're called from release_task(). We know there's at least one
510+
* reference to struct pid being held that won't be released until the
511+
* task has been reaped which cannot happen until we're out of
512+
* release_task().
513+
*
514+
* If this struct pid is referred to by a pidfd then
515+
* stashed_dentry_get() will return the dentry and inode for that struct
516+
* pid. Since we've taken a reference on it there's now an additional
517+
* reference from the exit path on it. Which is fine. We're going to put
518+
* it again in a second and we know that the pid is kept alive anyway.
519+
*
520+
* Worst case is that we've filled in the info and immediately free the
521+
* dentry and inode afterwards since the pidfd has been closed. Since
522+
* pidfs_exit() currently is placed after exit_task_work() we know that
523+
* it cannot be us aka the exiting task holding a pidfd to ourselves.
524+
*/
525+
void pidfs_exit(struct task_struct *tsk)
526+
{
527+
struct dentry *dentry;
528+
529+
might_sleep();
530+
531+
dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
532+
if (dentry) {
533+
struct inode *inode = d_inode(dentry);
534+
struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
535+
#ifdef CONFIG_CGROUPS
536+
struct cgroup *cgrp;
537+
538+
rcu_read_lock();
539+
cgrp = task_dfl_cgroup(tsk);
540+
exit_info->cgroupid = cgroup_id(cgrp);
541+
rcu_read_unlock();
542+
#endif
543+
exit_info->exit_code = tsk->exit_code;
544+
545+
/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
546+
smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
547+
dput(dentry);
548+
}
549+
}
550+
443551
static struct vfsmount *pidfs_mnt __ro_after_init;
444552

445553
/*
@@ -495,9 +603,30 @@ static void pidfs_evict_inode(struct inode *inode)
495603
put_pid(pid);
496604
}
497605

606+
static struct inode *pidfs_alloc_inode(struct super_block *sb)
607+
{
608+
struct pidfs_inode *pi;
609+
610+
pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
611+
if (!pi)
612+
return NULL;
613+
614+
memset(&pi->__pei, 0, sizeof(pi->__pei));
615+
pi->exit_info = NULL;
616+
617+
return &pi->vfs_inode;
618+
}
619+
620+
static void pidfs_free_inode(struct inode *inode)
621+
{
622+
kmem_cache_free(pidfs_cachep, pidfs_i(inode));
623+
}
624+
498625
static const struct super_operations pidfs_sops = {
626+
.alloc_inode = pidfs_alloc_inode,
499627
.drop_inode = generic_delete_inode,
500628
.evict_inode = pidfs_evict_inode,
629+
.free_inode = pidfs_free_inode,
501630
.statfs = simple_statfs,
502631
};
503632

@@ -699,12 +828,27 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
699828
return ERR_PTR(ret);
700829

701830
pidfd_file = dentry_open(&path, flags, current_cred());
831+
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
832+
if (!IS_ERR(pidfd_file))
833+
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
834+
702835
path_put(&path);
703836
return pidfd_file;
704837
}
705838

839+
static void pidfs_inode_init_once(void *data)
840+
{
841+
struct pidfs_inode *pi = data;
842+
843+
inode_init_once(&pi->vfs_inode);
844+
}
845+
706846
void __init pidfs_init(void)
707847
{
848+
pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
849+
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
850+
SLAB_ACCOUNT | SLAB_PANIC),
851+
pidfs_inode_init_once);
708852
pidfs_mnt = kern_mount(&pidfs_type);
709853
if (IS_ERR(pidfs_mnt))
710854
panic("Failed to mount pidfs pseudo filesystem");

include/linux/pidfs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
66
void __init pidfs_init(void);
77
void pidfs_add_pid(struct pid *pid);
88
void pidfs_remove_pid(struct pid *pid);
9+
void pidfs_exit(struct task_struct *tsk);
910
extern const struct dentry_operations pidfs_dentry_operations;
1011

1112
#endif /* _LINUX_PID_FS_H */

include/uapi/linux/pidfd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#define PIDFD_INFO_PID (1UL << 0) /* Always returned, even if not requested */
2121
#define PIDFD_INFO_CREDS (1UL << 1) /* Always returned, even if not requested */
2222
#define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */
23+
#define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */
2324

2425
#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
2526

@@ -86,7 +87,7 @@ struct pidfd_info {
8687
__u32 sgid;
8788
__u32 fsuid;
8889
__u32 fsgid;
89-
__u32 spare0[1];
90+
__s32 exit_code;
9091
};
9192

9293
#define PIDFS_IOCTL_MAGIC 0xFF

kernel/exit.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
#include <linux/sysfs.h>
7070
#include <linux/user_events.h>
7171
#include <linux/uaccess.h>
72+
#include <linux/pidfs.h>
7273

7374
#include <uapi/linux/wait.h>
7475

@@ -249,6 +250,7 @@ void release_task(struct task_struct *p)
249250
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
250251
rcu_read_unlock();
251252

253+
pidfs_exit(p);
252254
cgroup_release(p);
253255

254256
write_lock_irq(&tasklist_lock);

kernel/fork.c

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2032,25 +2032,18 @@ static inline void rcu_copy_process(struct task_struct *p)
20322032
*/
20332033
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
20342034
{
2035-
int pidfd;
20362035
struct file *pidfd_file;
20372036

2038-
pidfd = get_unused_fd_flags(O_CLOEXEC);
2037+
CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
20392038
if (pidfd < 0)
20402039
return pidfd;
20412040

20422041
pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
2043-
if (IS_ERR(pidfd_file)) {
2044-
put_unused_fd(pidfd);
2042+
if (IS_ERR(pidfd_file))
20452043
return PTR_ERR(pidfd_file);
2046-
}
2047-
/*
2048-
* anon_inode_getfile() ignores everything outside of the
2049-
* O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
2050-
*/
2051-
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
2044+
20522045
*ret = pidfd_file;
2053-
return pidfd;
2046+
return take_fd(pidfd);
20542047
}
20552048

20562049
/**

tools/testing/selftests/pidfd/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@ pidfd_getfd_test
88
pidfd_setns_test
99
pidfd_file_handle_test
1010
pidfd_bind_mount
11+
pidfd_info_test
12+
pidfd_exec_helper

tools/testing/selftests/pidfd/Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall
33

44
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
55
pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \
6-
pidfd_file_handle_test pidfd_bind_mount
6+
pidfd_file_handle_test pidfd_bind_mount pidfd_info_test
7+
8+
TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper
79

810
include ../lib.mk
911

0 commit comments

Comments
 (0)