Skip to content

Commit df00ded

Browse files
committed
Merge tag 'vfs-6.15-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs pidfs updates from Christian Brauner: - Allow retrieving exit information after a process has been reaped through pidfds via the new PIDFD_INFO_EXIT extension for the PIDFD_GET_INFO ioctl. Various tools need access to information about a process/task even after it has already been reaped. Pidfd polling allows waiting on either task exit or for a task to have been reaped. The contract for PIDFD_INFO_EXIT is simply that EPOLLHUP must be observed before exit information can be retrieved, i.e., exit information is only provided once the task has been reaped and then can be retrieved as long as the pidfd is open. - Add PIDFD_SELF_{THREAD,THREAD_GROUP} sentinels allowing userspace to forgo allocating a file descriptor for their own process. This is useful in scenarios where users want to act on their own process through pidfds and is akin to AT_FDCWD. - Improve premature thread-group leader and subthread exec behavior when polling on pidfds: (1) During a multi-threaded exec by a subthread, i.e., non-thread-group leader thread, all other threads in the thread-group including the thread-group leader are killed and the struct pid of the thread-group leader will be taken over by the subthread that called exec. IOW, two tasks change their TIDs. (2) A premature thread-group leader exit means that the thread-group leader exited before all of the other subthreads in the thread-group have exited. Both cases lead to inconsistencies for pidfd polling with PIDFD_THREAD. Any caller that holds a PIDFD_THREAD pidfd to the current thread-group leader may or may not see an exit notification on the file descriptor depending on when poll is performed. If the poll is performed before the exec of the subthread has concluded an exit notification is generated for the old thread-group leader. If the poll is performed after the exec of the subthread has concluded no exit notification is generated for the old thread-group leader. 
The correct behavior is to simply not generate an exit notification on the struct pid of a subthread exec because the struct pid is taken over by the subthread and thus remains alive. But this is difficult to handle because a thread-group may exit prematurely as mentioned in (2). In that case an exit notification is reliably generated but the subthreads may continue to run for an indeterminate amount of time and thus also may exec at some point. After this pull no exit notifications will be generated for a PIDFD_THREAD pidfd for a thread-group leader until all subthreads have been reaped. If a subthread should exec afterwards no exit notification will be generated until that task exits or it creates subthreads and repeats the cycle. This means an exit notification indicates the ability for the father to reap the child. * tag 'vfs-6.15-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (25 commits) selftests/pidfd: third test for multi-threaded exec polling selftests/pidfd: second test for multi-threaded exec polling selftests/pidfd: first test for multi-threaded exec polling pidfs: improve multi-threaded exec and premature thread-group leader exit polling pidfs: ensure that PIDFS_INFO_EXIT is available selftests/pidfd: add seventh PIDFD_INFO_EXIT selftest selftests/pidfd: add sixth PIDFD_INFO_EXIT selftest selftests/pidfd: add fifth PIDFD_INFO_EXIT selftest selftests/pidfd: add fourth PIDFD_INFO_EXIT selftest selftests/pidfd: add third PIDFD_INFO_EXIT selftest selftests/pidfd: add second PIDFD_INFO_EXIT selftest selftests/pidfd: add first PIDFD_INFO_EXIT selftest selftests/pidfd: expand common pidfd header pidfs/selftests: ensure correct headers for ioctl handling selftests/pidfd: fix header inclusion pidfs: allow to retrieve exit information pidfs: record exit code and cgroupid at exit pidfs: use private inode slab cache pidfs: move setting flags into pidfs_alloc_file() pidfd: rely on automatic cleanup in __pidfd_prepare() ...
2 parents 71ee2fd + d40dc30 commit df00ded

19 files changed

Lines changed: 1241 additions & 192 deletions

File tree

fs/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,7 @@ struct stashed_operations {
324324
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
325325
struct path *path);
326326
void stashed_dentry_prune(struct dentry *dentry);
327+
struct dentry *stashed_dentry_get(struct dentry **stashed);
327328
/**
328329
* path_mounted - check whether path is mounted
329330
* @path: path to check

fs/libfs.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
21132113
}
21142114
EXPORT_SYMBOL(simple_inode_init_ts);
21152115

2116-
static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
2116+
struct dentry *stashed_dentry_get(struct dentry **stashed)
21172117
{
21182118
struct dentry *dentry;
21192119

@@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
22152215
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
22162216

22172217
/* See if dentry can be reused. */
2218-
path->dentry = get_stashed_dentry(stashed);
2218+
path->dentry = stashed_dentry_get(stashed);
22192219
if (path->dentry) {
22202220
sops->put_data(data);
22212221
goto out_path;

fs/pidfs.c

Lines changed: 221 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,28 @@
2424
#include "internal.h"
2525
#include "mount.h"
2626

27+
static struct kmem_cache *pidfs_cachep __ro_after_init;
28+
29+
/*
30+
* Stashes information that userspace needs to access even after the
31+
* process has been reaped.
32+
*/
33+
struct pidfs_exit_info {
34+
__u64 cgroupid;
35+
__s32 exit_code;
36+
};
37+
38+
struct pidfs_inode {
39+
struct pidfs_exit_info __pei;
40+
struct pidfs_exit_info *exit_info;
41+
struct inode vfs_inode;
42+
};
43+
44+
static inline struct pidfs_inode *pidfs_i(struct inode *inode)
45+
{
46+
return container_of(inode, struct pidfs_inode, vfs_inode);
47+
}
48+
2749
static struct rb_root pidfs_ino_tree = RB_ROOT;
2850

2951
#if BITS_PER_LONG == 32
@@ -188,36 +210,48 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
188210
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
189211
{
190212
struct pid *pid = pidfd_pid(file);
191-
bool thread = file->f_flags & PIDFD_THREAD;
192213
struct task_struct *task;
193214
__poll_t poll_flags = 0;
194215

195216
poll_wait(file, &pid->wait_pidfd, pts);
196217
/*
197-
* Depending on PIDFD_THREAD, inform pollers when the thread
198-
* or the whole thread-group exits.
218+
* Don't wake waiters if the thread-group leader exited
219+
* prematurely. They either get notified when the last subthread
220+
* exits or not at all if one of the remaining subthreads execs
221+
* and assumes the struct pid of the old thread-group leader.
199222
*/
200223
guard(rcu)();
201224
task = pid_task(pid, PIDTYPE_PID);
202225
if (!task)
203226
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
204-
else if (task->exit_state && (thread || thread_group_empty(task)))
227+
else if (task->exit_state && !delay_group_leader(task))
205228
poll_flags = EPOLLIN | EPOLLRDNORM;
206229

207230
return poll_flags;
208231
}
209232

210-
static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
233+
static inline bool pid_in_current_pidns(const struct pid *pid)
234+
{
235+
const struct pid_namespace *ns = task_active_pid_ns(current);
236+
237+
if (ns->level <= pid->level)
238+
return pid->numbers[ns->level].ns == ns;
239+
240+
return false;
241+
}
242+
243+
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
211244
{
212245
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
246+
struct inode *inode = file_inode(file);
247+
struct pid *pid = pidfd_pid(file);
213248
size_t usize = _IOC_SIZE(cmd);
214249
struct pidfd_info kinfo = {};
250+
struct pidfs_exit_info *exit_info;
215251
struct user_namespace *user_ns;
252+
struct task_struct *task;
216253
const struct cred *c;
217254
__u64 mask;
218-
#ifdef CONFIG_CGROUPS
219-
struct cgroup *cgrp;
220-
#endif
221255

222256
if (!uinfo)
223257
return -EINVAL;
@@ -227,6 +261,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
227261
if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
228262
return -EFAULT;
229263

264+
/*
265+
* Restrict information retrieval to tasks within the caller's pid
266+
* namespace hierarchy.
267+
*/
268+
if (!pid_in_current_pidns(pid))
269+
return -ESRCH;
270+
271+
if (mask & PIDFD_INFO_EXIT) {
272+
exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
273+
if (exit_info) {
274+
kinfo.mask |= PIDFD_INFO_EXIT;
275+
#ifdef CONFIG_CGROUPS
276+
kinfo.cgroupid = exit_info->cgroupid;
277+
kinfo.mask |= PIDFD_INFO_CGROUPID;
278+
#endif
279+
kinfo.exit_code = exit_info->exit_code;
280+
}
281+
}
282+
283+
task = get_pid_task(pid, PIDTYPE_PID);
284+
if (!task) {
285+
/*
286+
* If the task has already been reaped, only exit
287+
* information is available
288+
*/
289+
if (!(mask & PIDFD_INFO_EXIT))
290+
return -ESRCH;
291+
292+
goto copy_out;
293+
}
294+
230295
c = get_task_cred(task);
231296
if (!c)
232297
return -ESRCH;
@@ -246,11 +311,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
246311
put_cred(c);
247312

248313
#ifdef CONFIG_CGROUPS
249-
rcu_read_lock();
250-
cgrp = task_dfl_cgroup(task);
251-
kinfo.cgroupid = cgroup_id(cgrp);
252-
kinfo.mask |= PIDFD_INFO_CGROUPID;
253-
rcu_read_unlock();
314+
if (!kinfo.cgroupid) {
315+
struct cgroup *cgrp;
316+
317+
rcu_read_lock();
318+
cgrp = task_dfl_cgroup(task);
319+
kinfo.cgroupid = cgroup_id(cgrp);
320+
kinfo.mask |= PIDFD_INFO_CGROUPID;
321+
rcu_read_unlock();
322+
}
254323
#endif
255324

256325
/*
@@ -270,16 +339,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
270339
if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
271340
return -ESRCH;
272341

342+
copy_out:
273343
/*
274344
* If userspace and the kernel have the same struct size it can just
275345
* be copied. If userspace provides an older struct, only the bits that
276346
* userspace knows about will be copied. If userspace provides a new
277347
* struct, only the bits that the kernel knows about will be copied.
278348
*/
279-
if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
280-
return -EFAULT;
281-
282-
return 0;
349+
return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
283350
}
284351

285352
static bool pidfs_ioctl_valid(unsigned int cmd)
@@ -317,7 +384,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
317384
{
318385
struct task_struct *task __free(put_task) = NULL;
319386
struct nsproxy *nsp __free(put_nsproxy) = NULL;
320-
struct pid *pid = pidfd_pid(file);
321387
struct ns_common *ns_common = NULL;
322388
struct pid_namespace *pid_ns;
323389

@@ -332,13 +398,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
332398
return put_user(file_inode(file)->i_generation, argp);
333399
}
334400

335-
task = get_pid_task(pid, PIDTYPE_PID);
336-
if (!task)
337-
return -ESRCH;
338-
339401
/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
340402
if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
341-
return pidfd_info(task, cmd, arg);
403+
return pidfd_info(file, cmd, arg);
404+
405+
task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
406+
if (!task)
407+
return -ESRCH;
342408

343409
if (arg)
344410
return -EINVAL;
@@ -450,6 +516,49 @@ struct pid *pidfd_pid(const struct file *file)
450516
return file_inode(file)->i_private;
451517
}
452518

519+
/*
520+
* We're called from release_task(). We know there's at least one
521+
* reference to struct pid being held that won't be released until the
522+
* task has been reaped which cannot happen until we're out of
523+
* release_task().
524+
*
525+
* If this struct pid is referred to by a pidfd then
526+
* stashed_dentry_get() will return the dentry and inode for that struct
527+
* pid. Since we've taken a reference on it there's now an additional
528+
* reference from the exit path on it. Which is fine. We're going to put
529+
* it again in a second and we know that the pid is kept alive anyway.
530+
*
531+
* Worst case is that we've filled in the info and immediately free the
532+
* dentry and inode afterwards since the pidfd has been closed. Since
533+
* pidfs_exit() currently is placed after exit_task_work() we know that
534+
* it cannot be us aka the exiting task holding a pidfd to ourselves.
535+
*/
536+
void pidfs_exit(struct task_struct *tsk)
537+
{
538+
struct dentry *dentry;
539+
540+
might_sleep();
541+
542+
dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
543+
if (dentry) {
544+
struct inode *inode = d_inode(dentry);
545+
struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
546+
#ifdef CONFIG_CGROUPS
547+
struct cgroup *cgrp;
548+
549+
rcu_read_lock();
550+
cgrp = task_dfl_cgroup(tsk);
551+
exit_info->cgroupid = cgroup_id(cgrp);
552+
rcu_read_unlock();
553+
#endif
554+
exit_info->exit_code = tsk->exit_code;
555+
556+
/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
557+
smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
558+
dput(dentry);
559+
}
560+
}
561+
453562
static struct vfsmount *pidfs_mnt __ro_after_init;
454563

455564
/*
@@ -505,9 +614,30 @@ static void pidfs_evict_inode(struct inode *inode)
505614
put_pid(pid);
506615
}
507616

617+
static struct inode *pidfs_alloc_inode(struct super_block *sb)
618+
{
619+
struct pidfs_inode *pi;
620+
621+
pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
622+
if (!pi)
623+
return NULL;
624+
625+
memset(&pi->__pei, 0, sizeof(pi->__pei));
626+
pi->exit_info = NULL;
627+
628+
return &pi->vfs_inode;
629+
}
630+
631+
static void pidfs_free_inode(struct inode *inode)
632+
{
633+
kmem_cache_free(pidfs_cachep, pidfs_i(inode));
634+
}
635+
508636
static const struct super_operations pidfs_sops = {
637+
.alloc_inode = pidfs_alloc_inode,
509638
.drop_inode = generic_delete_inode,
510639
.evict_inode = pidfs_evict_inode,
640+
.free_inode = pidfs_free_inode,
511641
.statfs = simple_statfs,
512642
};
513643

@@ -633,8 +763,49 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
633763
return 0;
634764
}
635765

766+
static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
767+
unsigned int flags)
768+
{
769+
enum pid_type type;
770+
771+
if (flags & PIDFD_CLONE)
772+
return true;
773+
774+
/*
775+
* Make sure that if a pidfd is created PIDFD_INFO_EXIT
776+
* information will be available. So after an inode for the
777+
* pidfd has been allocated perform another check that the pid
778+
* is still alive. If it is exit information is available even
779+
* if the task gets reaped before the pidfd is returned to
780+
* userspace. The only exception is PIDFD_CLONE where no task
781+
* linkage has been established for @pid yet and the kernel is
782+
* in the middle of process creation so there's nothing for
783+
* pidfs to miss.
784+
*/
785+
if (flags & PIDFD_THREAD)
786+
type = PIDTYPE_PID;
787+
else
788+
type = PIDTYPE_TGID;
789+
790+
/*
791+
* Since pidfs_exit() is called before struct pid's task linkage
792+
* is removed the case where the task got reaped but a dentry
793+
* was already attached to struct pid and exit information was
794+
* recorded and published can be handled correctly.
795+
*/
796+
if (unlikely(!pid_has_task(pid, type))) {
797+
struct inode *inode = d_inode(path->dentry);
798+
return !!READ_ONCE(pidfs_i(inode)->exit_info);
799+
}
800+
801+
return true;
802+
}
803+
636804
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
637805
{
806+
if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags))
807+
return ERR_PTR(-ESRCH);
808+
638809
/*
639810
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
640811
* O_RDWR as pidfds always are.
@@ -698,22 +869,46 @@ static struct file_system_type pidfs_type = {
698869

699870
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
700871
{
701-
702872
struct file *pidfd_file;
703-
struct path path;
873+
struct path path __free(path_put) = {};
704874
int ret;
705875

876+
/*
877+
* Ensure that PIDFD_CLONE can be passed as a flag without
878+
* overloading other uapi pidfd flags.
879+
*/
880+
BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
881+
BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
882+
706883
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
707884
if (ret < 0)
708885
return ERR_PTR(ret);
709886

887+
if (!pidfs_pid_valid(pid, &path, flags))
888+
return ERR_PTR(-ESRCH);
889+
890+
flags &= ~PIDFD_CLONE;
710891
pidfd_file = dentry_open(&path, flags, current_cred());
711-
path_put(&path);
892+
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
893+
if (!IS_ERR(pidfd_file))
894+
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
895+
712896
return pidfd_file;
713897
}
714898

899+
static void pidfs_inode_init_once(void *data)
900+
{
901+
struct pidfs_inode *pi = data;
902+
903+
inode_init_once(&pi->vfs_inode);
904+
}
905+
715906
void __init pidfs_init(void)
716907
{
908+
pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
909+
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
910+
SLAB_ACCOUNT | SLAB_PANIC),
911+
pidfs_inode_init_once);
717912
pidfs_mnt = kern_mount(&pidfs_type);
718913
if (IS_ERR(pidfs_mnt))
719914
panic("Failed to mount pidfs pseudo filesystem");

0 commit comments

Comments
 (0)