Skip to content

Commit 8ec7c82

Browse files
committed
pidfs: persist information
Persist exit and coredump information independent of whether anyone currently holds a pidfd for the struct pid. The current scheme allocated pidfs dentries on-demand repeatedly. This scheme is reaching it's limits as it makes it impossible to pin information that needs to be available after the task has exited or coredumped and that should not be lost simply because the pidfd got closed temporarily. The next opener should still see the stashed information. This is also a prerequisite for supporting extended attributes on pidfds to allow attaching meta information to them. If someone opens a pidfd for a struct pid a pidfs dentry is allocated and stashed in pid->stashed. Once the last pidfd for the struct pid is closed the pidfs dentry is released and removed from pid->stashed. So if 10 callers create a pidfs dentry for the same struct pid sequentially, i.e., each closing the pidfd before the other creates a new one then a new pidfs dentry is allocated every time. Because multiple tasks acquiring and releasing a pidfd for the same struct pid can race with each another a task may still find a valid pidfs entry from the previous task in pid->stashed and reuse it. Or it might find a dead dentry in there and fail to reuse it and so stashes a new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry but only one will succeed, the other ones will put their dentry. The current scheme aims to ensure that a pidfs dentry for a struct pid can only be created if the task is still alive or if a pidfs dentry already existed before the task was reaped and so exit information has been was stashed in the pidfs inode. That's great except that it's buggy. If a pidfs dentry is stashed in pid->stashed after pidfs_exit() but before __unhash_process() is called we will return a pidfd for a reaped task without exit information being available. The pidfds_pid_valid() check does not guard against this race as it doens't sync at all with pidfs_exit(). The pid_has_task() check might be successful simply because we're before __unhash_process() but after pidfs_exit(). Introduce a new scheme where the lifetime of information associated with a pidfs entry (coredump and exit information) isn't bound to the lifetime of the pidfs inode but the struct pid itself. The first time a pidfs dentry is allocated for a struct pid a struct pidfs_attr will be allocated which will be used to store exit and coredump information. If all pidfs for the pidfs dentry are closed the dentry and inode can be cleaned up but the struct pidfs_attr will stick until the struct pid itself is freed. This will ensure minimal memory usage while persisting relevant information. The new scheme has various advantages. First, it allows to close the race where we end up handing out a pidfd for a reaped task for which no exit information is available. Second, it minimizes memory usage. Third, it allows to remove complex lifetime tracking via dentries when registering a struct pid with pidfs. There's no need to get or put a reference. Instead, the lifetime of exit and coredump information associated with a struct pid is bound to the lifetime of struct pid itself. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com> Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent 75215c9 commit 8ec7c82

4 files changed

Lines changed: 151 additions & 67 deletions

File tree

fs/pidfs.c

Lines changed: 146 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@
2525
#include "internal.h"
2626
#include "mount.h"
2727

28+
#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
29+
2830
static struct kmem_cache *pidfs_cachep __ro_after_init;
31+
static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
2932

3033
/*
3134
* Stashes information that userspace needs to access even after the
@@ -37,6 +40,11 @@ struct pidfs_exit_info {
3740
__u32 coredump_mask;
3841
};
3942

43+
struct pidfs_attr {
44+
struct pidfs_exit_info __pei;
45+
struct pidfs_exit_info *exit_info;
46+
};
47+
4048
struct pidfs_inode {
4149
struct pidfs_exit_info __pei;
4250
struct pidfs_exit_info *exit_info;
@@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid)
125133

126134
pid->ino = pidfs_ino_nr;
127135
pid->stashed = NULL;
136+
pid->attr = NULL;
128137
pidfs_ino_nr++;
129138

130139
write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid)
139148
write_seqcount_end(&pidmap_lock_seq);
140149
}
141150

151+
void pidfs_free_pid(struct pid *pid)
152+
{
153+
/*
154+
* Any dentry must've been wiped from the pid by now.
155+
* Otherwise there's a reference count bug.
156+
*/
157+
VFS_WARN_ON_ONCE(pid->stashed);
158+
159+
if (!IS_ERR(pid->attr))
160+
kfree(pid->attr);
161+
}
162+
142163
#ifdef CONFIG_PROC_FS
143164
/**
144165
* pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
261282
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
262283
{
263284
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
264-
struct inode *inode = file_inode(file);
265285
struct pid *pid = pidfd_pid(file);
266286
size_t usize = _IOC_SIZE(cmd);
267287
struct pidfd_info kinfo = {};
268288
struct pidfs_exit_info *exit_info;
269289
struct user_namespace *user_ns;
270290
struct task_struct *task;
291+
struct pidfs_attr *attr;
271292
const struct cred *c;
272293
__u64 mask;
273294

@@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
286307
if (!pid_in_current_pidns(pid))
287308
return -ESRCH;
288309

310+
attr = READ_ONCE(pid->attr);
289311
if (mask & PIDFD_INFO_EXIT) {
290-
exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
312+
exit_info = READ_ONCE(attr->exit_info);
291313
if (exit_info) {
292314
kinfo.mask |= PIDFD_INFO_EXIT;
293315
#ifdef CONFIG_CGROUPS
@@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
300322

301323
if (mask & PIDFD_INFO_COREDUMP) {
302324
kinfo.mask |= PIDFD_INFO_COREDUMP;
303-
kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
325+
kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
304326
}
305327

306328
task = get_pid_task(pid, PIDTYPE_PID);
@@ -552,58 +574,77 @@ struct pid *pidfd_pid(const struct file *file)
552574
* task has been reaped which cannot happen until we're out of
553575
* release_task().
554576
*
555-
* If this struct pid is referred to by a pidfd then
556-
* stashed_dentry_get() will return the dentry and inode for that struct
557-
* pid. Since we've taken a reference on it there's now an additional
558-
* reference from the exit path on it. Which is fine. We're going to put
559-
* it again in a second and we know that the pid is kept alive anyway.
577+
* If this struct pid has at least once been referred to by a pidfd then
578+
* pid->attr will be allocated. If not we mark the struct pid as dead so
579+
* anyone who is trying to register it with pidfs will fail to do so.
580+
* Otherwise we would hand out pidfs for reaped tasks without having
581+
* exit information available.
560582
*
561-
* Worst case is that we've filled in the info and immediately free the
562-
* dentry and inode afterwards since the pidfd has been closed. Since
583+
* Worst case is that we've filled in the info and the pid gets freed
584+
* right away in free_pid() when no one holds a pidfd anymore. Since
563585
* pidfs_exit() currently is placed after exit_task_work() we know that
564-
* it cannot be us aka the exiting task holding a pidfd to ourselves.
586+
* it cannot be us aka the exiting task holding a pidfd to itself.
565587
*/
566588
void pidfs_exit(struct task_struct *tsk)
567589
{
568-
struct dentry *dentry;
590+
struct pid *pid = task_pid(tsk);
591+
struct pidfs_attr *attr;
592+
struct pidfs_exit_info *exit_info;
593+
#ifdef CONFIG_CGROUPS
594+
struct cgroup *cgrp;
595+
#endif
569596

570597
might_sleep();
571598

572-
dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
573-
if (dentry) {
574-
struct inode *inode = d_inode(dentry);
575-
struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
576-
#ifdef CONFIG_CGROUPS
577-
struct cgroup *cgrp;
599+
guard(spinlock_irq)(&pid->wait_pidfd.lock);
600+
attr = pid->attr;
601+
if (!attr) {
602+
/*
603+
* No one ever held a pidfd for this struct pid.
604+
* Mark it as dead so no one can add a pidfs
605+
* entry anymore. We're about to be reaped and
606+
* so no exit information would be available.
607+
*/
608+
pid->attr = PIDFS_PID_DEAD;
609+
return;
610+
}
578611

579-
rcu_read_lock();
580-
cgrp = task_dfl_cgroup(tsk);
581-
exit_info->cgroupid = cgroup_id(cgrp);
582-
rcu_read_unlock();
612+
/*
613+
* If @pid->attr is set someone might still legitimately hold a
614+
* pidfd to @pid or someone might concurrently still be getting
615+
* a reference to an already stashed dentry from @pid->stashed.
616+
* So defer cleaning @pid->attr until the last reference to @pid
617+
* is put
618+
*/
619+
620+
exit_info = &attr->__pei;
621+
622+
#ifdef CONFIG_CGROUPS
623+
rcu_read_lock();
624+
cgrp = task_dfl_cgroup(tsk);
625+
exit_info->cgroupid = cgroup_id(cgrp);
626+
rcu_read_unlock();
583627
#endif
584-
exit_info->exit_code = tsk->exit_code;
628+
exit_info->exit_code = tsk->exit_code;
585629

586-
/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
587-
smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
588-
dput(dentry);
589-
}
630+
/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
631+
smp_store_release(&attr->exit_info, &attr->__pei);
590632
}
591633

592634
#ifdef CONFIG_COREDUMP
593635
void pidfs_coredump(const struct coredump_params *cprm)
594636
{
595637
struct pid *pid = cprm->pid;
596638
struct pidfs_exit_info *exit_info;
597-
struct dentry *dentry;
598-
struct inode *inode;
639+
struct pidfs_attr *attr;
599640
__u32 coredump_mask = 0;
600641

601-
dentry = pid->stashed;
602-
if (WARN_ON_ONCE(!dentry))
603-
return;
642+
attr = READ_ONCE(pid->attr);
604643

605-
inode = d_inode(dentry);
606-
exit_info = &pidfs_i(inode)->__pei;
644+
VFS_WARN_ON_ONCE(!attr);
645+
VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
646+
647+
exit_info = &attr->__pei;
607648
/* Note how we were coredumped. */
608649
coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
609650
/* Note that we actually did coredump. */
@@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
663704

664705
static void pidfs_free_inode(struct inode *inode)
665706
{
666-
kmem_cache_free(pidfs_cachep, pidfs_i(inode));
707+
kfree(pidfs_i(inode));
667708
}
668709

669710
static const struct super_operations pidfs_sops = {
@@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
831872
* recorded and published can be handled correctly.
832873
*/
833874
if (unlikely(!pid_has_task(pid, type))) {
834-
struct inode *inode = d_inode(path->dentry);
835-
return !!READ_ONCE(pidfs_i(inode)->exit_info);
875+
struct pidfs_attr *attr;
876+
877+
attr = READ_ONCE(pid->attr);
878+
if (!attr)
879+
return false;
880+
if (!READ_ONCE(attr->exit_info))
881+
return false;
836882
}
837883

838884
return true;
@@ -878,9 +924,67 @@ static void pidfs_put_data(void *data)
878924
put_pid(pid);
879925
}
880926

927+
/**
928+
* pidfs_register_pid - register a struct pid in pidfs
929+
* @pid: pid to pin
930+
*
931+
* Register a struct pid in pidfs. Needs to be paired with
932+
* pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
933+
*
934+
* Return: On success zero, on error a negative error code is returned.
935+
*/
936+
int pidfs_register_pid(struct pid *pid)
937+
{
938+
struct pidfs_attr *new_attr __free(kfree) = NULL;
939+
struct pidfs_attr *attr;
940+
941+
might_sleep();
942+
943+
if (!pid)
944+
return 0;
945+
946+
attr = READ_ONCE(pid->attr);
947+
if (unlikely(attr == PIDFS_PID_DEAD))
948+
return PTR_ERR(PIDFS_PID_DEAD);
949+
if (attr)
950+
return 0;
951+
952+
new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
953+
if (!new_attr)
954+
return -ENOMEM;
955+
956+
/* Synchronize with pidfs_exit(). */
957+
guard(spinlock_irq)(&pid->wait_pidfd.lock);
958+
959+
attr = pid->attr;
960+
if (unlikely(attr == PIDFS_PID_DEAD))
961+
return PTR_ERR(PIDFS_PID_DEAD);
962+
if (unlikely(attr))
963+
return 0;
964+
965+
pid->attr = no_free_ptr(new_attr);
966+
return 0;
967+
}
968+
969+
static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
970+
struct dentry *dentry)
971+
{
972+
int ret;
973+
struct pid *pid = d_inode(dentry)->i_private;
974+
975+
VFS_WARN_ON_ONCE(stashed != &pid->stashed);
976+
977+
ret = pidfs_register_pid(pid);
978+
if (ret)
979+
return ERR_PTR(ret);
980+
981+
return stash_dentry(stashed, dentry);
982+
}
983+
881984
static const struct stashed_operations pidfs_stashed_ops = {
882-
.init_inode = pidfs_init_inode,
883-
.put_data = pidfs_put_data,
985+
.stash_dentry = pidfs_stash_dentry,
986+
.init_inode = pidfs_init_inode,
987+
.put_data = pidfs_put_data,
884988
};
885989

886990
static int pidfs_init_fs_context(struct fs_context *fc)
@@ -936,33 +1040,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
9361040
return pidfd_file;
9371041
}
9381042

939-
/**
940-
* pidfs_register_pid - register a struct pid in pidfs
941-
* @pid: pid to pin
942-
*
943-
* Register a struct pid in pidfs. Needs to be paired with
944-
* pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
945-
*
946-
* Return: On success zero, on error a negative error code is returned.
947-
*/
948-
int pidfs_register_pid(struct pid *pid)
949-
{
950-
struct path path __free(path_put) = {};
951-
int ret;
952-
953-
might_sleep();
954-
955-
if (!pid)
956-
return 0;
957-
958-
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
959-
if (unlikely(ret))
960-
return ret;
961-
/* Keep the dentry and only put the reference to the mount. */
962-
path.dentry = NULL;
963-
return 0;
964-
}
965-
9661043
/**
9671044
* pidfs_get_pid - pin a struct pid through pidfs
9681045
* @pid: pid to pin
@@ -1008,6 +1085,9 @@ void __init pidfs_init(void)
10081085
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
10091086
SLAB_ACCOUNT | SLAB_PANIC),
10101087
pidfs_inode_init_once);
1088+
pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
1089+
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
1090+
SLAB_ACCOUNT | SLAB_PANIC), NULL);
10111091
pidfs_mnt = kern_mount(&pidfs_type);
10121092
if (IS_ERR(pidfs_mnt))
10131093
panic("Failed to mount pidfs pseudo filesystem");

include/linux/pid.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@
4747

4848
#define RESERVED_PIDS 300
4949

50+
struct pidfs_attr;
51+
5052
struct upid {
5153
int nr;
5254
struct pid_namespace *ns;
@@ -60,6 +62,7 @@ struct pid {
6062
u64 ino;
6163
struct rb_node pidfs_node;
6264
struct dentry *stashed;
65+
struct pidfs_attr *attr;
6366
};
6467
/* lists of tasks that use this pid */
6568
struct hlist_head tasks[PIDTYPE_MAX];

include/linux/pidfs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations;
1616
int pidfs_register_pid(struct pid *pid);
1717
void pidfs_get_pid(struct pid *pid);
1818
void pidfs_put_pid(struct pid *pid);
19+
void pidfs_free_pid(struct pid *pid);
1920

2021
#endif /* _LINUX_PID_FS_H */

kernel/pid.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
100100

101101
ns = pid->numbers[pid->level].ns;
102102
if (refcount_dec_and_test(&pid->count)) {
103-
WARN_ON_ONCE(pid->stashed);
103+
pidfs_free_pid(pid);
104104
kmem_cache_free(ns->pid_cachep, pid);
105105
put_pid_ns(ns);
106106
}

0 commit comments

Comments
 (0)