Skip to content

Commit 4e3d1e6

Browse files
committed
Merge patch series "pidfs: persistent info & xattrs"
Christian Brauner <brauner@kernel.org> says: Persist exit and coredump information independent of whether anyone currently holds a pidfd for the struct pid. The current scheme allocated pidfs dentries on-demand repeatedly. This scheme is reaching it's limits as it makes it impossible to pin information that needs to be available after the task has exited or coredumped and that should not be lost simply because the pidfd got closed temporarily. The next opener should still see the stashed information. This is also a prerequisite for supporting extended attributes on pidfds to allow attaching meta information to them. If someone opens a pidfd for a struct pid a pidfs dentry is allocated and stashed in pid->stashed. Once the last pidfd for the struct pid is closed the pidfs dentry is released and removed from pid->stashed. So if 10 callers create a pidfs dentry for the same struct pid sequentially, i.e., each closing the pidfd before the other creates a new one then a new pidfs dentry is allocated every time. Because multiple tasks acquiring and releasing a pidfd for the same struct pid can race with each another a task may still find a valid pidfs entry from the previous task in pid->stashed and reuse it. Or it might find a dead dentry in there and fail to reuse it and so stashes a new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry but only one will succeed, the other ones will put their dentry. The current scheme aims to ensure that a pidfs dentry for a struct pid can only be created if the task is still alive or if a pidfs dentry already existed before the task was reaped and so exit information has been was stashed in the pidfs inode. That's great except that it's buggy. If a pidfs dentry is stashed in pid->stashed after pidfs_exit() but before __unhash_process() is called we will return a pidfd for a reaped task without exit information being available. The pidfds_pid_valid() check does not guard against this race as it doens't sync at all with pidfs_exit(). The pid_has_task() check might be successful simply because we're before __unhash_process() but after pidfs_exit(). Introduce a new scheme where the lifetime of information associated with a pidfs entry (coredump and exit information) isn't bound to the lifetime of the pidfs inode but the struct pid itself. The first time a pidfs dentry is allocated for a struct pid a struct pidfs_attr will be allocated which will be used to store exit and coredump information. If all pidfs for the pidfs dentry are closed the dentry and inode can be cleaned up but the struct pidfs_attr will stick until the struct pid itself is freed. This will ensure minimal memory usage while persisting relevant information. The new scheme has various advantages. First, it allows to close the race where we end up handing out a pidfd for a reaped task for which no exit information is available. Second, it minimizes memory usage. Third, it allows to remove complex lifetime tracking via dentries when registering a struct pid with pidfs. There's no need to get or put a reference. Instead, the lifetime of exit and coredump information associated with a struct pid is bound to the lifetime of struct pid itself. Now that we have a way to persist information for pidfs dentries we can start supporting extended attributes on pidfds. This will allow userspace to attach meta information to tasks. One natural extension would be to introduce a custom pidfs.* extended attribute space and allow for the inheritance of extended attributes across fork() and exec(). The first simple scheme will allow privileged userspace to set trusted extended attributes on pidfs inodes. * patches from https://lore.kernel.org/20250618-work-pidfs-persistent-v2-0-98f3456fd552@kernel.org: pidfs: add some CONFIG_DEBUG_VFS asserts selftests/pidfd: test setattr support selftests/pidfd: test extended attribute support selftests/pidfd: test extended attribute support pidfs: support xattrs on pidfds pidfs: make inodes mutable libfs: prepare to allow for non-immutable pidfd inodes pidfs: remove pidfs_pid_valid() pidfs: remove pidfs_{get,put}_pid() pidfs: remove custom inode allocation pidfs: remove unused members from struct pidfs_inode pidfs: persist information pidfs: move to anonymous struct libfs: massage path_from_stashed() libfs: massage path_from_stashed() to allow custom stashing behavior pidfs: raise SB_I_NODEV and SB_I_NOEXEC Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-0-98f3456fd552@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 19272b3 + f9fac1f commit 4e3d1e6

12 files changed

Lines changed: 481 additions & 215 deletions

File tree

fs/coredump.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -898,12 +898,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
898898
retval = kernel_connect(socket, (struct sockaddr *)(&addr),
899899
addr_len, O_NONBLOCK | SOCK_COREDUMP);
900900

901-
/*
902-
* ... Make sure to only put our reference after connect() took
903-
* its own reference keeping the pidfs entry alive ...
904-
*/
905-
pidfs_put_pid(cprm.pid);
906-
907901
if (retval) {
908902
if (retval == -EAGAIN)
909903
coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);

fs/internal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,12 +322,15 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
322322
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
323323
void mnt_idmap_put(struct mnt_idmap *idmap);
324324
struct stashed_operations {
325+
struct dentry *(*stash_dentry)(struct dentry **stashed,
326+
struct dentry *dentry);
325327
void (*put_data)(void *data);
326328
int (*init_inode)(struct inode *inode, void *data);
327329
};
328330
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
329331
struct path *path);
330332
void stashed_dentry_prune(struct dentry *dentry);
333+
struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry);
331334
struct dentry *stashed_dentry_get(struct dentry **stashed);
332335
/**
333336
* path_mounted - check whether path is mounted

fs/libfs.c

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2128,6 +2128,8 @@ struct dentry *stashed_dentry_get(struct dentry **stashed)
21282128
dentry = rcu_dereference(*stashed);
21292129
if (!dentry)
21302130
return NULL;
2131+
if (IS_ERR(dentry))
2132+
return dentry;
21312133
if (!lockref_get_not_dead(&dentry->d_lockref))
21322134
return NULL;
21332135
return dentry;
@@ -2160,7 +2162,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
21602162

21612163
/* Notice when this is changed. */
21622164
WARN_ON_ONCE(!S_ISREG(inode->i_mode));
2163-
WARN_ON_ONCE(!IS_IMMUTABLE(inode));
21642165

21652166
dentry = d_alloc_anon(sb);
21662167
if (!dentry) {
@@ -2176,8 +2177,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
21762177
return dentry;
21772178
}
21782179

2179-
static struct dentry *stash_dentry(struct dentry **stashed,
2180-
struct dentry *dentry)
2180+
struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry)
21812181
{
21822182
guard(rcu)();
21832183
for (;;) {
@@ -2218,14 +2218,16 @@ static struct dentry *stash_dentry(struct dentry **stashed,
22182218
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
22192219
struct path *path)
22202220
{
2221-
struct dentry *dentry;
2221+
struct dentry *dentry, *res;
22222222
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
22232223

22242224
/* See if dentry can be reused. */
2225-
path->dentry = stashed_dentry_get(stashed);
2226-
if (path->dentry) {
2225+
res = stashed_dentry_get(stashed);
2226+
if (IS_ERR(res))
2227+
return PTR_ERR(res);
2228+
if (res) {
22272229
sops->put_data(data);
2228-
goto out_path;
2230+
goto make_path;
22292231
}
22302232

22312233
/* Allocate a new dentry. */
@@ -2234,14 +2236,22 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
22342236
return PTR_ERR(dentry);
22352237

22362238
/* Added a new dentry. @data is now owned by the filesystem. */
2237-
path->dentry = stash_dentry(stashed, dentry);
2238-
if (path->dentry != dentry)
2239+
if (sops->stash_dentry)
2240+
res = sops->stash_dentry(stashed, dentry);
2241+
else
2242+
res = stash_dentry(stashed, dentry);
2243+
if (IS_ERR(res)) {
2244+
dput(dentry);
2245+
return PTR_ERR(res);
2246+
}
2247+
if (res != dentry)
22392248
dput(dentry);
22402249

2241-
out_path:
2242-
WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
2243-
WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
2250+
make_path:
2251+
path->dentry = res;
22442252
path->mnt = mntget(mnt);
2253+
VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
2254+
VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
22452255
return 0;
22462256
}
22472257

0 commit comments

Comments
 (0)