Skip to content

Commit 543b9b6

Browse files
committed
Merge tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull pidfs updates from Christian Brauner:

 - pid: introduce task_ppid_vnr() helper

 - pidfs: convert rb-tree to rhashtable

   Mateusz reported performance penalties during task creation because pidfs uses pidmap_lock to add elements into the rbtree. Switch to an rhashtable to have separate fine-grained locking and to decouple from pidmap_lock, moving all heavy manipulations outside of it.

   Also move inode allocation outside of pidmap_lock. With this there's nothing happening for pidfs under pidmap_lock.

 - pid: reorder fields in pid_namespace to reduce false sharing

 - Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers"

 - ipc: Add SPDX license id to mqueue.c

* tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  pid: introduce task_ppid_vnr() helper
  pidfs: implement ino allocation without the pidmap lock
  Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers"
  pid: reorder fields in pid_namespace to reduce false sharing
  pidfs: convert rb-tree to rhashtable
  ipc: Add SPDX license id to mqueue.c
2 parents 57d76ce + 3673dd3 commit 543b9b6

6 files changed

Lines changed: 121 additions & 97 deletions

File tree

fs/pidfs.c

Lines changed: 95 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@
2121
#include <linux/utsname.h>
2222
#include <net/net_namespace.h>
2323
#include <linux/coredump.h>
24+
#include <linux/rhashtable.h>
2425
#include <linux/xattr.h>
26+
#include <linux/cookie.h>
2527

2628
#include "internal.h"
2729
#include "mount.h"
@@ -55,9 +57,48 @@ struct pidfs_attr {
5557
__u32 coredump_signal;
5658
};
5759

58-
static struct rb_root pidfs_ino_tree = RB_ROOT;
60+
static struct rhashtable pidfs_ino_ht;
61+
62+
static const struct rhashtable_params pidfs_ino_ht_params = {
63+
.key_offset = offsetof(struct pid, ino),
64+
.key_len = sizeof(u64),
65+
.head_offset = offsetof(struct pid, pidfs_hash),
66+
.automatic_shrinking = true,
67+
};
68+
69+
/*
70+
* inode number handling
71+
*
72+
* On 64 bit nothing special happens. The 64 bit number assigned
73+
* to struct pid is the inode number.
74+
*
75+
* On 32 bit the 64 bit number assigned to struct pid is split
76+
* into two 32 bit numbers. The lower 32 bits are used as the
77+
* inode number and the upper 32 bits are used as the inode
78+
* generation number.
79+
*
80+
* On 32 bit pidfs_ino() will return the lower 32 bits. When
81+
* pidfs_ino() returns zero a wrap around happened. When a
82+
* wraparound happens the 64 bit number will be incremented by 1
83+
* so inode numbering starts at 1 again.
84+
*
85+
* On 64 bit comparing two pidfds is as simple as comparing
86+
* inode numbers.
87+
*
88+
* When a wraparound happens on 32 bit multiple pidfds with the
89+
* same inode number are likely to exist (This isn't a problem
90+
* since before pidfs pidfds used the anonymous inode meaning
91+
* all pidfds had the same inode number.). Userspace can
92+
* reconstruct the 64 bit identifier by retrieving both the
93+
* inode number and the inode generation number to compare or
94+
* use file handles.
95+
*/
5996

6097
#if BITS_PER_LONG == 32
98+
99+
DEFINE_SPINLOCK(pidfs_ino_lock);
100+
static u64 pidfs_ino_nr = 1;
101+
61102
static inline unsigned long pidfs_ino(u64 ino)
62103
{
63104
return lower_32_bits(ino);
@@ -69,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
69110
return upper_32_bits(ino);
70111
}
71112

113+
static inline u64 pidfs_alloc_ino(void)
114+
{
115+
u64 ino;
116+
117+
spin_lock(&pidfs_ino_lock);
118+
if (pidfs_ino(pidfs_ino_nr) == 0)
119+
pidfs_ino_nr++;
120+
ino = pidfs_ino_nr++;
121+
spin_unlock(&pidfs_ino_lock);
122+
return ino;
123+
}
124+
72125
#else
73126

74127
/* On 64 bit simply return ino. */
@@ -82,69 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
82135
{
83136
return 0;
84137
}
85-
#endif
86138

87-
static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
88-
{
89-
struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
90-
struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
91-
u64 pid_ino_a = pid_a->ino;
92-
u64 pid_ino_b = pid_b->ino;
93-
94-
if (pid_ino_a < pid_ino_b)
95-
return -1;
96-
if (pid_ino_a > pid_ino_b)
97-
return 1;
98-
return 0;
99-
}
139+
DEFINE_COOKIE(pidfs_ino_cookie);
100140

101-
void pidfs_add_pid(struct pid *pid)
141+
static u64 pidfs_alloc_ino(void)
102142
{
103-
static u64 pidfs_ino_nr = 2;
143+
u64 ino;
104144

105-
/*
106-
* On 64 bit nothing special happens. The 64bit number assigned
107-
* to struct pid is the inode number.
108-
*
109-
* On 32 bit the 64 bit number assigned to struct pid is split
110-
* into two 32 bit numbers. The lower 32 bits are used as the
111-
* inode number and the upper 32 bits are used as the inode
112-
* generation number.
113-
*
114-
* On 32 bit pidfs_ino() will return the lower 32 bit. When
115-
* pidfs_ino() returns zero a wrap around happened. When a
116-
* wraparound happens the 64 bit number will be incremented by 2
117-
* so inode numbering starts at 2 again.
118-
*
119-
* On 64 bit comparing two pidfds is as simple as comparing
120-
* inode numbers.
121-
*
122-
* When a wraparound happens on 32 bit multiple pidfds with the
123-
* same inode number are likely to exist (This isn't a problem
124-
* since before pidfs pidfds used the anonymous inode meaning
125-
* all pidfds had the same inode number.). Userspace can
126-
* reconstruct the 64 bit identifier by retrieving both the
127-
* inode number and the inode generation number to compare or
128-
* use file handles.
129-
*/
130-
if (pidfs_ino(pidfs_ino_nr) == 0)
131-
pidfs_ino_nr += 2;
145+
preempt_disable();
146+
ino = gen_cookie_next(&pidfs_ino_cookie);
147+
preempt_enable();
132148

133-
pid->ino = pidfs_ino_nr;
149+
VFS_WARN_ON_ONCE(ino < 1);
150+
return ino;
151+
}
152+
153+
#endif
154+
155+
void pidfs_prepare_pid(struct pid *pid)
156+
{
134157
pid->stashed = NULL;
135158
pid->attr = NULL;
136-
pidfs_ino_nr++;
159+
pid->ino = 0;
160+
}
137161

138-
write_seqcount_begin(&pidmap_lock_seq);
139-
rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
140-
write_seqcount_end(&pidmap_lock_seq);
162+
int pidfs_add_pid(struct pid *pid)
163+
{
164+
int ret;
165+
166+
pid->ino = pidfs_alloc_ino();
167+
ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
168+
pidfs_ino_ht_params);
169+
if (unlikely(ret))
170+
pid->ino = 0;
171+
return ret;
141172
}
142173

143174
void pidfs_remove_pid(struct pid *pid)
144175
{
145-
write_seqcount_begin(&pidmap_lock_seq);
146-
rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
147-
write_seqcount_end(&pidmap_lock_seq);
176+
if (likely(pid->ino))
177+
rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
178+
pidfs_ino_ht_params);
148179
}
149180

150181
void pidfs_free_pid(struct pid *pid)
@@ -415,7 +446,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
415446
* the fields are set correctly, or return ESRCH to avoid providing
416447
* incomplete information. */
417448

418-
kinfo.ppid = task_ppid_nr_ns(task, NULL);
449+
kinfo.ppid = task_ppid_vnr(task);
419450
kinfo.tgid = task_tgid_vnr(task);
420451
kinfo.pid = task_pid_vnr(task);
421452
kinfo.mask |= PIDFD_INFO_PID;
@@ -791,42 +822,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
791822
return FILEID_KERNFS;
792823
}
793824

794-
static int pidfs_ino_find(const void *key, const struct rb_node *node)
795-
{
796-
const u64 pid_ino = *(u64 *)key;
797-
const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
798-
799-
if (pid_ino < pid->ino)
800-
return -1;
801-
if (pid_ino > pid->ino)
802-
return 1;
803-
return 0;
804-
}
805-
806825
/* Find a struct pid based on the inode number. */
807826
static struct pid *pidfs_ino_get_pid(u64 ino)
808827
{
809828
struct pid *pid;
810-
struct rb_node *node;
811-
unsigned int seq;
829+
struct pidfs_attr *attr;
812830

813831
guard(rcu)();
814-
do {
815-
seq = read_seqcount_begin(&pidmap_lock_seq);
816-
node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
817-
if (node)
818-
break;
819-
} while (read_seqcount_retry(&pidmap_lock_seq, seq));
820-
821-
if (!node)
832+
pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
833+
if (!pid)
834+
return NULL;
835+
attr = READ_ONCE(pid->attr);
836+
if (IS_ERR_OR_NULL(attr))
837+
return NULL;
838+
if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
822839
return NULL;
823-
824-
pid = rb_entry(node, struct pid, pidfs_node);
825-
826840
/* Within our pid namespace hierarchy? */
827841
if (pid_vnr(pid) == 0)
828842
return NULL;
829-
830843
return get_pid(pid);
831844
}
832845

@@ -1104,6 +1117,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
11041117

11051118
void __init pidfs_init(void)
11061119
{
1120+
if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
1121+
panic("Failed to initialize pidfs hashtable");
1122+
11071123
pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
11081124
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
11091125
SLAB_ACCOUNT | SLAB_PANIC), NULL);

include/linux/pid.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <linux/rculist.h>
77
#include <linux/rcupdate.h>
88
#include <linux/refcount.h>
9+
#include <linux/rhashtable-types.h>
910
#include <linux/sched.h>
1011
#include <linux/wait.h>
1112

@@ -60,7 +61,7 @@ struct pid {
6061
spinlock_t lock;
6162
struct {
6263
u64 ino;
63-
struct rb_node pidfs_node;
64+
struct rhash_head pidfs_hash;
6465
struct dentry *stashed;
6566
struct pidfs_attr *attr;
6667
};
@@ -73,7 +74,6 @@ struct pid {
7374
struct upid numbers[];
7475
};
7576

76-
extern seqcount_spinlock_t pidmap_lock_seq;
7777
extern struct pid init_struct_pid;
7878

7979
struct file;
@@ -310,6 +310,11 @@ static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_na
310310
return pid;
311311
}
312312

313+
static inline pid_t task_ppid_vnr(const struct task_struct *tsk)
314+
{
315+
return task_ppid_nr_ns(tsk, NULL);
316+
}
317+
313318
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
314319
{
315320
return task_ppid_nr_ns(tsk, &init_pid_ns);

include/linux/pid_namespace.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ struct pid_namespace {
2727
struct idr idr;
2828
struct rcu_head rcu;
2929
unsigned int pid_allocated;
30+
#ifdef CONFIG_SYSCTL
31+
#if defined(CONFIG_MEMFD_CREATE)
32+
int memfd_noexec_scope;
33+
#endif
34+
struct ctl_table_set set;
35+
struct ctl_table_header *sysctls;
36+
#endif
3037
struct task_struct *child_reaper;
3138
struct kmem_cache *pid_cachep;
3239
unsigned int level;
@@ -40,13 +47,6 @@ struct pid_namespace {
4047
int reboot; /* group exit code if this pidns was rebooted */
4148
struct ns_common ns;
4249
struct work_struct work;
43-
#ifdef CONFIG_SYSCTL
44-
struct ctl_table_set set;
45-
struct ctl_table_header *sysctls;
46-
#if defined(CONFIG_MEMFD_CREATE)
47-
int memfd_noexec_scope;
48-
#endif
49-
#endif
5050
} __randomize_layout;
5151

5252
extern struct pid_namespace init_pid_ns;

include/linux/pidfs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ struct coredump_params;
66

77
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
88
void __init pidfs_init(void);
9-
void pidfs_add_pid(struct pid *pid);
9+
void pidfs_prepare_pid(struct pid *pid);
10+
int pidfs_add_pid(struct pid *pid);
1011
void pidfs_remove_pid(struct pid *pid);
1112
void pidfs_exit(struct task_struct *tsk);
1213
#ifdef CONFIG_COREDUMP

ipc/mqueue.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// SPDX-License-Identifier: GPL-2.0
12
/*
23
* POSIX message queues filesystem for Linux.
34
*
@@ -9,8 +10,6 @@
910
* Manfred Spraul (manfred@colorfullife.com)
1011
*
1112
* Audit: George Wilson (ltcgcw@us.ibm.com)
12-
*
13-
* This file is released under the GPL.
1413
*/
1514

1615
#include <linux/capability.h>

0 commit comments

Comments
 (0)