2121#include <linux/utsname.h>
2222#include <net/net_namespace.h>
2323#include <linux/coredump.h>
24+ #include <linux/rhashtable.h>
2425#include <linux/xattr.h>
26+ #include <linux/cookie.h>
2527
2628#include "internal.h"
2729#include "mount.h"
@@ -55,9 +57,48 @@ struct pidfs_attr {
5557 __u32 coredump_signal ;
5658};
5759
58- static struct rb_root pidfs_ino_tree = RB_ROOT ;
60+ static struct rhashtable pidfs_ino_ht ;
61+
62+ static const struct rhashtable_params pidfs_ino_ht_params = {
63+ .key_offset = offsetof(struct pid , ino ),
64+ .key_len = sizeof (u64 ),
65+ .head_offset = offsetof(struct pid , pidfs_hash ),
66+ .automatic_shrinking = true,
67+ };
68+
69+ /*
70+ * inode number handling
71+ *
72+ * On 64 bit nothing special happens. The 64 bit number assigned
73+ * to struct pid is the inode number.
74+ *
75+ * On 32 bit the 64 bit number assigned to struct pid is split
76+ * into two 32 bit numbers. The lower 32 bits are used as the
77+ * inode number and the upper 32 bits are used as the inode
78+ * generation number.
79+ *
80+ * On 32 bit pidfs_ino() will return the lower 32 bits. When
81+ * pidfs_ino() returns zero a wraparound happened. When a
82+ * wraparound happens the 64 bit number will be incremented by 1
83+ * so inode numbering starts at 1 again.
84+ *
85+ * On 64 bit comparing two pidfds is as simple as comparing
86+ * inode numbers.
87+ *
88+ * When a wraparound happens on 32 bit, multiple pidfds with the
89+ * same inode number are likely to exist. This isn't a problem
90+ * since, before pidfs, pidfds used the anonymous inode, meaning
91+ * all pidfds had the same inode number. Userspace can
92+ * reconstruct the 64 bit identifier by retrieving both the
93+ * inode number and the inode generation number to compare, or
94+ * it can use file handles.
95+ */
5996
6097#if BITS_PER_LONG == 32
98+
99+ DEFINE_SPINLOCK (pidfs_ino_lock );
100+ static u64 pidfs_ino_nr = 1 ;
101+
61102static inline unsigned long pidfs_ino (u64 ino )
62103{
63104 return lower_32_bits (ino );
@@ -69,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
69110 return upper_32_bits (ino );
70111}
71112
113+ static inline u64 pidfs_alloc_ino (void )
114+ {
115+ u64 ino ;
116+
117+ spin_lock (& pidfs_ino_lock );
118+ if (pidfs_ino (pidfs_ino_nr ) == 0 )
119+ pidfs_ino_nr ++ ;
120+ ino = pidfs_ino_nr ++ ;
121+ spin_unlock (& pidfs_ino_lock );
122+ return ino ;
123+ }
124+
72125#else
73126
74127/* On 64 bit simply return ino. */
@@ -82,69 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
82135{
83136 return 0 ;
84137}
85- #endif
86138
87- static int pidfs_ino_cmp (struct rb_node * a , const struct rb_node * b )
88- {
89- struct pid * pid_a = rb_entry (a , struct pid , pidfs_node );
90- struct pid * pid_b = rb_entry (b , struct pid , pidfs_node );
91- u64 pid_ino_a = pid_a -> ino ;
92- u64 pid_ino_b = pid_b -> ino ;
93-
94- if (pid_ino_a < pid_ino_b )
95- return -1 ;
96- if (pid_ino_a > pid_ino_b )
97- return 1 ;
98- return 0 ;
99- }
139+ DEFINE_COOKIE (pidfs_ino_cookie );
100140
101- void pidfs_add_pid ( struct pid * pid )
141+ static u64 pidfs_alloc_ino ( void )
102142{
103- static u64 pidfs_ino_nr = 2 ;
143+ u64 ino ;
104144
105- /*
106- * On 64 bit nothing special happens. The 64bit number assigned
107- * to struct pid is the inode number.
108- *
109- * On 32 bit the 64 bit number assigned to struct pid is split
110- * into two 32 bit numbers. The lower 32 bits are used as the
111- * inode number and the upper 32 bits are used as the inode
112- * generation number.
113- *
114- * On 32 bit pidfs_ino() will return the lower 32 bit. When
115- * pidfs_ino() returns zero a wrap around happened. When a
116- * wraparound happens the 64 bit number will be incremented by 2
117- * so inode numbering starts at 2 again.
118- *
119- * On 64 bit comparing two pidfds is as simple as comparing
120- * inode numbers.
121- *
122- * When a wraparound happens on 32 bit multiple pidfds with the
123- * same inode number are likely to exist (This isn't a problem
124- * since before pidfs pidfds used the anonymous inode meaning
125- * all pidfds had the same inode number.). Userspace can
126- * reconstruct the 64 bit identifier by retrieving both the
127- * inode number and the inode generation number to compare or
128- * use file handles.
129- */
130- if (pidfs_ino (pidfs_ino_nr ) == 0 )
131- pidfs_ino_nr += 2 ;
145+ preempt_disable ();
146+ ino = gen_cookie_next (& pidfs_ino_cookie );
147+ preempt_enable ();
132148
133- pid -> ino = pidfs_ino_nr ;
149+ VFS_WARN_ON_ONCE (ino < 1 );
150+ return ino ;
151+ }
152+
153+ #endif
154+
155+ void pidfs_prepare_pid (struct pid * pid )
156+ {
134157 pid -> stashed = NULL ;
135158 pid -> attr = NULL ;
136- pidfs_ino_nr ++ ;
159+ pid -> ino = 0 ;
160+ }
137161
138- write_seqcount_begin (& pidmap_lock_seq );
139- rb_find_add_rcu (& pid -> pidfs_node , & pidfs_ino_tree , pidfs_ino_cmp );
140- write_seqcount_end (& pidmap_lock_seq );
162+ int pidfs_add_pid (struct pid * pid )
163+ {
164+ int ret ;
165+
166+ pid -> ino = pidfs_alloc_ino ();
167+ ret = rhashtable_insert_fast (& pidfs_ino_ht , & pid -> pidfs_hash ,
168+ pidfs_ino_ht_params );
169+ if (unlikely (ret ))
170+ pid -> ino = 0 ;
171+ return ret ;
141172}
142173
143174void pidfs_remove_pid (struct pid * pid )
144175{
145- write_seqcount_begin ( & pidmap_lock_seq );
146- rb_erase ( & pid -> pidfs_node , & pidfs_ino_tree );
147- write_seqcount_end ( & pidmap_lock_seq );
176+ if ( likely ( pid -> ino ))
177+ rhashtable_remove_fast ( & pidfs_ino_ht , & pid -> pidfs_hash ,
178+ pidfs_ino_ht_params );
148179}
149180
150181void pidfs_free_pid (struct pid * pid )
@@ -415,7 +446,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
415446 * the fields are set correctly, or return ESRCH to avoid providing
416447 * incomplete information. */
417448
418- kinfo .ppid = task_ppid_nr_ns (task , NULL );
449+ kinfo .ppid = task_ppid_vnr (task );
419450 kinfo .tgid = task_tgid_vnr (task );
420451 kinfo .pid = task_pid_vnr (task );
421452 kinfo .mask |= PIDFD_INFO_PID ;
@@ -791,42 +822,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
791822 return FILEID_KERNFS ;
792823}
793824
794- static int pidfs_ino_find (const void * key , const struct rb_node * node )
795- {
796- const u64 pid_ino = * (u64 * )key ;
797- const struct pid * pid = rb_entry (node , struct pid , pidfs_node );
798-
799- if (pid_ino < pid -> ino )
800- return -1 ;
801- if (pid_ino > pid -> ino )
802- return 1 ;
803- return 0 ;
804- }
805-
806825/* Find a struct pid based on the inode number. */
807826static struct pid * pidfs_ino_get_pid (u64 ino )
808827{
809828 struct pid * pid ;
810- struct rb_node * node ;
811- unsigned int seq ;
829+ struct pidfs_attr * attr ;
812830
813831 guard (rcu )();
814- do {
815- seq = read_seqcount_begin (& pidmap_lock_seq );
816- node = rb_find_rcu (& ino , & pidfs_ino_tree , pidfs_ino_find );
817- if (node )
818- break ;
819- } while (read_seqcount_retry (& pidmap_lock_seq , seq ));
820-
821- if (!node )
832+ pid = rhashtable_lookup (& pidfs_ino_ht , & ino , pidfs_ino_ht_params );
833+ if (!pid )
834+ return NULL ;
835+ attr = READ_ONCE (pid -> attr );
836+ if (IS_ERR_OR_NULL (attr ))
837+ return NULL ;
838+ if (test_bit (PIDFS_ATTR_BIT_EXIT , & attr -> attr_mask ))
822839 return NULL ;
823-
824- pid = rb_entry (node , struct pid , pidfs_node );
825-
826840 /* Within our pid namespace hierarchy? */
827841 if (pid_vnr (pid ) == 0 )
828842 return NULL ;
829-
830843 return get_pid (pid );
831844}
832845
@@ -1104,6 +1117,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
11041117
11051118void __init pidfs_init (void )
11061119{
1120+ if (rhashtable_init (& pidfs_ino_ht , & pidfs_ino_ht_params ))
1121+ panic ("Failed to initialize pidfs hashtable" );
1122+
11071123 pidfs_attr_cachep = kmem_cache_create ("pidfs_attr_cache" , sizeof (struct pidfs_attr ), 0 ,
11081124 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
11091125 SLAB_ACCOUNT | SLAB_PANIC ), NULL );
0 commit comments