3232#include <linux/fs_context.h>
3333#include <linux/shmem_fs.h>
3434#include <linux/mnt_idmapping.h>
35- #include <linux/nospec.h>
3635
3736#include "pnode.h"
3837#include "internal.h"
@@ -79,8 +78,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
7978static DECLARE_RWSEM (namespace_sem );
8079static HLIST_HEAD (unmounted ); /* protected by namespace_sem */
8180static LIST_HEAD (ex_mountpoints ); /* protected by namespace_sem */
/*
 * The lock guarding the global mount-namespace rbtree changes type here
 * (rwlock on the "-" line, seqlock on the "+" line). Writers in this file
 * take it via write_seqlock()/write_sequnlock(); lookup_mnt_ns() does not
 * take it at all and instead samples the sequence count with
 * read_seqbegin()/read_seqretry().
 */
82- static DEFINE_RWLOCK (mnt_ns_tree_lock );
81+ static DEFINE_SEQLOCK (mnt_ns_tree_lock );
82+
8383static struct rb_root mnt_ns_tree = RB_ROOT ; /* protected by mnt_ns_tree_lock */
/*
 * List of the namespaces that are in mnt_ns_tree; mnt_ns_tree_add() links
 * each entry directly after its rbtree predecessor (or after the list head
 * when there is none).
 */
84+ static LIST_HEAD (mnt_ns_list ); /* protected by mnt_ns_tree_lock */
8485
8586struct mount_kattr {
8687 unsigned int attr_set ;
@@ -106,42 +107,60 @@ EXPORT_SYMBOL_GPL(fs_kobj);
106107 */
107108__cacheline_aligned_in_smp DEFINE_SEQLOCK (mount_lock );
108109
/*
 * Removed ("-") variant of mnt_ns_cmp(): three-way comparison of a raw
 * sequence number @seq against @ns->seq.
 *
 * Returns -1 if @seq is smaller, 1 if it is larger and 0 if both are equal.
 */
109- static int mnt_ns_cmp (u64 seq , const struct mnt_namespace * ns )
110- {
111- u64 seq_b = ns -> seq ;
112-
113- if (seq < seq_b )
114- return -1 ;
115- if (seq > seq_b )
116- return 1 ;
117- return 0 ;
118- }
119-
/*
 * Map an rbtree node back to the struct mnt_namespace that embeds it as
 * its mnt_ns_tree_node member.
 *
 * A NULL @node is passed through as NULL so callers can feed in the result
 * of a tree walk without checking it first.
 */
120110static inline struct mnt_namespace * node_to_mnt_ns (const struct rb_node * node )
121111{
122112 if (!node )
123113 return NULL ;
124114 return rb_entry (node , struct mnt_namespace , mnt_ns_tree_node );
125115}
126116
/*
 * Three-way comparison of two mount namespaces by their seq value, taking
 * the rbtree nodes embedded in them. mnt_ns_tree_add() passes this as the
 * comparison callback to rb_find_add_rcu().
 *
 * Returns -1 if @a sorts before @b, 1 if it sorts after and 0 if both have
 * the same seq.
 *
 * The "-" line is the signature of the boolean mnt_ns_less() helper that
 * this function replaces; the unmarked lines are shared by both versions.
 */
127- static bool mnt_ns_less (struct rb_node * a , const struct rb_node * b )
117+ static int mnt_ns_cmp (struct rb_node * a , const struct rb_node * b )
128118{
129119 struct mnt_namespace * ns_a = node_to_mnt_ns (a );
130120 struct mnt_namespace * ns_b = node_to_mnt_ns (b );
131121 u64 seq_a = ns_a -> seq ;
122+ u64 seq_b = ns_b -> seq ;
123+
124+ if (seq_a < seq_b )
125+ return -1 ;
126+ if (seq_a > seq_b )
127+ return 1 ;
128+ return 0 ;
129+ }
130+
/*
 * Take the write side of the mnt_ns_tree_lock seqlock. Used by the tree
 * add/remove paths before they modify mnt_ns_tree and mnt_ns_list.
 */
131+ static inline void mnt_ns_tree_write_lock (void )
132+ {
133+ write_seqlock (& mnt_ns_tree_lock );
134+ }
132135
133- return mnt_ns_cmp (seq_a , ns_b ) < 0 ;
/*
 * (The "-" line above is the removed return statement of mnt_ns_less().)
 *
 * Release the write side of the mnt_ns_tree_lock seqlock; counterpart of
 * mnt_ns_tree_write_lock().
 */
136+ static inline void mnt_ns_tree_write_unlock (void )
137+ {
138+ write_sequnlock (& mnt_ns_tree_lock );
134139}
135140
/*
 * Publish @ns in the global mount-namespace structures: insert it into
 * mnt_ns_tree (ordered by mnt_ns_cmp(), i.e. by seq) and link it into
 * mnt_ns_list, both while holding the write side of mnt_ns_tree_lock.
 *
 * The "-" lines show the previous implementation, which only did an
 * rb_add() under the rwlock write guard.
 */
136141static void mnt_ns_tree_add (struct mnt_namespace * ns )
137142{
138- guard (write_lock )(& mnt_ns_tree_lock );
139- rb_add (& ns -> mnt_ns_tree_node , & mnt_ns_tree , mnt_ns_less );
143+ struct rb_node * node , * prev ;
144+
145+ mnt_ns_tree_write_lock ();
/*
 * NOTE(review): node is presumably non-NULL only when an entry comparing
 * equal (same seq) was already in the tree - confirm against the
 * rb_find_add_rcu() documentation. It is only checked after unlocking.
 */
146+ node = rb_find_add_rcu (& ns -> mnt_ns_tree_node , & mnt_ns_tree , mnt_ns_cmp );
147+ /*
148+ * If there's no previous entry simply add it after the
149+ * head and if there is add it after the previous entry.
150+ */
151+ prev = rb_prev (& ns -> mnt_ns_tree_node );
152+ if (!prev )
153+ list_add_rcu (& ns -> mnt_ns_list , & mnt_ns_list );
154+ else
155+ list_add_rcu (& ns -> mnt_ns_list , & node_to_mnt_ns (prev )-> mnt_ns_list );
156+ mnt_ns_tree_write_unlock ();
157+
/* Warn (once) outside the locked region if the insert reported a node. */
158+ WARN_ON_ONCE (node );
140159}
141160
142161static void mnt_ns_release (struct mnt_namespace * ns )
143162{
144- lockdep_assert_not_held (& mnt_ns_tree_lock );
163+ lockdep_assert_not_held (& mnt_ns_tree_lock . lock );
145164
146165 /* keep alive for {list,stat}mount() */
147166 if (refcount_dec_and_test (& ns -> passive )) {
@@ -151,41 +170,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
151170}
152171DEFINE_FREE (mnt_ns_release , struct mnt_namespace * , if (_T ) mnt_ns_release (_T ))
153172
/*
 * RCU callback queued by mnt_ns_tree_remove() via call_rcu(): recover the
 * mount namespace from its embedded mnt_ns_rcu head and pass it to
 * mnt_ns_release().
 */
173+ static void mnt_ns_release_rcu (struct rcu_head * rcu )
174+ {
175+ mnt_ns_release (container_of (rcu , struct mnt_namespace , mnt_ns_rcu ));
176+ }
177+
/*
 * Take @ns out of the global mount-namespace structures and schedule its
 * release.
 *
 * Namespaces for which is_anon_ns() is true are not unlinked here; all
 * others are erased from mnt_ns_tree and deleted from mnt_ns_list under the
 * write side of mnt_ns_tree_lock. In both cases the release is then
 * deferred through call_rcu() (the "-" line shows the previous direct
 * mnt_ns_release() call).
 */
154178static void mnt_ns_tree_remove (struct mnt_namespace * ns )
155179{
156180 /* remove from global mount namespace list */
157181 if (!is_anon_ns (ns )) {
158- guard ( write_lock )( & mnt_ns_tree_lock );
182+ mnt_ns_tree_write_lock ( );
159183 rb_erase (& ns -> mnt_ns_tree_node , & mnt_ns_tree );
184+ list_bidir_del_rcu (& ns -> mnt_ns_list );
185+ mnt_ns_tree_write_unlock ();
160186 }
161187
162- mnt_ns_release ( ns );
/* mnt_ns_release_rcu() performs the actual mnt_ns_release() later. */
188+ call_rcu ( & ns -> mnt_ns_rcu , mnt_ns_release_rcu );
163189}
164190
/*
 * mnt_ns_find() ("+" lines): key comparison callback that lookup_mnt_ns()
 * hands to rb_find_rcu(). @key points to a u64 mount namespace id, which is
 * compared with the seq of the namespace embedding @node.
 *
 * Returns -1 if the id is smaller than that seq, 1 if it is larger and 0 on
 * an exact match.
 *
 * The "-" lines are the removed mnt_ns_find_id_at(), an open-coded tree
 * descent that this callback replaces.
 *
 * NOTE(review): the (u64 *) cast drops the const qualifier of @key; a
 * (const u64 *) cast would read the same value without doing so.
 */
165- /*
166- * Returns the mount namespace which either has the specified id, or has the
167- * next smallest id after the specified one.
168- */
169- static struct mnt_namespace * mnt_ns_find_id_at (u64 mnt_ns_id )
191+ static int mnt_ns_find (const void * key , const struct rb_node * node )
170192{
171- struct rb_node * node = mnt_ns_tree .rb_node ;
172- struct mnt_namespace * ret = NULL ;
173-
174- lockdep_assert_held (& mnt_ns_tree_lock );
193+ const u64 mnt_ns_id = * (u64 * )key ;
194+ const struct mnt_namespace * ns = node_to_mnt_ns (node );
174195
175- while (node ) {
176- struct mnt_namespace * n = node_to_mnt_ns (node );
177-
178- if (mnt_ns_id <= n -> seq ) {
179- ret = node_to_mnt_ns (node );
180- if (mnt_ns_id == n -> seq )
181- break ;
182- node = node -> rb_left ;
183- } else {
184- node = node -> rb_right ;
185- }
186- }
187- return ret ;
196+ if (mnt_ns_id < ns -> seq )
197+ return -1 ;
198+ if (mnt_ns_id > ns -> seq )
199+ return 1 ;
200+ return 0 ;
188- return ret ;
189201}
190202
191203/*
@@ -195,18 +207,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
195207 * namespace the @namespace_sem must first be acquired. If the namespace has
196208 * already shut down before acquiring @namespace_sem, {list,stat}mount() will
197209 * see that the mount rbtree of the namespace is empty.
210+ *
211+ * Note the lookup is lockless protected by a sequence counter. We only
212+ * need to guard against false negatives as false positives aren't
213+ * possible. So if we didn't find a mount namespace and the sequence
214+ * counter has changed we need to retry. If the sequence counter is
215+ * still the same we know the search actually failed.
198216 */
199217static struct mnt_namespace * lookup_mnt_ns (u64 mnt_ns_id )
200218{
201- struct mnt_namespace * ns ;
219+ struct mnt_namespace * ns ;
220+ struct rb_node * node ;
221+ unsigned int seq ;
202222
203- guard (read_lock )(& mnt_ns_tree_lock );
204- ns = mnt_ns_find_id_at (mnt_ns_id );
205- if (!ns || ns -> seq != mnt_ns_id )
206- return NULL ;
/*
 * New ("+") path: no lock is taken. The RCU guard covers the rest of the
 * function, including the reference acquisition at the end.
 */
223+ guard (rcu )();
224+ do {
225+ seq = read_seqbegin (& mnt_ns_tree_lock );
226+ node = rb_find_rcu (& mnt_ns_id , & mnt_ns_tree , mnt_ns_find );
/*
 * A hit is used as-is; only a miss is re-validated against the sequence
 * count and retried if the count changed during the search.
 */
227+ if (node )
228+ break ;
229+ } while (read_seqretry (& mnt_ns_tree_lock , seq ));
207230
208- refcount_inc (& ns -> passive );
209- return ns ;
/* Still NULL here means the search missed with an unchanged sequence count. */
231+ if (!node )
232+ return NULL ;
233+
234+ /*
235+ * The last reference count is put with RCU delay so we can
236+ * unconditionally acquire a reference here.
237+ */
238+ ns = node_to_mnt_ns (node );
239+ refcount_inc (& ns -> passive );
240+ return ns ;
210241}
211242
212243static inline void lock_mount_hash (void )
@@ -2063,30 +2094,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
20632094 return & mnt -> ns ;
20642095}
20652096
2066- struct mnt_namespace * __lookup_next_mnt_ns (struct mnt_namespace * mntns , bool previous )
2097+ struct mnt_namespace * get_sequential_mnt_ns (struct mnt_namespace * mntns , bool previous )
20672098{
2068- guard (read_lock )(& mnt_ns_tree_lock );
2099+ guard (rcu )();
2100+
20692101 for (;;) {
2070- struct rb_node * node ;
2102+ struct list_head * list ;
20712103
20722104 if (previous )
2073- node = rb_prev ( & mntns -> mnt_ns_tree_node );
2105+ list = rcu_dereference ( list_bidir_prev_rcu ( & mntns -> mnt_ns_list ) );
20742106 else
2075- node = rb_next ( & mntns -> mnt_ns_tree_node );
2076- if (! node )
2107+ list = rcu_dereference ( list_next_rcu ( & mntns -> mnt_ns_list ) );
2108+ if (list_is_head ( list , & mnt_ns_list ) )
20772109 return ERR_PTR (- ENOENT );
20782110
2079- mntns = node_to_mnt_ns (node );
2080- node = & mntns -> mnt_ns_tree_node ;
2111+ mntns = list_entry_rcu (list , struct mnt_namespace , mnt_ns_list );
20812112
2113+ /*
2114+ * The last passive reference count is put with RCU
2115+ * delay so accessing the mount namespace is not just
2116+ * safe but all relevant members are still valid.
2117+ */
20822118 if (!ns_capable_noaudit (mntns -> user_ns , CAP_SYS_ADMIN ))
20832119 continue ;
20842120
20852121 /*
2086- * Holding mnt_ns_tree_lock prevents the mount namespace from
2087- * being freed but it may well be on it's deathbed. We want an
2088- * active reference, not just a passive one here as we're
2089- * persisting the mount namespace.
2122+ * We need an active reference count as we're persisting
2123+ * the mount namespace and it might already be on its
2124+ * deathbed.
20902125 */
20912126 if (!refcount_inc_not_zero (& mntns -> ns .count ))
20922127 continue ;
@@ -3903,6 +3938,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
39033938 refcount_set (& new_ns -> ns .count , 1 );
39043939 refcount_set (& new_ns -> passive , 1 );
39053940 new_ns -> mounts = RB_ROOT ;
3941+ INIT_LIST_HEAD (& new_ns -> mnt_ns_list );
39063942 RB_CLEAR_NODE (& new_ns -> mnt_ns_tree_node );
39073943 init_waitqueue_head (& new_ns -> poll );
39083944 new_ns -> user_ns = get_user_ns (user_ns );
@@ -3982,14 +4018,14 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
39824018 while (p -> mnt .mnt_root != q -> mnt .mnt_root )
39834019 p = next_mnt (skip_mnt_tree (p ), old );
39844020 }
3985- mnt_ns_tree_add (new_ns );
39864021 namespace_unlock ();
39874022
39884023 if (rootmnt )
39894024 mntput (rootmnt );
39904025 if (pwdmnt )
39914026 mntput (pwdmnt );
39924027
4028+ mnt_ns_tree_add (new_ns );
39934029 return new_ns ;
39944030}
39954031
0 commit comments