 
 #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
 
-#define BGID_ARRAY	64
-
 /* BIDs are addressed by a 16-bit field in a CQE */
 #define MAX_BIDS_PER_BGID (1 << 16)
 
@@ -40,13 +38,9 @@ struct io_buf_free {
         int                             inuse;
 };
 
-static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
-                                                   struct io_buffer_list *bl,
-                                                   unsigned int bgid)
+static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+                                                          unsigned int bgid)
 {
-        if (bl && bgid < BGID_ARRAY)
-                return &bl[bgid];
-
         return xa_load(&ctx->io_bl_xa, bgid);
 }
 
@@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 {
         lockdep_assert_held(&ctx->uring_lock);
 
-        return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
+        return __io_buffer_get_list(ctx, bgid);
 }
 
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -67,11 +61,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
          * always under the ->uring_lock, but the RCU lookup from mmap does.
          */
         bl->bgid = bgid;
-        smp_store_release(&bl->is_ready, 1);
-
-        if (bgid < BGID_ARRAY)
-                return 0;
-
+        atomic_set(&bl->refs, 1);
         return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
 }
 
@@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
         return ret;
 }
 
-static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
-{
-        struct io_buffer_list *bl;
-        int i;
-
-        bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
-        if (!bl)
-                return -ENOMEM;
-
-        for (i = 0; i < BGID_ARRAY; i++) {
-                INIT_LIST_HEAD(&bl[i].buf_list);
-                bl[i].bgid = i;
-        }
-
-        smp_store_release(&ctx->io_bl, bl);
-        return 0;
-}
-
 /*
  * Mark the given mapped range as free for reuse
  */
@@ -294,24 +266,24 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
         return i;
 }
 
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+        if (atomic_dec_and_test(&bl->refs)) {
+                __io_remove_buffers(ctx, bl, -1U);
+                kfree_rcu(bl, rcu);
+        }
+}
+
 void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
         struct io_buffer_list *bl;
         struct list_head *item, *tmp;
         struct io_buffer *buf;
         unsigned long index;
-        int i;
-
-        for (i = 0; i < BGID_ARRAY; i++) {
-                if (!ctx->io_bl)
-                        break;
-                __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
-        }
 
         xa_for_each(&ctx->io_bl_xa, index, bl) {
                 xa_erase(&ctx->io_bl_xa, bl->bgid);
-                __io_remove_buffers(ctx, bl, -1U);
-                kfree_rcu(bl, rcu);
+                io_put_bl(ctx, bl);
         }
 
         /*
@@ -489,12 +461,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
         io_ring_submit_lock(ctx, issue_flags);
 
-        if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
-                ret = io_init_bl_list(ctx);
-                if (ret)
-                        goto err;
-        }
-
         bl = io_buffer_get_list(ctx, p->bgid);
         if (unlikely(!bl)) {
                 bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
@@ -507,14 +473,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
                 if (ret) {
                         /*
                          * Doesn't need rcu free as it was never visible, but
-                         * let's keep it consistent throughout. Also can't
-                         * be a lower indexed array group, as adding one
-                         * where lookup failed cannot happen.
+                         * let's keep it consistent throughout.
                          */
-                        if (p->bgid >= BGID_ARRAY)
-                                kfree_rcu(bl, rcu);
-                        else
-                                WARN_ON_ONCE(1);
+                        kfree_rcu(bl, rcu);
                         goto err;
                 }
         }
@@ -679,12 +640,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
         if (reg.ring_entries >= 65536)
                 return -EINVAL;
 
-        if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
-                int ret = io_init_bl_list(ctx);
-                if (ret)
-                        return ret;
-        }
-
         bl = io_buffer_get_list(ctx, reg.bgid);
         if (bl) {
                 /* if mapped buffer ring OR classic exists, don't allow */
@@ -733,11 +688,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
         if (!bl->is_buf_ring)
                 return -EINVAL;
 
-        __io_remove_buffers(ctx, bl, -1U);
-        if (bl->bgid >= BGID_ARRAY) {
-                xa_erase(&ctx->io_bl_xa, bl->bgid);
-                kfree_rcu(bl, rcu);
-        }
+        xa_erase(&ctx->io_bl_xa, bl->bgid);
+        io_put_bl(ctx, bl);
         return 0;
 }
 
@@ -767,23 +719,35 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
         return 0;
 }
 
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+                                      unsigned long bgid)
 {
         struct io_buffer_list *bl;
+        bool ret;
 
-        bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
-
-        if (!bl || !bl->is_mmap)
-                return NULL;
         /*
-         * Ensure the list is fully setup. Only strictly needed for RCU lookup
-         * via mmap, and in that case only for the array indexed groups. For
-         * the xarray lookups, it's either visible and ready, or not at all.
+         * We have to be a bit careful here - we're inside mmap and cannot grab
+         * the uring_lock. This means the buffer_list could be simultaneously
+         * going away, if someone is trying to be sneaky. Look it up under rcu
+         * so we know it's not going away, and attempt to grab a reference to
+         * it. If the ref is already zero, then fail the mapping. If successful,
+         * the caller will call io_put_bl() to drop the reference at the
+         * end. This may then safely free the buffer_list (and drop the pages)
+         * at that point; vm_insert_pages() would've already grabbed the
+         * necessary vma references.
          */
-        if (!smp_load_acquire(&bl->is_ready))
-                return NULL;
-
-        return bl->buf_ring;
+        rcu_read_lock();
+        bl = xa_load(&ctx->io_bl_xa, bgid);
+        /* must be a mmap'able buffer ring and have pages */
+        ret = false;
+        if (bl && bl->is_mmap)
+                ret = atomic_inc_not_zero(&bl->refs);
+        rcu_read_unlock();
+
+        if (ret)
+                return bl;
+
+        return ERR_PTR(-EINVAL);
 }
 
 /*
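
The new io_pbuf_get_bl() pairs an RCU-protected xa_load() with atomic_inc_not_zero(): the mmap path only gets hold of the buffer_list if its reference count is still non-zero, and the matching io_put_bl() lets the final reference tear the list down. Below is a minimal, standalone userspace sketch of that same get/put discipline using C11 atomics rather than the kernel's atomic_t and RCU helpers; the buf_list struct and the bl_get()/bl_put() names are invented for illustration and are not part of the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct buf_list {
        atomic_int refs;
        unsigned int bgid;
};

/* analogue of atomic_inc_not_zero(): only bump a count that is still non-zero */
static bool bl_get(struct buf_list *bl)
{
        int old = atomic_load(&bl->refs);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&bl->refs, &old, old + 1))
                        return true;    /* reference acquired */
        }
        return false;                   /* already being torn down, fail the lookup */
}

/* analogue of atomic_dec_and_test(): the last reference frees the object */
static void bl_put(struct buf_list *bl)
{
        if (atomic_fetch_sub(&bl->refs, 1) == 1)
                free(bl);               /* kernel: __io_remove_buffers() + kfree_rcu() */
}

int main(void)
{
        struct buf_list *bl = malloc(sizeof(*bl));

        if (!bl)
                return 1;
        atomic_init(&bl->refs, 1);      /* reference owned by the registration */
        bl->bgid = 0;

        if (bl_get(bl))                 /* e.g. the mmap path taking its reference */
                bl_put(bl);             /* mmap path done with it */

        bl_put(bl);                     /* unregister drops the last reference */
        return 0;
}

In the kernel version the lookup additionally sits inside rcu_read_lock()/rcu_read_unlock(), so the pointer returned by xa_load() cannot be freed out from under the atomic_inc_not_zero() attempt.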