@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
325325 INIT_LIST_HEAD (& ctx -> sqd_list );
326326 INIT_LIST_HEAD (& ctx -> cq_overflow_list );
327327 INIT_LIST_HEAD (& ctx -> io_buffers_cache );
328+ INIT_HLIST_HEAD (& ctx -> io_buf_list );
328329 io_alloc_cache_init (& ctx -> rsrc_node_cache , IO_NODE_ALLOC_CACHE_MAX ,
329330 sizeof (struct io_rsrc_node ));
330331 io_alloc_cache_init (& ctx -> apoll_cache , IO_ALLOC_CACHE_MAX ,
@@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
26662667 return READ_ONCE (rings -> cq .head ) == READ_ONCE (rings -> cq .tail ) ? ret : 0 ;
26672668}
26682669
2669- static void io_mem_free (void * ptr )
2670+ void io_mem_free (void * ptr )
26702671{
26712672 if (!ptr )
26722673 return ;
@@ -2697,6 +2698,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
26972698{
26982699 struct page * * page_array ;
26992700 unsigned int nr_pages ;
2701+ void * page_addr ;
27002702 int ret , i ;
27012703
27022704 * npages = 0 ;
@@ -2718,27 +2720,29 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
27182720 io_pages_free (& page_array , ret > 0 ? ret : 0 );
27192721 return ret < 0 ? ERR_PTR (ret ) : ERR_PTR (- EFAULT );
27202722 }
2721- /*
2722- * Should be a single page. If the ring is small enough that we can
2723- * use a normal page, that is fine. If we need multiple pages, then
2724- * userspace should use a huge page. That's the only way to guarantee
2725- * that we get contigious memory, outside of just being lucky or
2726- * (currently) having low memory fragmentation.
2727- */
2728- if (page_array [0 ] != page_array [ret - 1 ])
2729- goto err ;
27302723
2731- /*
2732- * Can't support mapping user allocated ring memory on 32-bit archs
2733- * where it could potentially reside in highmem. Just fail those with
2734- * -EINVAL, just like we did on kernels that didn't support this
2735- * feature.
2736- */
2724+ page_addr = page_address (page_array [0 ]);
27372725 for (i = 0 ; i < nr_pages ; i ++ ) {
2738- if (PageHighMem (page_array [i ])) {
2739- ret = - EINVAL ;
2726+ ret = - EINVAL ;
2727+
2728+ /*
2729+ * Can't support mapping user allocated ring memory on 32-bit
2730+ * archs where it could potentially reside in highmem. Just
2731+ * fail those with -EINVAL, just like we did on kernels that
2732+ * didn't support this feature.
2733+ */
2734+ if (PageHighMem (page_array [i ]))
27402735 goto err ;
2741- }
2736+
2737+ /*
2738+ * No support for discontig pages for now, should either be a
2739+ * single normal page, or a huge page. Later on we can add
2740+ * support for remapping discontig pages, for now we will
2741+ * just fail them with EINVAL.
2742+ */
2743+ if (page_address (page_array [i ]) != page_addr )
2744+ goto err ;
2745+ page_addr += PAGE_SIZE ;
27422746 }
27432747
27442748 * pages = page_array ;
@@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
27752779 }
27762780}
27772781
2778- static void * io_mem_alloc (size_t size )
2782+ void * io_mem_alloc (size_t size )
27792783{
27802784 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP ;
27812785 void * ret ;
@@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
29472951 ctx -> mm_account = NULL ;
29482952 }
29492953 io_rings_free (ctx );
2954+ io_kbuf_mmap_list_free (ctx );
29502955
29512956 percpu_ref_exit (& ctx -> refs );
29522957 free_uid (ctx -> user );
@@ -3475,25 +3480,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
34753480 struct page * page ;
34763481 void * ptr ;
34773482
3478- /* Don't allow mmap if the ring was setup without it */
3479- if (ctx -> flags & IORING_SETUP_NO_MMAP )
3480- return ERR_PTR (- EINVAL );
3481-
34823483 switch (offset & IORING_OFF_MMAP_MASK ) {
34833484 case IORING_OFF_SQ_RING :
34843485 case IORING_OFF_CQ_RING :
3486+ /* Don't allow mmap if the ring was setup without it */
3487+ if (ctx -> flags & IORING_SETUP_NO_MMAP )
3488+ return ERR_PTR (- EINVAL );
34853489 ptr = ctx -> rings ;
34863490 break ;
34873491 case IORING_OFF_SQES :
3492+ /* Don't allow mmap if the ring was setup without it */
3493+ if (ctx -> flags & IORING_SETUP_NO_MMAP )
3494+ return ERR_PTR (- EINVAL );
34883495 ptr = ctx -> sq_sqes ;
34893496 break ;
34903497 case IORING_OFF_PBUF_RING : {
34913498 unsigned int bgid ;
34923499
34933500 bgid = (offset & ~IORING_OFF_MMAP_MASK ) >> IORING_OFF_PBUF_SHIFT ;
3494- mutex_lock ( & ctx -> uring_lock );
3501+ rcu_read_lock ( );
34953502 ptr = io_pbuf_get_address (ctx , bgid );
3496- mutex_unlock ( & ctx -> uring_lock );
3503+ rcu_read_unlock ( );
34973504 if (!ptr )
34983505 return ERR_PTR (- EINVAL );
34993506 break ;
@@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
36453652 size_t , argsz )
36463653{
36473654 struct io_ring_ctx * ctx ;
3648- struct fd f ;
3655+ struct file * file ;
36493656 long ret ;
36503657
36513658 if (unlikely (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
@@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
36633670 if (unlikely (!tctx || fd >= IO_RINGFD_REG_MAX ))
36643671 return - EINVAL ;
36653672 fd = array_index_nospec (fd , IO_RINGFD_REG_MAX );
3666- f .file = tctx -> registered_rings [fd ];
3667- f .flags = 0 ;
3668- if (unlikely (!f .file ))
3673+ file = tctx -> registered_rings [fd ];
3674+ if (unlikely (!file ))
36693675 return - EBADF ;
36703676 } else {
3671- f = fdget (fd );
3672- if (unlikely (!f . file ))
3677+ file = fget (fd );
3678+ if (unlikely (!file ))
36733679 return - EBADF ;
36743680 ret = - EOPNOTSUPP ;
3675- if (unlikely (!io_is_uring_fops (f . file )))
3681+ if (unlikely (!io_is_uring_fops (file )))
36763682 goto out ;
36773683 }
36783684
3679- ctx = f . file -> private_data ;
3685+ ctx = file -> private_data ;
36803686 ret = - EBADFD ;
36813687 if (unlikely (ctx -> flags & IORING_SETUP_R_DISABLED ))
36823688 goto out ;
@@ -3770,7 +3776,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
37703776 }
37713777 }
37723778out :
3773- fdput (f );
3779+ if (!(flags & IORING_ENTER_REGISTERED_RING ))
3780+ fput (file );
37743781 return ret ;
37753782}
37763783
@@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
46114618{
46124619 struct io_ring_ctx * ctx ;
46134620 long ret = - EBADF ;
4614- struct fd f ;
4621+ struct file * file ;
46154622 bool use_registered_ring ;
46164623
46174624 use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING );
@@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
46304637 if (unlikely (!tctx || fd >= IO_RINGFD_REG_MAX ))
46314638 return - EINVAL ;
46324639 fd = array_index_nospec (fd , IO_RINGFD_REG_MAX );
4633- f .file = tctx -> registered_rings [fd ];
4634- f .flags = 0 ;
4635- if (unlikely (!f .file ))
4640+ file = tctx -> registered_rings [fd ];
4641+ if (unlikely (!file ))
46364642 return - EBADF ;
46374643 } else {
4638- f = fdget (fd );
4639- if (unlikely (!f . file ))
4644+ file = fget (fd );
4645+ if (unlikely (!file ))
46404646 return - EBADF ;
46414647 ret = - EOPNOTSUPP ;
4642- if (!io_is_uring_fops (f . file ))
4648+ if (!io_is_uring_fops (file ))
46434649 goto out_fput ;
46444650 }
46454651
4646- ctx = f . file -> private_data ;
4652+ ctx = file -> private_data ;
46474653
46484654 mutex_lock (& ctx -> uring_lock );
46494655 ret = __io_uring_register (ctx , opcode , arg , nr_args );
46504656 mutex_unlock (& ctx -> uring_lock );
46514657 trace_io_uring_register (ctx , opcode , ctx -> nr_user_files , ctx -> nr_user_bufs , ret );
46524658out_fput :
4653- fdput (f );
4659+ if (!use_registered_ring )
4660+ fput (file );
46544661 return ret ;
46554662}
46564663
0 commit comments