Skip to content

Commit ffa059b

Browse files
committed
Merge tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: "This contains the core io_uring updates, of which there are not many, and adds support for using WAITID through io_uring and hence not needing to block on these kinds of events. Outside of that, tweaks to the legacy provided buffer handling and some cleanups related to cancelations for uring_cmd support" * tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux: io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups io_uring/kbuf: Use slab for struct io_buffer objects io_uring/kbuf: Allow the full buffer id space for provided buffers io_uring/kbuf: Fix check of BID wrapping in provided buffers io_uring/rsrc: cleanup io_pin_pages() io_uring: cancelable uring_cmd io_uring: retain top 8bits of uring_cmd flags for kernel internal use io_uring: add IORING_OP_WAITID support exit: add internal include file with helpers exit: add kernel_waitid_prepare() helper exit: move core of do_wait() into helper exit: abstract out should_wake helper for child_wait_callback() io_uring/rw: add support for IORING_OP_READ_MULTISHOT io_uring/rw: mark readv/writev as vectored in the opcode definition io_uring/rw: split io_read() into a helper
2 parents ca995ce + 6ce4a93 commit ffa059b

19 files changed

Lines changed: 782 additions & 120 deletions

File tree

include/linux/io_uring.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,15 @@ enum io_uring_cmd_flags {
2020
IO_URING_F_SQE128 = (1 << 8),
2121
IO_URING_F_CQE32 = (1 << 9),
2222
IO_URING_F_IOPOLL = (1 << 10),
23+
24+
/* set when uring wants to cancel a previously issued command */
25+
IO_URING_F_CANCEL = (1 << 11),
2326
};
2427

28+
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
29+
#define IORING_URING_CMD_CANCELABLE (1U << 30)
30+
#define IORING_URING_CMD_POLLED (1U << 31)
31+
2532
struct io_uring_cmd {
2633
struct file *file;
2734
const struct io_uring_sqe *sqe;
@@ -82,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk)
8289
__io_uring_free(tsk);
8390
}
8491
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
92+
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
93+
unsigned int issue_flags);
94+
struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
8595
#else
8696
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
8797
struct iov_iter *iter, void *ioucmd)
@@ -122,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
122132
{
123133
return -EOPNOTSUPP;
124134
}
135+
static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
136+
unsigned int issue_flags)
137+
{
138+
}
139+
static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
140+
{
141+
return NULL;
142+
}
125143
#endif
126144

127145
#endif

include/linux/io_uring_types.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,12 @@ struct io_ring_ctx {
265265
*/
266266
struct io_wq_work_list iopoll_list;
267267
bool poll_multi_queue;
268+
269+
/*
270+
* Any cancelable uring_cmd is added to this list in
271+
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
272+
*/
273+
struct hlist_head cancelable_uring_cmd;
268274
} ____cacheline_aligned_in_smp;
269275

270276
struct {
@@ -313,6 +319,8 @@ struct io_ring_ctx {
313319
struct list_head cq_overflow_list;
314320
struct io_hash_table cancel_table;
315321

322+
struct hlist_head waitid_list;
323+
316324
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
317325
struct io_sq_data *sq_data; /* if using sq thread polling */
318326

@@ -342,8 +350,6 @@ struct io_ring_ctx {
342350
struct wait_queue_head rsrc_quiesce_wq;
343351
unsigned rsrc_quiesce;
344352

345-
struct list_head io_buffers_pages;
346-
347353
#if defined(CONFIG_UNIX)
348354
struct socket *ring_sock;
349355
#endif

include/uapi/linux/io_uring.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ struct io_uring_sqe {
6565
__u32 xattr_flags;
6666
__u32 msg_ring_flags;
6767
__u32 uring_cmd_flags;
68+
__u32 waitid_flags;
6869
};
6970
__u64 user_data; /* data to be passed back at completion time */
7071
/* pack this to avoid bogus arm OABI complaints */
@@ -240,19 +241,20 @@ enum io_uring_op {
240241
IORING_OP_URING_CMD,
241242
IORING_OP_SEND_ZC,
242243
IORING_OP_SENDMSG_ZC,
244+
IORING_OP_READ_MULTISHOT,
245+
IORING_OP_WAITID,
243246

244247
/* this goes last, obviously */
245248
IORING_OP_LAST,
246249
};
247250

248251
/*
249-
* sqe->uring_cmd_flags
252+
* sqe->uring_cmd_flags top 8bits aren't available for userspace
250253
* IORING_URING_CMD_FIXED use registered buffer; pass this flag
251254
* along with setting sqe->buf_index.
252-
* IORING_URING_CMD_POLLED driver use only
253255
*/
254256
#define IORING_URING_CMD_FIXED (1U << 0)
255-
#define IORING_URING_CMD_POLLED (1U << 31)
257+
#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED
256258

257259

258260
/*

io_uring/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
77
openclose.o uring_cmd.o epoll.o \
88
statx.o net.o msg_ring.o timeout.o \
99
sqpoll.o fdinfo.o tctx.o poll.o \
10-
cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
10+
cancel.o kbuf.o rsrc.o rw.o opdef.o \
11+
notif.o waitid.o
1112
obj-$(CONFIG_IO_WQ) += io-wq.o

io_uring/cancel.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "tctx.h"
1616
#include "poll.h"
1717
#include "timeout.h"
18+
#include "waitid.h"
1819
#include "cancel.h"
1920

2021
struct io_cancel {
@@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
119120
if (ret != -ENOENT)
120121
return ret;
121122

123+
ret = io_waitid_cancel(ctx, cd, issue_flags);
124+
if (ret != -ENOENT)
125+
return ret;
126+
122127
spin_lock(&ctx->completion_lock);
123128
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
124129
ret = io_timeout_cancel(ctx, cd);

io_uring/io_uring.c

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@
9292
#include "cancel.h"
9393
#include "net.h"
9494
#include "notif.h"
95+
#include "waitid.h"
9596

9697
#include "timeout.h"
9798
#include "poll.h"
@@ -338,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
338339
spin_lock_init(&ctx->completion_lock);
339340
spin_lock_init(&ctx->timeout_lock);
340341
INIT_WQ_LIST(&ctx->iopoll_list);
341-
INIT_LIST_HEAD(&ctx->io_buffers_pages);
342342
INIT_LIST_HEAD(&ctx->io_buffers_comp);
343343
INIT_LIST_HEAD(&ctx->defer_list);
344344
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -348,8 +348,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
348348
INIT_LIST_HEAD(&ctx->tctx_list);
349349
ctx->submit_state.free_list.next = NULL;
350350
INIT_WQ_LIST(&ctx->locked_free_list);
351+
INIT_HLIST_HEAD(&ctx->waitid_list);
351352
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
352353
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
354+
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
353355
return ctx;
354356
err:
355357
kfree(ctx->cancel_table.hbs);
@@ -3276,6 +3278,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
32763278
return ret;
32773279
}
32783280

static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
					  struct task_struct *task, bool cancel_all)
{
	struct hlist_node *tmp;
	struct io_kiocb *req;
	bool ret = false;

	lockdep_assert_held(&ctx->uring_lock);

	hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
				  hash_node) {
		struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
				struct io_uring_cmd);
		struct file *file = req->file;

		if (!cancel_all && req->task != task)
			continue;

		if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
			/* ->sqe isn't available if no async data */
			if (!req_has_async_data(req))
				cmd->sqe = NULL;
			file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
			ret = true;
		}
	}
	io_submit_flush_completions(ctx);

	return ret;
}
3311+
32793312
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
32803313
struct task_struct *task,
32813314
bool cancel_all)
@@ -3323,6 +3356,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
33233356
ret |= io_cancel_defer_files(ctx, task, cancel_all);
33243357
mutex_lock(&ctx->uring_lock);
33253358
ret |= io_poll_remove_all(ctx, task, cancel_all);
3359+
ret |= io_waitid_remove_all(ctx, task, cancel_all);
3360+
ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
33263361
mutex_unlock(&ctx->uring_lock);
33273362
ret |= io_kill_timeouts(ctx, task, cancel_all);
33283363
if (task)
@@ -4686,6 +4721,9 @@ static int __init io_uring_init(void)
46864721

46874722
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
46884723

4724+
/* top 8bits are for internal use */
4725+
BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
4726+
46894727
io_uring_optable_init();
46904728

46914729
/*
@@ -4701,6 +4739,9 @@ static int __init io_uring_init(void)
47014739
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
47024740
offsetof(struct io_kiocb, cmd.data),
47034741
sizeof_field(struct io_kiocb, cmd.data), NULL);
4742+
io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
4743+
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
4744+
NULL);
47044745

47054746
#ifdef CONFIG_SYSCTL
47064747
register_sysctl_init("kernel", kernel_io_uring_disabled_table);

io_uring/io_uring.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
343343
}
344344

345345
extern struct kmem_cache *req_cachep;
346+
extern struct kmem_cache *io_buf_cachep;
346347

347348
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
348349
{

io_uring/kbuf.c

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,17 @@
1919

2020
#define BGID_ARRAY 64
2121

22+
/* BIDs are addressed by a 16-bit field in a CQE */
23+
#define MAX_BIDS_PER_BGID (1 << 16)
24+
25+
struct kmem_cache *io_buf_cachep;
26+
2227
struct io_provide_buf {
2328
struct file *file;
2429
__u64 addr;
2530
__u32 len;
2631
__u32 bgid;
27-
__u16 nbufs;
32+
__u32 nbufs;
2833
__u16 bid;
2934
};
3035

@@ -255,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
255260
void io_destroy_buffers(struct io_ring_ctx *ctx)
256261
{
257262
struct io_buffer_list *bl;
263+
struct list_head *item, *tmp;
264+
struct io_buffer *buf;
258265
unsigned long index;
259266
int i;
260267

@@ -270,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
270277
kfree(bl);
271278
}
272279

273-
while (!list_empty(&ctx->io_buffers_pages)) {
274-
struct page *page;
275-
276-
page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
277-
list_del_init(&page->lru);
278-
__free_page(page);
280+
list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
281+
buf = list_entry(item, struct io_buffer, list);
282+
kmem_cache_free(io_buf_cachep, buf);
279283
}
280284
}
281285

@@ -289,7 +293,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
289293
return -EINVAL;
290294

291295
tmp = READ_ONCE(sqe->fd);
292-
if (!tmp || tmp > USHRT_MAX)
296+
if (!tmp || tmp > MAX_BIDS_PER_BGID)
293297
return -EINVAL;
294298

295299
memset(p, 0, sizeof(*p));
@@ -332,7 +336,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
332336
return -EINVAL;
333337

334338
tmp = READ_ONCE(sqe->fd);
335-
if (!tmp || tmp > USHRT_MAX)
339+
if (!tmp || tmp > MAX_BIDS_PER_BGID)
336340
return -E2BIG;
337341
p->nbufs = tmp;
338342
p->addr = READ_ONCE(sqe->addr);
@@ -352,17 +356,18 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
352356
tmp = READ_ONCE(sqe->off);
353357
if (tmp > USHRT_MAX)
354358
return -E2BIG;
355-
if (tmp + p->nbufs >= USHRT_MAX)
359+
if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
356360
return -EINVAL;
357361
p->bid = tmp;
358362
return 0;
359363
}
360364

365+
#define IO_BUFFER_ALLOC_BATCH 64
366+
361367
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
362368
{
363-
struct io_buffer *buf;
364-
struct page *page;
365-
int bufs_in_page;
369+
struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
370+
int allocated;
366371

367372
/*
368373
* Completions that don't happen inline (eg not under uring_lock) will
@@ -382,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
382387

383388
/*
384389
* No free buffers and no completion entries either. Allocate a new
385-
* page worth of buffer entries and add those to our freelist.
390+
* batch of buffer entries and add those to our freelist.
386391
*/
387-
page = alloc_page(GFP_KERNEL_ACCOUNT);
388-
if (!page)
389-
return -ENOMEM;
390-
391-
list_add(&page->lru, &ctx->io_buffers_pages);
392392

393-
buf = page_address(page);
394-
bufs_in_page = PAGE_SIZE / sizeof(*buf);
395-
while (bufs_in_page) {
396-
list_add_tail(&buf->list, &ctx->io_buffers_cache);
397-
buf++;
398-
bufs_in_page--;
393+
allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
394+
ARRAY_SIZE(bufs), (void **) bufs);
395+
if (unlikely(!allocated)) {
396+
/*
397+
* Bulk alloc is all-or-nothing. If we fail to get a batch,
398+
* retry single alloc to be on the safe side.
399+
*/
400+
bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
401+
if (!bufs[0])
402+
return -ENOMEM;
403+
allocated = 1;
399404
}
400405

406+
while (allocated)
407+
list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
408+
401409
return 0;
402410
}
403411

0 commit comments

Comments
 (0)