Skip to content

Commit c3018a2

Browse files
committed
Merge tag 'for-6.17/io_uring-20250728' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

- Optimization to avoid reference counts on non-cloned registered buffers. This is how these buffers were handled prior to having cloning support, and we can still use that approach as long as the buffers haven't been cloned to another ring.

- Cleanup and improvement for uring_cmd, where btrfs was the only user of storing allocated data for the lifetime of the uring_cmd. Clean that up so we can get rid of the need to do that.

- Avoid unnecessary memory copies in uring_cmd usage. This is particularly important as a lot of uring_cmd usage necessitates the use of 128b SQEs.

- A few updates for recv multishot, where it's now possible to add fairness limits for limiting how much is transferred for each retry loop. Additionally, recv multishot now supports an overall cap as well, where once reached the multishot recv will terminate. The latter is useful for buffer management and juggling many recv streams at the same time.

- Add support for returning the TX timestamps via a new socket command. This feature can work in either singleshot or multishot mode, where the latter triggers a completion whenever new timestamps are available. This is an alternative to using the existing error queue.

- Add support for an io_uring "mock" file, which is the start of being able to do 100% targeted testing in terms of exercising io_uring request handling. The idea is to have a file type that can be anything the tester would like, and behave exactly how you want it to behave in terms of hitting the code paths you want.

- Improve zcrx by using sgtables to de-duplicate and improve dma address handling.

- Prep work for supporting larger pages for zcrx.

- Various little improvements and fixes.
* tag 'for-6.17/io_uring-20250728' of git://git.kernel.dk/linux: (42 commits)
  io_uring/zcrx: fix leaking pages on sg init fail
  io_uring/zcrx: don't leak pages on account failure
  io_uring/zcrx: fix null ifq on area destruction
  io_uring: fix breakage in EXPERT menu
  io_uring/cmd: remove struct io_uring_cmd_data
  btrfs/ioctl: store btrfs_uring_encoded_data in io_btrfs_cmd
  io_uring/cmd: introduce IORING_URING_CMD_REISSUE flag
  io_uring/zcrx: account area memory
  io_uring: export io_[un]account_mem
  io_uring/net: Support multishot receive len cap
  io_uring: deduplicate wakeup handling
  io_uring/net: cast min_not_zero() type
  io_uring/poll: cleanup apoll freeing
  io_uring/net: allow multishot receive per-invocation cap
  io_uring/net: move io_sr_msg->retry_flags to io_sr_msg->flags
  io_uring/net: use passed in 'len' in io_recv_buf_select()
  io_uring/zcrx: prepare fallback for larger pages
  io_uring/zcrx: assert area type in io_zcrx_iov_page
  io_uring/zcrx: allocate sgtable for umem areas
  io_uring/zcrx: introduce io_populate_area_dma
  ...
2 parents e5cf61f + d9f595b commit c3018a2

27 files changed

Lines changed: 1029 additions & 238 deletions

File tree

MAINTAINERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12687,6 +12687,7 @@ F: include/linux/io_uring.h
1268712687
F: include/linux/io_uring_types.h
1268812688
F: include/trace/events/io_uring.h
1268912689
F: include/uapi/linux/io_uring.h
12690+
F: include/uapi/linux/io_uring/
1269012691
F: io_uring/
1269112692

1269212693
IPMI SUBSYSTEM

fs/btrfs/ioctl.c

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4607,6 +4607,13 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
46074607
return ret;
46084608
}
46094609

4610+
struct btrfs_uring_encoded_data {
4611+
struct btrfs_ioctl_encoded_io_args args;
4612+
struct iovec iovstack[UIO_FASTIOV];
4613+
struct iovec *iov;
4614+
struct iov_iter iter;
4615+
};
4616+
46104617
/*
46114618
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
46124619
* contains the fields in btrfs_uring_read_extent that are necessary to finish
@@ -4628,6 +4635,7 @@ struct btrfs_uring_priv {
46284635
};
46294636

46304637
struct io_btrfs_cmd {
4638+
struct btrfs_uring_encoded_data *data;
46314639
struct btrfs_uring_priv *priv;
46324640
};
46334641

@@ -4686,6 +4694,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
46864694
kfree(priv->pages);
46874695
kfree(priv->iov);
46884696
kfree(priv);
4697+
kfree(bc->data);
46894698
}
46904699

46914700
void btrfs_uring_read_extent_endio(void *ctx, int err)
@@ -4769,13 +4778,6 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
47694778
return ret;
47704779
}
47714780

4772-
struct btrfs_uring_encoded_data {
4773-
struct btrfs_ioctl_encoded_io_args args;
4774-
struct iovec iovstack[UIO_FASTIOV];
4775-
struct iovec *iov;
4776-
struct iov_iter iter;
4777-
};
4778-
47794781
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
47804782
{
47814783
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
@@ -4791,7 +4793,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
47914793
struct extent_state *cached_state = NULL;
47924794
u64 start, lockend;
47934795
void __user *sqe_addr;
4794-
struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
4796+
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
4797+
struct btrfs_uring_encoded_data *data = NULL;
4798+
4799+
if (cmd->flags & IORING_URING_CMD_REISSUE)
4800+
data = bc->data;
47954801

47964802
if (!capable(CAP_SYS_ADMIN)) {
47974803
ret = -EPERM;
@@ -4821,7 +4827,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
48214827
goto out_acct;
48224828
}
48234829

4824-
io_uring_cmd_get_async_data(cmd)->op_data = data;
4830+
bc->data = data;
48254831

48264832
if (issue_flags & IO_URING_F_COMPAT) {
48274833
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -4919,6 +4925,9 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
49194925
add_rchar(current, ret);
49204926
inc_syscr(current);
49214927

4928+
if (ret != -EIOCBQUEUED && ret != -EAGAIN)
4929+
kfree(data);
4930+
49224931
return ret;
49234932
}
49244933

@@ -4929,7 +4938,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
49294938
struct file *file;
49304939
ssize_t ret;
49314940
void __user *sqe_addr;
4932-
struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
4941+
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
4942+
struct btrfs_uring_encoded_data *data = NULL;
4943+
4944+
if (cmd->flags & IORING_URING_CMD_REISSUE)
4945+
data = bc->data;
49334946

49344947
if (!capable(CAP_SYS_ADMIN)) {
49354948
ret = -EPERM;
@@ -4951,7 +4964,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
49514964
goto out_acct;
49524965
}
49534966

4954-
io_uring_cmd_get_async_data(cmd)->op_data = data;
4967+
bc->data = data;
49554968

49564969
if (issue_flags & IO_URING_F_COMPAT) {
49574970
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -5041,6 +5054,9 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
50415054
if (ret > 0)
50425055
add_wchar(current, ret);
50435056
inc_syscw(current);
5057+
5058+
if (ret != -EAGAIN)
5059+
kfree(data);
50445060
return ret;
50455061
}
50465062

include/linux/io_uring/cmd.h

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
1010
#define IORING_URING_CMD_CANCELABLE (1U << 30)
11+
/* io_uring_cmd is being issued again */
12+
#define IORING_URING_CMD_REISSUE (1U << 31)
1113

1214
struct io_uring_cmd {
1315
struct file *file;
@@ -19,10 +21,6 @@ struct io_uring_cmd {
1921
u8 pdu[32]; /* available inline for free use */
2022
};
2123

22-
struct io_uring_cmd_data {
23-
void *op_data;
24-
};
25-
2624
static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
2725
{
2826
return sqe->cmd;
@@ -135,11 +133,6 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd
135133
return cmd_to_io_kiocb(cmd)->tctx->task;
136134
}
137135

138-
static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd)
139-
{
140-
return cmd_to_io_kiocb(cmd)->async_data;
141-
}
142-
143136
/*
144137
* Return uring_cmd's context reference as its context handle for driver to
145138
* track per-context resource, such as registered kernel IO buffer

include/linux/io_uring_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ enum io_uring_cmd_flags {
2626
IO_URING_F_MULTISHOT = 4,
2727
/* executed by io-wq */
2828
IO_URING_F_IOWQ = 8,
29+
/* executed inline from syscall */
30+
IO_URING_F_INLINE = 16,
2931
/* int's last bit, sign checks are usually faster than a bit test */
3032
IO_URING_F_NONBLOCK = INT_MIN,
3133

@@ -502,6 +504,7 @@ enum {
502504
REQ_F_BUF_NODE_BIT,
503505
REQ_F_HAS_METADATA_BIT,
504506
REQ_F_IMPORT_BUFFER_BIT,
507+
REQ_F_SQE_COPIED_BIT,
505508

506509
/* not a real bit, just to check we're not overflowing the space */
507510
__REQ_F_LAST_BIT,
@@ -591,6 +594,8 @@ enum {
591594
* For SEND_ZC, whether to import buffers (i.e. the first issue).
592595
*/
593596
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
597+
/* ->sqe_copy() has been called, if necessary */
598+
REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
594599
};
595600

596601
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);

include/net/sock.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2677,6 +2677,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
26772677
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
26782678
struct sk_buff *skb);
26792679

2680+
bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk);
2681+
int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
2682+
struct timespec64 *ts);
2683+
26802684
static inline void
26812685
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
26822686
{

include/uapi/linux/io_uring.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ struct io_uring_sqe {
5050
};
5151
__u32 len; /* buffer size or number of iovecs */
5252
union {
53-
__kernel_rwf_t rw_flags;
53+
__u32 rw_flags;
5454
__u32 fsync_flags;
5555
__u16 poll_events; /* compatibility */
5656
__u32 poll32_events; /* word-reversed for BE */
@@ -449,6 +449,7 @@ enum io_uring_msg_ring_flags {
449449
#define IORING_NOP_FILE (1U << 1)
450450
#define IORING_NOP_FIXED_FILE (1U << 2)
451451
#define IORING_NOP_FIXED_BUFFER (1U << 3)
452+
#define IORING_NOP_TW (1U << 4)
452453

453454
/*
454455
* IO completion data structure (Completion Queue Entry)
@@ -968,6 +969,22 @@ enum io_uring_socket_op {
968969
SOCKET_URING_OP_SIOCOUTQ,
969970
SOCKET_URING_OP_GETSOCKOPT,
970971
SOCKET_URING_OP_SETSOCKOPT,
972+
SOCKET_URING_OP_TX_TIMESTAMP,
973+
};
974+
975+
/*
976+
* SOCKET_URING_OP_TX_TIMESTAMP definitions
977+
*/
978+
979+
#define IORING_TIMESTAMP_HW_SHIFT 16
980+
/* The cqe->flags bit from which the timestamp type is stored */
981+
#define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1)
982+
/* The cqe->flags flag signifying whether it's a hardware timestamp */
983+
#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT)
984+
985+
struct io_timespec {
986+
__u64 tv_sec;
987+
__u64 tv_nsec;
971988
};
972989

973990
/* Zero copy receive refill queue entry */

include/uapi/linux/io_uring/mock_file.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#ifndef LINUX_IO_URING_MOCK_FILE_H
2+
#define LINUX_IO_URING_MOCK_FILE_H
3+
4+
#include <linux/types.h>
5+
6+
enum {
7+
IORING_MOCK_FEAT_CMD_COPY,
8+
IORING_MOCK_FEAT_RW_ZERO,
9+
IORING_MOCK_FEAT_RW_NOWAIT,
10+
IORING_MOCK_FEAT_RW_ASYNC,
11+
IORING_MOCK_FEAT_POLL,
12+
13+
IORING_MOCK_FEAT_END,
14+
};
15+
16+
struct io_uring_mock_probe {
17+
__u64 features;
18+
__u64 __resv[9];
19+
};
20+
21+
enum {
22+
IORING_MOCK_CREATE_F_SUPPORT_NOWAIT = 1,
23+
IORING_MOCK_CREATE_F_POLL = 2,
24+
};
25+
26+
struct io_uring_mock_create {
27+
__u32 out_fd;
28+
__u32 flags;
29+
__u64 file_size;
30+
__u64 rw_delay_ns;
31+
__u64 __resv[13];
32+
};
33+
34+
enum {
35+
IORING_MOCK_MGR_CMD_PROBE,
36+
IORING_MOCK_MGR_CMD_CREATE,
37+
};
38+
39+
enum {
40+
IORING_MOCK_CMD_COPY_REGBUF,
41+
};
42+
43+
enum {
44+
IORING_MOCK_COPY_FROM = 1,
45+
};
46+
47+
#endif

init/Kconfig

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1794,7 +1794,7 @@ config IO_URING
17941794

17951795
config GCOV_PROFILE_URING
17961796
bool "Enable GCOV profiling on the io_uring subsystem"
1797-
depends on GCOV_KERNEL
1797+
depends on IO_URING && GCOV_KERNEL
17981798
help
17991799
Enable GCOV profiling on the io_uring subsystem, to facilitate
18001800
code coverage testing.
@@ -1805,6 +1805,17 @@ config GCOV_PROFILE_URING
18051805
the io_uring subsystem, hence this should only be enabled for
18061806
specific test purposes.
18071807

1808+
config IO_URING_MOCK_FILE
1809+
tristate "Enable io_uring mock files (Experimental)" if EXPERT
1810+
default n
1811+
depends on IO_URING
1812+
help
1813+
Enable mock files for io_uring subsystem testing. The ABI might
1814+
still change, so it's still experimental and should only be enabled
1815+
for specific test purposes.
1816+
1817+
If unsure, say N.
1818+
18081819
config ADVISE_SYSCALLS
18091820
bool "Enable madvise/fadvise syscalls" if EXPERT
18101821
default y

io_uring/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ obj-$(CONFIG_EPOLL) += epoll.o
2121
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
2222
obj-$(CONFIG_NET) += net.o cmd_net.o
2323
obj-$(CONFIG_PROC_FS) += fdinfo.o
24+
obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o

0 commit comments

Comments
 (0)