
Commit 74f7c52

Merge branch 'mptcp-receive-path-improvement'
Matthieu Baerts says:

====================
mptcp: receive path improvement

This series includes several changes to the MPTCP RX path. The main goals are improving RX performance and increasing long-term maintainability.

Some changes reflect recent(ish) improvements introduced in the TCP stack: patches 1, 2 and 3 are the MPTCP counterpart of the SKB deferral-free and receive-buffer auto-tuning improvements. Note that patch 3 could possibly fix additional issues, and overall such a patch should protect against similar issues arising in the future.

Patches 4-7 are aimed at introducing socket backlog usage, which will be done in a later series, to process the packets received by the different subflows while the msk socket is owned.

Patch 8 is not related to the RX path, but it contains additional tests for new features recently introduced in net-next.
====================

Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-0-5da266aa9c1a@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
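For reference, the core of the SKB deferral-free change mentioned above, condensed from the net/mptcp/protocol.c hunk further down (an excerpt of the patched recvmsg path, not a standalone snippet): once a fully consumed skb has been uncharged from the msk receive queue, it is handed to skb_attempt_defer_free(), which tries to free it on the CPU that allocated it instead of freeing it immediately with __kfree_skb():

	if (!(flags & MSG_PEEK)) {
		/* avoid the indirect call, we know the destructor is sock_rfree */
		skb->destructor = NULL;
		skb->sk = NULL;
		atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
		sk_mem_uncharge(sk, skb->truesize);
		__skb_unlink(skb, &sk->sk_receive_queue);
		skb_attempt_defer_free(skb);	/* was: __kfree_skb(skb) */
		msk->bytes_consumed += count;
	}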
2 parents f017c1f + c912f93 commit 74f7c52

6 files changed

Lines changed: 177 additions & 95 deletions

include/net/tcp.h

Lines changed: 1 addition & 0 deletions
@@ -370,6 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk);
 int tcp_ioctl(struct sock *sk, int cmd, int *karg);
 enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
+void tcp_rcvbuf_grow(struct sock *sk);
 void tcp_rcv_space_adjust(struct sock *sk);
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
 void tcp_twsk_destructor(struct sock *sk);

net/ipv4/tcp_input.c

Lines changed: 1 addition & 1 deletion
@@ -891,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	}
 }
 
-static void tcp_rcvbuf_grow(struct sock *sk)
+void tcp_rcvbuf_grow(struct sock *sk)
 {
 	const struct net *net = sock_net(sk);
 	struct tcp_sock *tp = tcp_sk(sk);

net/mptcp/protocol.c

Lines changed: 95 additions & 92 deletions
@@ -142,22 +142,33 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
-static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
-			       struct sk_buff *from)
+static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+				 struct sk_buff *from, bool *fragstolen,
+				 int *delta)
 {
-	bool fragstolen;
-	int delta;
+	int limit = READ_ONCE(sk->sk_rcvbuf);
 
 	if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
 	    MPTCP_SKB_CB(from)->offset ||
-	    ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) ||
-	    !skb_try_coalesce(to, from, &fragstolen, &delta))
+	    ((to->len + from->len) > (limit >> 3)) ||
+	    !skb_try_coalesce(to, from, fragstolen, delta))
 		return false;
 
 	pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
 		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
 		 to->len, MPTCP_SKB_CB(from)->end_seq);
 	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+	return true;
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+			       struct sk_buff *from)
+{
+	bool fragstolen;
+	int delta;
+
+	if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta))
+		return false;
 
 	/* note the fwd memory can reach a negative value after accounting
 	 * for the delta, but the later skb free will restore a non
@@ -179,6 +190,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
 	return mptcp_try_coalesce((struct sock *)msk, to, from);
 }
 
+/* "inspired" by tcp_rcvbuf_grow(), main difference:
+ * - mptcp does not maintain a msk-level window clamp
+ * - returns true when the receive buffer is actually updated
+ */
+static bool mptcp_rcvbuf_grow(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	const struct net *net = sock_net(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return false;
+
+	rcvwin = msk->rcvq_space.space << 1;
+
+	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+		rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		return true;
+	}
+	return false;
+}
+
 /* "inspired" by tcp_data_queue_ofo(), main differences:
  * - use mptcp seqs
  * - don't cope with sacks
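As a rough illustration of the growth heuristic above, a minimal userspace model (assumptions: space_from_win() is a hypothetical stand-in for mptcp_space_from_win() using a fixed ~50% overhead factor, TCP_RMEM_MAX stands in for sysctl_tcp_rmem[2], and kernel locking/annotations are omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TCP_RMEM_MAX (4u << 20)	/* stand-in for sysctl_tcp_rmem[2] */

/* hypothetical stand-in for mptcp_space_from_win(): buffer space needed
 * to advertise a given window, assuming ~50% skb overhead
 */
static uint32_t space_from_win(uint32_t win)
{
	return win + (win >> 1);
}

/* simplified model of mptcp_rcvbuf_grow(): target twice the bytes copied
 * in the last measurement window, plus any out-of-order range, capped at
 * the global maximum
 */
static bool rcvbuf_grow_model(uint32_t *rcvbuf, uint32_t space,
			      uint32_t ooo_range)
{
	uint32_t rcvwin = (space << 1) + ooo_range;
	uint32_t wanted = space_from_win(rcvwin);

	if (wanted > TCP_RMEM_MAX)
		wanted = TCP_RMEM_MAX;
	if (wanted <= *rcvbuf)
		return false;
	*rcvbuf = wanted;
	return true;
}

int main(void)
{
	uint32_t rcvbuf = 131072;	/* current sk_rcvbuf: 128 KiB */

	/* 256 KiB copied in the last window, no out-of-order data */
	if (rcvbuf_grow_model(&rcvbuf, 262144, 0))
		printf("rcvbuf grown to %u bytes\n", rcvbuf);
	return 0;
}

With these numbers the model grows the buffer to 786432 bytes (twice the 256 KiB measured, plus the assumed overhead), mirroring how the msk receive buffer is sized relative to the last measurement window.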
@@ -292,29 +332,16 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 end:
 	skb_condense(skb);
 	skb_set_owner_r(skb, sk);
+	/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+	if (sk->sk_socket)
+		mptcp_rcvbuf_grow(sk);
 }
 
-static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
-			     struct sk_buff *skb, unsigned int offset,
-			     size_t copy_len)
+static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
+			   int copy_len)
 {
-	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
-	struct sock *sk = (struct sock *)msk;
-	struct sk_buff *tail;
-	bool has_rxtstamp;
-
-	__skb_unlink(skb, &ssk->sk_receive_queue);
-
-	skb_ext_reset(skb);
-	skb_orphan(skb);
-
-	/* try to fetch required memory from subflow */
-	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
-		goto drop;
-	}
-
-	has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
 
 	/* the skb map_seq accounts for the skb offset:
 	 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
@@ -326,6 +353,24 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 	MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
 	MPTCP_SKB_CB(skb)->cant_coalesce = 0;
 
+	__skb_unlink(skb, &ssk->sk_receive_queue);
+
+	skb_ext_reset(skb);
+	skb_dst_drop(skb);
+}
+
+static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
+{
+	u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct sk_buff *tail;
+
+	/* try to fetch required memory from subflow */
+	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
+		goto drop;
+	}
+
 	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
 		/* in sequence */
 		msk->bytes_received += copy_len;
@@ -646,7 +691,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		if (offset < skb->len) {
 			size_t len = skb->len - offset;
 
-			ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret;
+			mptcp_init_skb(ssk, skb, offset, len);
+			skb_orphan(skb);
+			ret = __mptcp_move_skb(sk, skb) || ret;
 			seq += len;
 
 			if (unlikely(map_remaining < len)) {
@@ -767,12 +814,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 
 	moved = __mptcp_move_skbs_from_subflow(msk, ssk);
 	__mptcp_ofo_queue(msk);
-	if (unlikely(ssk->sk_err)) {
-		if (!sock_owned_by_user(sk))
-			__mptcp_error_report(sk);
-		else
-			__set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
-	}
+	if (unlikely(ssk->sk_err))
+		__mptcp_subflow_error_report(sk, ssk);
 
 	/* If the moves have caught up with the DATA_FIN sequence number
 	 * it's time to ack the DATA_FIN and change socket state, but
@@ -784,18 +827,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	return moved;
 }
 
-static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
-{
-	if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
-		WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
-}
-
 static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
-	__mptcp_rcvbuf_update(sk, ssk);
-
 	/* Wake-up the reader only for in-sequence data */
 	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
 		sk->sk_data_ready(sk);
@@ -1943,12 +1978,13 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
 		}
 
 		if (!(flags & MSG_PEEK)) {
-			/* avoid the indirect call, we know the destructor is sock_wfree */
+			/* avoid the indirect call, we know the destructor is sock_rfree */
 			skb->destructor = NULL;
+			skb->sk = NULL;
 			atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 			sk_mem_uncharge(sk, skb->truesize);
 			__skb_unlink(skb, &sk->sk_receive_queue);
-			__kfree_skb(skb);
+			skb_attempt_defer_free(skb);
 			msk->bytes_consumed += count;
 		}
 
@@ -2013,48 +2049,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
 		goto new_measure;
 
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
-		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
-		do_div(grow, msk->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
-		if (rcvbuf > sk->sk_rcvbuf) {
-			u32 window_clamp;
-
-			window_clamp = mptcp_win_from_space(sk, rcvbuf);
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+	msk->rcvq_space.space = msk->rcvq_space.copied;
+	if (mptcp_rcvbuf_grow(sk)) {
 
-			/* Make subflows follow along. If we do not do this, we
-			 * get drops at subflow level if skbs can't be moved to
-			 * the mptcp rx queue fast enough (announced rcv_win can
-			 * exceed ssk->sk_rcvbuf).
-			 */
-			mptcp_for_each_subflow(msk, subflow) {
-				struct sock *ssk;
-				bool slow;
+		/* Make subflows follow along. If we do not do this, we
+		 * get drops at subflow level if skbs can't be moved to
+		 * the mptcp rx queue fast enough (announced rcv_win can
+		 * exceed ssk->sk_rcvbuf).
+		 */
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *ssk;
+			bool slow;
 
-				ssk = mptcp_subflow_tcp_sock(subflow);
-				slow = lock_sock_fast(ssk);
-				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
-				WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
-				if (tcp_can_send_ack(ssk))
-					tcp_cleanup_rbuf(ssk, 1);
-				unlock_sock_fast(ssk, slow);
-			}
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			slow = lock_sock_fast(ssk);
+			tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
+			tcp_rcvbuf_grow(ssk);
+			unlock_sock_fast(ssk, slow);
 		}
 	}
 
-	msk->rcvq_space.space = msk->rcvq_space.copied;
 new_measure:
 	msk->rcvq_space.copied = 0;
 	msk->rcvq_space.time = mstamp;
@@ -2083,11 +2097,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
 	if (list_empty(&msk->conn_list))
 		return false;
 
-	/* verify we can move any data from the subflow, eventually updating */
-	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
-		mptcp_for_each_subflow(msk, subflow)
-			__mptcp_rcvbuf_update(sk, subflow->tcp_sock);
-
 	subflow = list_first_entry(&msk->conn_list,
 				   struct mptcp_subflow_context, node);
 	for (;;) {
@@ -2205,14 +2214,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			break;
 		}
 
-		if (sk->sk_shutdown & RCV_SHUTDOWN) {
-			/* race breaker: the shutdown could be after the
-			 * previous receive queue check
-			 */
-			if (__mptcp_move_skbs(sk))
-				continue;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
 			break;
-		}
 
 		if (sk->sk_state == TCP_CLOSE) {
 			copied = -ENOTCONN;

net/mptcp/protocol.h

Lines changed: 2 additions & 2 deletions
@@ -341,8 +341,8 @@ struct mptcp_sock {
 	struct mptcp_pm_data pm;
 	struct mptcp_sched_ops *sched;
 	struct {
-		u32 space;	/* bytes copied in last measurement window */
-		u32 copied;	/* bytes copied in this measurement window */
+		int space;	/* bytes copied in last measurement window */
+		int copied;	/* bytes copied in this measurement window */
 		u64 time;	/* start time of measurement window */
 		u64 rtt_us;	/* last maximum rtt of subflows */
 	} rcvq_space;
