@@ -142,22 +142,33 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
         __kfree_skb(skb);
 }

-static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
-                               struct sk_buff *from)
+static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+                                 struct sk_buff *from, bool *fragstolen,
+                                 int *delta)
 {
-        bool fragstolen;
-        int delta;
+        int limit = READ_ONCE(sk->sk_rcvbuf);

         if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
             MPTCP_SKB_CB(from)->offset ||
-            ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) ||
-            !skb_try_coalesce(to, from, &fragstolen, &delta))
+            ((to->len + from->len) > (limit >> 3)) ||
+            !skb_try_coalesce(to, from, fragstolen, delta))
                 return false;

         pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
                  MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
                  to->len, MPTCP_SKB_CB(from)->end_seq);
         MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+        return true;
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+                               struct sk_buff *from)
+{
+        bool fragstolen;
+        int delta;
+
+        if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta))
+                return false;

         /* note the fwd memory can reach a negative value after accounting
          * for the delta, but the later skb free will restore a non
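
The split leaves mptcp_try_coalesce() as a thin wrapper and exposes fragstolen and delta to callers. For readers unfamiliar with the idiom, a hypothetical caller (not part of this patch) would consume the out-parameters the same way tcp_try_coalesce() users do:

        bool fragstolen;
        int delta;

        if (__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta)) {
                /* 'from' payload now lives in 'to': free the shell,
                 * preserving the stolen head when fragstolen is set
                 */
                kfree_skb_partial(from, fragstolen);
                /* 'to' grew by 'delta' bytes of truesize: charge them */
                atomic_add(delta, &sk->sk_rmem_alloc);
                sk_mem_charge(sk, delta);
        }
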
@@ -179,6 +190,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
         return mptcp_try_coalesce((struct sock *)msk, to, from);
 }

+/* "inspired" by tcp_rcvbuf_grow(), main differences:
+ * - mptcp does not maintain a msk-level window clamp
+ * - returns true when the receive buffer is actually updated
+ */
+static bool mptcp_rcvbuf_grow(struct sock *sk)
+{
+        struct mptcp_sock *msk = mptcp_sk(sk);
+        const struct net *net = sock_net(sk);
+        int rcvwin, rcvbuf, cap;
+
+        if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+            (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+                return false;
+
+        rcvwin = msk->rcvq_space.space << 1;
+
+        if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+                rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
+
+        cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+        rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
+        if (rcvbuf > sk->sk_rcvbuf) {
+                WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+                return true;
+        }
+        return false;
+}
+
 /* "inspired" by tcp_data_queue_ofo(), main differences:
  * - use mptcp seqs
  * - don't cope with sacks
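
mptcp_rcvbuf_grow() doubles the receive-queue occupancy measured over the last round trip and, when the out-of-order queue is non-empty, additionally covers the span between the highest OoO sequence and ack_seq, so a reordering burst cannot shrink the usable window. A toy model of the sizing math, with the overhead factor standing in for what mptcp_space_from_win() really computes:

/* illustrative only: assumes a flat ~1.5x truesize overhead, while the
 * kernel derives the real window-to-buffer factor from scaling state
 */
static int toy_rcvbuf_target(int measured, int ooo_span, int cap)
{
        int rcvwin = (measured << 1) + ooo_span; /* double + OoO backlog */
        int rcvbuf = rcvwin + rcvwin / 2;        /* window -> buffer size */

        return rcvbuf < cap ? rcvbuf : cap;      /* clamp to tcp_rmem[2] */
}
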
@@ -292,29 +332,16 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 end:
         skb_condense(skb);
         skb_set_owner_r(skb, sk);
+        /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+        if (sk->sk_socket)
+                mptcp_rcvbuf_grow(sk);
 }

-static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
-                             struct sk_buff *skb, unsigned int offset,
-                             size_t copy_len)
+static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
+                           int copy_len)
 {
-        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
-        struct sock *sk = (struct sock *)msk;
-        struct sk_buff *tail;
-        bool has_rxtstamp;
-
-        __skb_unlink(skb, &ssk->sk_receive_queue);
-
-        skb_ext_reset(skb);
-        skb_orphan(skb);
-
-        /* try to fetch required memory from subflow */
-        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
-                goto drop;
-        }
-
-        has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+        const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+        bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;

         /* the skb map_seq accounts for the skb offset:
          * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
@@ -326,6 +353,24 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
         MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
         MPTCP_SKB_CB(skb)->cant_coalesce = 0;

+        __skb_unlink(skb, &ssk->sk_receive_queue);
+
+        skb_ext_reset(skb);
+        skb_dst_drop(skb);
+}
+
+static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
+{
+        u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
+        struct mptcp_sock *msk = mptcp_sk(sk);
+        struct sk_buff *tail;
+
+        /* try to fetch required memory from subflow */
+        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
+                goto drop;
+        }
+
         if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
                 /* in sequence */
                 msk->bytes_received += copy_len;
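
__mptcp_move_skb() now re-derives the payload length from the CB sequence numbers stamped by mptcp_init_skb(), which is why the offset/copy_len parameters could be dropped. The tail of the function (only partially visible in this hunk) dispatches on whether the mapping is in sequence; a simplified sketch, omitting the tail-coalescing the real code attempts first:

        if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
                /* in sequence: land directly on the msk receive queue */
                msk->ack_seq += copy_len;
                skb_set_owner_r(skb, sk);
                __skb_queue_tail(&sk->sk_receive_queue, skb);
        } else {
                /* out of order: park in the RB-tree keyed by map_seq */
                mptcp_data_queue_ofo(msk, skb);
        }
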
@@ -646,7 +691,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
                 if (offset < skb->len) {
                         size_t len = skb->len - offset;

-                        ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret;
+                        mptcp_init_skb(ssk, skb, offset, len);
+                        skb_orphan(skb);
+                        ret = __mptcp_move_skb(sk, skb) || ret;
                         seq += len;

                         if (unlikely(map_remaining < len)) {
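
The three-step hand-over at the call site makes the ownership transition explicit, and the order matters: skb_orphan() runs the skb destructor (sock_rfree() for receive-queue skbs), releasing the subflow's memory charge before __mptcp_move_skb() charges the msk through sk_rmem_schedule(). Condensed, with comments added:

                        mptcp_init_skb(ssk, skb, offset, len); /* stamp CB, unlink from ssk */
                        skb_orphan(skb);                       /* release ssk rmem charge */
                        ret = __mptcp_move_skb(sk, skb) || ret; /* charge msk, then queue */
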
@@ -767,12 +814,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)

         moved = __mptcp_move_skbs_from_subflow(msk, ssk);
         __mptcp_ofo_queue(msk);
-        if (unlikely(ssk->sk_err)) {
-                if (!sock_owned_by_user(sk))
-                        __mptcp_error_report(sk);
-                else
-                        __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
-        }
+        if (unlikely(ssk->sk_err))
+                __mptcp_subflow_error_report(sk, ssk);

         /* If the moves have caught up with the DATA_FIN sequence number
          * it's time to ack the DATA_FIN and change socket state, but
@@ -784,18 +827,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
         return moved;
 }

-static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
-{
-        if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
-                WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
-}
-
 static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
 {
         struct mptcp_sock *msk = mptcp_sk(sk);

-        __mptcp_rcvbuf_update(sk, ssk);
-
         /* Wake-up the reader only for in-sequence data */
         if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
                 sk->sk_data_ready(sk);
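
Two consolidations land here. First, the open-coded error propagation in move_skbs_to_msk() becomes a call to __mptcp_subflow_error_report(); judging only by the lines it replaces, the helper is assumed to factor exactly this logic:

/* assumed shape, reconstructed from the removed lines above */
static void __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
{
        if (!sock_owned_by_user(sk))
                __mptcp_error_report(sk);
        else
                __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
}

Second, __mptcp_rcvbuf_update() can go away entirely: the msk no longer mirrors the largest subflow rcvbuf, because mptcp_rcvbuf_grow() now sizes the msk buffer from its own measured demand.
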
@@ -1943,12 +1978,13 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
         }

         if (!(flags & MSG_PEEK)) {
-                /* avoid the indirect call, we know the destructor is sock_wfree */
+                /* avoid the indirect call, we know the destructor is sock_rfree */
                 skb->destructor = NULL;
+                skb->sk = NULL;
                 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
                 sk_mem_uncharge(sk, skb->truesize);
                 __skb_unlink(skb, &sk->sk_receive_queue);
-                __kfree_skb(skb);
+                skb_attempt_defer_free(skb);
                 msk->bytes_consumed += count;
         }

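The old comment named the wrong destructor: receive-queue skbs carry sock_rfree(), not sock_wfree(), and the atomic_sub()/sk_mem_uncharge() pair open-codes what that destructor would have done. For reference, sock_rfree() condenses to:

/* net/core/sock.c, condensed for reference */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int len = skb->truesize;

        atomic_sub(len, &sk->sk_rmem_alloc);
        sk_mem_uncharge(sk, len);
}

Clearing skb->sk as well is what makes skb_attempt_defer_free() safe: the skb may be freed later on the CPU that allocated it, batching remote frees, by which time this socket may no longer exist.
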
@@ -2013,48 +2049,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
         if (msk->rcvq_space.copied <= msk->rcvq_space.space)
                 goto new_measure;

-        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-                u64 rcvwin, grow;
-                int rcvbuf;
-
-                rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
-                grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
-                do_div(grow, msk->rcvq_space.space);
-                rcvwin += (grow << 1);
-
-                rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
-                               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
-                if (rcvbuf > sk->sk_rcvbuf) {
-                        u32 window_clamp;
-
-                        window_clamp = mptcp_win_from_space(sk, rcvbuf);
-                        WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+        msk->rcvq_space.space = msk->rcvq_space.copied;
+        if (mptcp_rcvbuf_grow(sk)) {

-                        /* Make subflows follow along. If we do not do this, we
-                         * get drops at subflow level if skbs can't be moved to
-                         * the mptcp rx queue fast enough (announced rcv_win can
-                         * exceed ssk->sk_rcvbuf).
-                         */
-                        mptcp_for_each_subflow(msk, subflow) {
-                                struct sock *ssk;
-                                bool slow;
+                /* Make subflows follow along. If we do not do this, we
+                 * get drops at subflow level if skbs can't be moved to
+                 * the mptcp rx queue fast enough (announced rcv_win can
+                 * exceed ssk->sk_rcvbuf).
+                 */
+                mptcp_for_each_subflow(msk, subflow) {
+                        struct sock *ssk;
+                        bool slow;

-                                ssk = mptcp_subflow_tcp_sock(subflow);
-                                slow = lock_sock_fast(ssk);
-                                WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
-                                WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
-                                if (tcp_can_send_ack(ssk))
-                                        tcp_cleanup_rbuf(ssk, 1);
-                                unlock_sock_fast(ssk, slow);
-                        }
+                        ssk = mptcp_subflow_tcp_sock(subflow);
+                        slow = lock_sock_fast(ssk);
+                        tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
+                        tcp_rcvbuf_grow(ssk);
+                        unlock_sock_fast(ssk, slow);
                 }
         }

-        msk->rcvq_space.space = msk->rcvq_space.copied;
 new_measure:
         msk->rcvq_space.copied = 0;
         msk->rcvq_space.time = mstamp;
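
The hand-rolled window estimate (twice the copied bytes, plus 16 * advmss, plus a proportional growth term) is replaced by the shared mptcp_rcvbuf_grow() path, fed by rcvq_space.space, which is now updated before the growth check. Each subflow then receives the msk-level measurement as its own DRS sample, reducing the per-subflow work to a seed-then-grow pair; the manual window_clamp write could be dropped on the assumption that tcp_rcvbuf_grow() maintains the subflow's clamp itself:

                        /* seed the subflow DRS sample with the msk measurement,
                         * then let native TCP autotuning resize ssk->sk_rcvbuf
                         * in one place
                         */
                        tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
                        tcp_rcvbuf_grow(ssk);
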
@@ -2083,11 +2097,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
         if (list_empty(&msk->conn_list))
                 return false;

-        /* verify we can move any data from the subflow, eventually updating */
-        if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
-                mptcp_for_each_subflow(msk, subflow)
-                        __mptcp_rcvbuf_update(sk, subflow->tcp_sock);
-
         subflow = list_first_entry(&msk->conn_list,
                                    struct mptcp_subflow_context, node);
         for (;;) {
@@ -2205,14 +2214,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                         break;
                 }

-                if (sk->sk_shutdown & RCV_SHUTDOWN) {
-                        /* race breaker: the shutdown could be after the
-                         * previous receive queue check
-                         */
-                        if (__mptcp_move_skbs(sk))
-                                continue;
+                if (sk->sk_shutdown & RCV_SHUTDOWN)
                         break;
-                }

                 if (sk->sk_state == TCP_CLOSE) {
                         copied = -ENOTCONN;