// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <net/xfrm.h>
#include "protocol.h"
#include "mib.h"

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

struct mptcp_skb_cb {
	u64 map_seq;
	u64 end_seq;
	u32 offset;
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))

static struct percpu_counter mptcp_sockets_allocated;

static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
	if (!msk->subflow || READ_ONCE(msk->can_ack))
		return NULL;

	return msk->subflow;
}

/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
	return atomic64_read(&msk->wnd_end);
}

static bool mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		sock->ops = &inet6_stream_ops;
		return true;
#endif
	}

	return false;
}

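/* Descriptive note: the helper below returns the first subflow socket once the
 * connection has fallen back to plain TCP, and NULL otherwise. Callers must
 * hold the msk-level socket lock, as asserted by sock_owned_by_me().
 */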
static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);

	if (likely(!__mptcp_check_fallback(msk)))
		return NULL;

	return msk->first;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return err;

	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	sock_hold(ssock->sk);
	subflow->request_mptcp = 1;

	/* accept() will wait on first subflow sk_wq, and we always wake up
	 * via msk->sk_socket
	 */
	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

	return 0;
}

static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
	sk_drops_add(sk, skb);
	__kfree_skb(skb);
}

static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
			       struct sk_buff *from)
{
	bool fragstolen;
	int delta;

	if (MPTCP_SKB_CB(from)->offset ||
	    !skb_try_coalesce(to, from, &fragstolen, &delta))
		return false;

	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx",
		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
		 to->len, MPTCP_SKB_CB(from)->end_seq);
	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
	kfree_skb_partial(from, fragstolen);
	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);
	return true;
}

static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
				   struct sk_buff *from)
{
	if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
		return false;

	return mptcp_try_coalesce((struct sock *)msk, to, from);
}

/* "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
 */
static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
	struct sock *sk = (struct sock *)msk;
	struct rb_node **p, *parent;
	u64 seq, end_seq, max_seq;
	struct sk_buff *skb1;
	int space;

	seq = MPTCP_SKB_CB(skb)->map_seq;
	end_seq = MPTCP_SKB_CB(skb)->end_seq;
	space = tcp_space(sk);
	max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;

	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
	if (after64(seq, max_seq)) {
		/* out of window */
		mptcp_drop(sk, skb);
		pr_debug("oow by %ld", (unsigned long)seq - (unsigned long)max_seq);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
		return;
	}

	p = &msk->out_of_order_queue.rb_node;
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
	if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
		msk->ooo_last_skb = skb;
		goto end;
	}

	/* with 2 subflows, adding at end of ooo queue is quite likely.
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
199 */ 200 if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { 201 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 202 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 203 return; 204 } 205 206 /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ 207 if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { 208 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 209 parent = &msk->ooo_last_skb->rbnode; 210 p = &parent->rb_right; 211 goto insert; 212 } 213 214 /* Find place to insert this segment. Handle overlaps on the way. */ 215 parent = NULL; 216 while (*p) { 217 parent = *p; 218 skb1 = rb_to_skb(parent); 219 if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 220 p = &parent->rb_left; 221 continue; 222 } 223 if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { 224 if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { 225 /* All the bits are present. Drop. */ 226 mptcp_drop(sk, skb); 227 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 228 return; 229 } 230 if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 231 /* partial overlap: 232 * | skb | 233 * | skb1 | 234 * continue traversing 235 */ 236 } else { 237 /* skb's seq == skb1's seq and skb covers skb1. 238 * Replace skb1 with skb. 239 */ 240 rb_replace_node(&skb1->rbnode, &skb->rbnode, 241 &msk->out_of_order_queue); 242 mptcp_drop(sk, skb1); 243 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 244 goto merge_right; 245 } 246 } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { 247 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 248 return; 249 } 250 p = &parent->rb_right; 251 } 252 253 insert: 254 /* Insert segment into RB tree. */ 255 rb_link_node(&skb->rbnode, parent, p); 256 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); 257 258 merge_right: 259 /* Remove other segments covered by skb. */ 260 while ((skb1 = skb_rb_next(skb)) != NULL) { 261 if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) 262 break; 263 rb_erase(&skb1->rbnode, &msk->out_of_order_queue); 264 mptcp_drop(sk, skb1); 265 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 266 } 267 /* If there is no skb after us, we are the last_skb ! 
*/ 268 if (!skb1) 269 msk->ooo_last_skb = skb; 270 271 end: 272 skb_condense(skb); 273 skb_set_owner_r(skb, sk); 274 } 275 276 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, 277 struct sk_buff *skb, unsigned int offset, 278 size_t copy_len) 279 { 280 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 281 struct sock *sk = (struct sock *)msk; 282 struct sk_buff *tail; 283 284 __skb_unlink(skb, &ssk->sk_receive_queue); 285 286 skb_ext_reset(skb); 287 skb_orphan(skb); 288 289 /* try to fetch required memory from subflow */ 290 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 291 if (ssk->sk_forward_alloc < skb->truesize) 292 goto drop; 293 __sk_mem_reclaim(ssk, skb->truesize); 294 if (!sk_rmem_schedule(sk, skb, skb->truesize)) 295 goto drop; 296 } 297 298 /* the skb map_seq accounts for the skb offset: 299 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq 300 * value 301 */ 302 MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); 303 MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; 304 MPTCP_SKB_CB(skb)->offset = offset; 305 306 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 307 /* in sequence */ 308 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); 309 tail = skb_peek_tail(&sk->sk_receive_queue); 310 if (tail && mptcp_try_coalesce(sk, tail, skb)) 311 return true; 312 313 skb_set_owner_r(skb, sk); 314 __skb_queue_tail(&sk->sk_receive_queue, skb); 315 return true; 316 } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { 317 mptcp_data_queue_ofo(msk, skb); 318 return false; 319 } 320 321 /* old data, keep it simple and drop the whole pkt, sender 322 * will retransmit as needed, if needed. 323 */ 324 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 325 drop: 326 mptcp_drop(sk, skb); 327 return false; 328 } 329 330 static void mptcp_stop_timer(struct sock *sk) 331 { 332 struct inet_connection_sock *icsk = inet_csk(sk); 333 334 sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 335 mptcp_sk(sk)->timer_ival = 0; 336 } 337 338 static void mptcp_close_wake_up(struct sock *sk) 339 { 340 if (sock_flag(sk, SOCK_DEAD)) 341 return; 342 343 sk->sk_state_change(sk); 344 if (sk->sk_shutdown == SHUTDOWN_MASK || 345 sk->sk_state == TCP_CLOSE) 346 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 347 else 348 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 349 } 350 351 static void mptcp_check_data_fin_ack(struct sock *sk) 352 { 353 struct mptcp_sock *msk = mptcp_sk(sk); 354 355 if (__mptcp_check_fallback(msk)) 356 return; 357 358 /* Look for an acknowledged DATA_FIN */ 359 if (((1 << sk->sk_state) & 360 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && 361 msk->write_seq == atomic64_read(&msk->snd_una)) { 362 mptcp_stop_timer(sk); 363 364 WRITE_ONCE(msk->snd_data_fin_enable, 0); 365 366 switch (sk->sk_state) { 367 case TCP_FIN_WAIT1: 368 inet_sk_state_store(sk, TCP_FIN_WAIT2); 369 break; 370 case TCP_CLOSING: 371 case TCP_LAST_ACK: 372 inet_sk_state_store(sk, TCP_CLOSE); 373 break; 374 } 375 376 mptcp_close_wake_up(sk); 377 } 378 } 379 380 static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) 381 { 382 struct mptcp_sock *msk = mptcp_sk(sk); 383 384 if (READ_ONCE(msk->rcv_data_fin) && 385 ((1 << sk->sk_state) & 386 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { 387 u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); 388 389 if (msk->ack_seq == rcv_data_fin_seq) { 390 if (seq) 391 *seq = rcv_data_fin_seq; 392 393 return true; 394 } 395 } 396 397 return false; 398 } 399 400 static void 
mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) 401 { 402 long tout = ssk && inet_csk(ssk)->icsk_pending ? 403 inet_csk(ssk)->icsk_timeout - jiffies : 0; 404 405 if (tout <= 0) 406 tout = mptcp_sk(sk)->timer_ival; 407 mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; 408 } 409 410 static void mptcp_send_ack(struct mptcp_sock *msk) 411 { 412 struct mptcp_subflow_context *subflow; 413 414 mptcp_for_each_subflow(msk, subflow) { 415 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 416 417 lock_sock(ssk); 418 tcp_send_ack(ssk); 419 release_sock(ssk); 420 } 421 } 422 423 static bool mptcp_check_data_fin(struct sock *sk) 424 { 425 struct mptcp_sock *msk = mptcp_sk(sk); 426 u64 rcv_data_fin_seq; 427 bool ret = false; 428 429 if (__mptcp_check_fallback(msk) || !msk->first) 430 return ret; 431 432 /* Need to ack a DATA_FIN received from a peer while this side 433 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. 434 * msk->rcv_data_fin was set when parsing the incoming options 435 * at the subflow level and the msk lock was not held, so this 436 * is the first opportunity to act on the DATA_FIN and change 437 * the msk state. 438 * 439 * If we are caught up to the sequence number of the incoming 440 * DATA_FIN, send the DATA_ACK now and do state transition. If 441 * not caught up, do nothing and let the recv code send DATA_ACK 442 * when catching up. 443 */ 444 445 if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { 446 WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); 447 WRITE_ONCE(msk->rcv_data_fin, 0); 448 449 sk->sk_shutdown |= RCV_SHUTDOWN; 450 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 451 set_bit(MPTCP_DATA_READY, &msk->flags); 452 453 switch (sk->sk_state) { 454 case TCP_ESTABLISHED: 455 inet_sk_state_store(sk, TCP_CLOSE_WAIT); 456 break; 457 case TCP_FIN_WAIT1: 458 inet_sk_state_store(sk, TCP_CLOSING); 459 break; 460 case TCP_FIN_WAIT2: 461 inet_sk_state_store(sk, TCP_CLOSE); 462 break; 463 default: 464 /* Other states not expected */ 465 WARN_ON_ONCE(1); 466 break; 467 } 468 469 ret = true; 470 mptcp_set_timeout(sk, NULL); 471 mptcp_send_ack(msk); 472 mptcp_close_wake_up(sk); 473 } 474 return ret; 475 } 476 477 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, 478 struct sock *ssk, 479 unsigned int *bytes) 480 { 481 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 482 struct sock *sk = (struct sock *)msk; 483 unsigned int moved = 0; 484 bool more_data_avail; 485 struct tcp_sock *tp; 486 u32 old_copied_seq; 487 bool done = false; 488 int sk_rbuf; 489 490 sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 491 492 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 493 int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 494 495 if (unlikely(ssk_rbuf > sk_rbuf)) { 496 WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); 497 sk_rbuf = ssk_rbuf; 498 } 499 } 500 501 pr_debug("msk=%p ssk=%p", msk, ssk); 502 tp = tcp_sk(ssk); 503 old_copied_seq = tp->copied_seq; 504 do { 505 u32 map_remaining, offset; 506 u32 seq = tp->copied_seq; 507 struct sk_buff *skb; 508 bool fin; 509 510 /* try to move as much data as available */ 511 map_remaining = subflow->map_data_len - 512 mptcp_subflow_get_map_offset(subflow); 513 514 skb = skb_peek(&ssk->sk_receive_queue); 515 if (!skb) { 516 /* if no data is found, a racing workqueue/recvmsg 517 * already processed the new data, stop here or we 518 * can enter an infinite loop 519 */ 520 if (!moved) 521 done = true; 522 break; 523 } 524 525 if (__mptcp_check_fallback(msk)) { 526 /* if we are running under the workqueue, TCP 
could have
			 * collapsed skbs between dummy map creation and now
			 * be sure to adjust the size
			 */
			map_remaining = skb->len;
			subflow->map_data_len = skb->len;
		}

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			if (__mptcp_move_skb(msk, ssk, skb, offset, len))
				moved += len;
			seq += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
			done = true;
			break;
		}
	} while (more_data_avail);

	*bytes += moved;
	if (tp->copied_seq != old_copied_seq)
		tcp_cleanup_rbuf(ssk, 1);

	return done;
}

static bool mptcp_ofo_queue(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb, *tail;
	bool moved = false;
	struct rb_node *p;
	u64 end_seq;

	p = rb_first(&msk->out_of_order_queue);
	pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
	while (p) {
		skb = rb_to_skb(p);
		if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
			break;

		p = rb_next(p);
		rb_erase(&skb->rbnode, &msk->out_of_order_queue);

		if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
				      msk->ack_seq))) {
			mptcp_drop(sk, skb);
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
			continue;
		}

		end_seq = MPTCP_SKB_CB(skb)->end_seq;
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
			int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;

			/* skip overlapping data, if any */
			pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
				 delta);
			MPTCP_SKB_CB(skb)->offset += delta;
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		}
		msk->ack_seq = end_seq;
		moved = true;
	}
	return moved;
}

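/* Descriptive note: mptcp_ofo_queue() above moves in-order skbs from the
 * out-of-order rbtree to the msk receive queue, trimming any bytes already
 * covered by msk->ack_seq via the per-skb offset and advancing msk->ack_seq
 * as it goes. Draining fresh data from a subflow is done by
 * move_skbs_to_msk() below, which only ever grabs the msk lock without
 * sleeping.
 */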
/* In most cases we will be able to lock the mptcp socket. If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	if (READ_ONCE(sk->sk_lock.owned))
		return false;

	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
		return false;

	/* must re-check after taking the lock */
	if (!READ_ONCE(sk->sk_lock.owned)) {
		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		mptcp_ofo_queue(msk);

		/* If the moves have caught up with the DATA_FIN sequence number
		 * it's time to ack the DATA_FIN and change socket state, but
		 * this is not a good place to change state. Let the workqueue
		 * do it.
		 */
		if (mptcp_pending_data_fin(sk, NULL))
			mptcp_schedule_work(sk);
	}

	spin_unlock_bh(&sk->sk_lock.slock);

	return moved > 0;
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(sk);
	int sk_rbuf, ssk_rbuf;
	bool wake;

	/* move_skbs_to_msk below can legitimately clear the data_avail flag,
	 * but we will need it later to properly wake the reader, so cache its
	 * value
	 */
	wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
	if (wake)
		set_bit(MPTCP_DATA_READY, &msk->flags);

	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
	if (unlikely(ssk_rbuf > sk_rbuf))
		sk_rbuf = ssk_rbuf;

	/* over limit? can't append more skbs to msk */
	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
		goto wake;

	if (move_skbs_to_msk(msk, ssk))
		goto wake;

	/* mptcp socket is owned, release_cb should retry */
	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
			      &sk->sk_tsq_flags)) {
		sock_hold(sk);

		/* need to try again, it's possible release_cb() has already
		 * been called after the test_and_set_bit() above.
		 */
		move_skbs_to_msk(msk, ssk);
	}
wake:
	if (wake)
		sk->sk_data_ready(sk);
}

static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}

static bool mptcp_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;

	/* prevent rescheduling on close */
	if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
		return;

	/* should never be called with mptcp level timer cleared */
	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
	if (WARN_ON_ONCE(!tout))
		tout = TCP_RTO_MIN;
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}

bool mptcp_schedule_work(struct sock *sk)
{
	if (inet_sk_state_load(sk) != TCP_CLOSE &&
	    schedule_work(&mptcp_sk(sk)->work)) {
		/* each subflow already holds a reference to the sk, and the
		 * workqueue is invoked by a subflow, so sk can't go away here.
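		 * The reference taken here is released by mptcp_worker()
		 * via sock_put() once the work item has run, or by
		 * mptcp_cancel_work() if the pending work gets cancelled.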
730 */ 731 sock_hold(sk); 732 return true; 733 } 734 return false; 735 } 736 737 void mptcp_data_acked(struct sock *sk) 738 { 739 mptcp_reset_timer(sk); 740 741 if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) || 742 mptcp_send_head(sk) || 743 (inet_sk_state_load(sk) != TCP_ESTABLISHED))) 744 mptcp_schedule_work(sk); 745 } 746 747 void mptcp_subflow_eof(struct sock *sk) 748 { 749 if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) 750 mptcp_schedule_work(sk); 751 } 752 753 static void mptcp_check_for_eof(struct mptcp_sock *msk) 754 { 755 struct mptcp_subflow_context *subflow; 756 struct sock *sk = (struct sock *)msk; 757 int receivers = 0; 758 759 mptcp_for_each_subflow(msk, subflow) 760 receivers += !subflow->rx_eof; 761 if (receivers) 762 return; 763 764 if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { 765 /* hopefully temporary hack: propagate shutdown status 766 * to msk, when all subflows agree on it 767 */ 768 sk->sk_shutdown |= RCV_SHUTDOWN; 769 770 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 771 set_bit(MPTCP_DATA_READY, &msk->flags); 772 sk->sk_data_ready(sk); 773 } 774 775 switch (sk->sk_state) { 776 case TCP_ESTABLISHED: 777 inet_sk_state_store(sk, TCP_CLOSE_WAIT); 778 break; 779 case TCP_FIN_WAIT1: 780 /* fallback sockets skip TCP_CLOSING - TCP will take care */ 781 inet_sk_state_store(sk, TCP_CLOSE); 782 break; 783 default: 784 return; 785 } 786 mptcp_close_wake_up(sk); 787 } 788 789 static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) 790 { 791 const struct sock *sk = (const struct sock *)msk; 792 793 if (!msk->cached_ext) 794 msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); 795 796 return !!msk->cached_ext; 797 } 798 799 static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) 800 { 801 struct mptcp_subflow_context *subflow; 802 struct sock *sk = (struct sock *)msk; 803 804 sock_owned_by_me(sk); 805 806 mptcp_for_each_subflow(msk, subflow) { 807 if (subflow->data_avail) 808 return mptcp_subflow_tcp_sock(subflow); 809 } 810 811 return NULL; 812 } 813 814 static bool mptcp_skb_can_collapse_to(u64 write_seq, 815 const struct sk_buff *skb, 816 const struct mptcp_ext *mpext) 817 { 818 if (!tcp_skb_can_collapse_to(skb)) 819 return false; 820 821 /* can collapse only if MPTCP level sequence is in order and this 822 * mapping has not been xmitted yet 823 */ 824 return mpext && mpext->data_seq + mpext->data_len == write_seq && 825 !mpext->frozen; 826 } 827 828 static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, 829 const struct page_frag *pfrag, 830 const struct mptcp_data_frag *df) 831 { 832 return df && pfrag->page == df->page && 833 pfrag->size - pfrag->offset > 0 && 834 df->data_seq + df->data_len == msk->write_seq; 835 } 836 837 static void dfrag_uncharge(struct sock *sk, int len) 838 { 839 sk_mem_uncharge(sk, len); 840 sk_wmem_queued_add(sk, -len); 841 } 842 843 static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) 844 { 845 int len = dfrag->data_len + dfrag->overhead; 846 847 list_del(&dfrag->list); 848 dfrag_uncharge(sk, len); 849 put_page(dfrag->page); 850 } 851 852 static void mptcp_clean_una(struct sock *sk) 853 { 854 struct mptcp_sock *msk = mptcp_sk(sk); 855 struct mptcp_data_frag *dtmp, *dfrag; 856 bool cleaned = false; 857 u64 snd_una; 858 859 /* on fallback we just need to ignore snd_una, as this is really 860 * plain TCP 861 */ 862 if (__mptcp_check_fallback(msk)) 863 atomic64_set(&msk->snd_una, msk->snd_nxt); 864 865 snd_una = atomic64_read(&msk->snd_una); 866 867 
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { 868 if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) 869 break; 870 871 if (WARN_ON_ONCE(dfrag == msk->first_pending)) 872 break; 873 dfrag_clear(sk, dfrag); 874 cleaned = true; 875 } 876 877 dfrag = mptcp_rtx_head(sk); 878 if (dfrag && after64(snd_una, dfrag->data_seq)) { 879 u64 delta = snd_una - dfrag->data_seq; 880 881 if (WARN_ON_ONCE(delta > dfrag->already_sent)) 882 goto out; 883 884 dfrag->data_seq += delta; 885 dfrag->offset += delta; 886 dfrag->data_len -= delta; 887 dfrag->already_sent -= delta; 888 889 dfrag_uncharge(sk, delta); 890 cleaned = true; 891 } 892 893 out: 894 if (cleaned) 895 sk_mem_reclaim_partial(sk); 896 } 897 898 static void mptcp_clean_una_wakeup(struct sock *sk) 899 { 900 struct mptcp_sock *msk = mptcp_sk(sk); 901 902 mptcp_clean_una(sk); 903 904 /* Only wake up writers if a subflow is ready */ 905 if (sk_stream_is_writeable(sk)) { 906 clear_bit(MPTCP_NOSPACE, &msk->flags); 907 sk_stream_write_space(sk); 908 } 909 } 910 911 /* ensure we get enough memory for the frag hdr, beyond some minimal amount of 912 * data 913 */ 914 static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 915 { 916 struct mptcp_subflow_context *subflow; 917 struct mptcp_sock *msk = mptcp_sk(sk); 918 bool first = true; 919 920 if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), 921 pfrag, sk->sk_allocation))) 922 return true; 923 924 sk_stream_moderate_sndbuf(sk); 925 mptcp_for_each_subflow(msk, subflow) { 926 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 927 928 if (first) 929 tcp_enter_memory_pressure(ssk); 930 sk_stream_moderate_sndbuf(ssk); 931 first = false; 932 } 933 return false; 934 } 935 936 static struct mptcp_data_frag * 937 mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, 938 int orig_offset) 939 { 940 int offset = ALIGN(orig_offset, sizeof(long)); 941 struct mptcp_data_frag *dfrag; 942 943 dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); 944 dfrag->data_len = 0; 945 dfrag->data_seq = msk->write_seq; 946 dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); 947 dfrag->offset = offset + sizeof(struct mptcp_data_frag); 948 dfrag->already_sent = 0; 949 dfrag->page = pfrag->page; 950 951 return dfrag; 952 } 953 954 struct mptcp_sendmsg_info { 955 int mss_now; 956 int size_goal; 957 u16 limit; 958 u16 sent; 959 unsigned int flags; 960 }; 961 962 static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, 963 int avail_size) 964 { 965 u64 window_end = mptcp_wnd_end(msk); 966 967 if (__mptcp_check_fallback(msk)) 968 return avail_size; 969 970 if (!before64(data_seq + avail_size, window_end)) { 971 u64 allowed_size = window_end - data_seq; 972 973 return min_t(unsigned int, allowed_size, avail_size); 974 } 975 976 return avail_size; 977 } 978 979 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, 980 struct mptcp_data_frag *dfrag, 981 struct mptcp_sendmsg_info *info) 982 { 983 u64 data_seq = dfrag->data_seq + info->sent; 984 struct mptcp_sock *msk = mptcp_sk(sk); 985 bool zero_window_probe = false; 986 struct mptcp_ext *mpext = NULL; 987 struct sk_buff *skb, *tail; 988 bool can_collapse = false; 989 int avail_size; 990 size_t ret; 991 992 pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", 993 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); 994 995 /* compute send limit */ 996 info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); 
997 avail_size = info->size_goal; 998 skb = tcp_write_queue_tail(ssk); 999 if (skb) { 1000 /* Limit the write to the size available in the 1001 * current skb, if any, so that we create at most a new skb. 1002 * Explicitly tells TCP internals to avoid collapsing on later 1003 * queue management operation, to avoid breaking the ext <-> 1004 * SSN association set here 1005 */ 1006 mpext = skb_ext_find(skb, SKB_EXT_MPTCP); 1007 can_collapse = (info->size_goal - skb->len > 0) && 1008 mptcp_skb_can_collapse_to(data_seq, skb, mpext); 1009 if (!can_collapse) 1010 TCP_SKB_CB(skb)->eor = 1; 1011 else 1012 avail_size = info->size_goal - skb->len; 1013 } 1014 1015 /* Zero window and all data acked? Probe. */ 1016 avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); 1017 if (avail_size == 0) { 1018 if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt) 1019 return 0; 1020 zero_window_probe = true; 1021 data_seq = atomic64_read(&msk->snd_una) - 1; 1022 avail_size = 1; 1023 } 1024 1025 if (WARN_ON_ONCE(info->sent > info->limit || 1026 info->limit > dfrag->data_len)) 1027 return 0; 1028 1029 ret = info->limit - info->sent; 1030 tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, 1031 dfrag->offset + info->sent, &ret); 1032 if (!tail) { 1033 tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); 1034 return -ENOMEM; 1035 } 1036 1037 /* if the tail skb is still the cached one, collapsing really happened. 1038 */ 1039 if (skb == tail) { 1040 WARN_ON_ONCE(!can_collapse); 1041 mpext->data_len += ret; 1042 WARN_ON_ONCE(zero_window_probe); 1043 goto out; 1044 } 1045 1046 mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext); 1047 msk->cached_ext = NULL; 1048 1049 memset(mpext, 0, sizeof(*mpext)); 1050 mpext->data_seq = data_seq; 1051 mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; 1052 mpext->data_len = ret; 1053 mpext->use_map = 1; 1054 mpext->dsn64 = 1; 1055 1056 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", 1057 mpext->data_seq, mpext->subflow_seq, mpext->data_len, 1058 mpext->dsn64); 1059 1060 if (zero_window_probe) { 1061 mptcp_subflow_ctx(ssk)->rel_write_seq += ret; 1062 mpext->frozen = 1; 1063 ret = 0; 1064 tcp_push_pending_frames(ssk); 1065 } 1066 out: 1067 mptcp_subflow_ctx(ssk)->rel_write_seq += ret; 1068 return ret; 1069 } 1070 1071 static void mptcp_nospace(struct mptcp_sock *msk) 1072 { 1073 struct mptcp_subflow_context *subflow; 1074 1075 set_bit(MPTCP_NOSPACE, &msk->flags); 1076 smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ 1077 1078 mptcp_for_each_subflow(msk, subflow) { 1079 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 1080 bool ssk_writeable = sk_stream_is_writeable(ssk); 1081 struct socket *sock = READ_ONCE(ssk->sk_socket); 1082 1083 if (ssk_writeable || !sock) 1084 continue; 1085 1086 /* enables ssk->write_space() callbacks */ 1087 set_bit(SOCK_NOSPACE, &sock->flags); 1088 } 1089 1090 /* mptcp_data_acked() could run just before we set the NOSPACE bit, 1091 * so explicitly check for snd_una value 1092 */ 1093 mptcp_clean_una((struct sock *)msk); 1094 } 1095 1096 static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) 1097 { 1098 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 1099 1100 /* can't send if JOIN hasn't completed yet (i.e. 
is usable for mptcp) */ 1101 if (subflow->request_join && !subflow->fully_established) 1102 return false; 1103 1104 /* only send if our side has not closed yet */ 1105 return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); 1106 } 1107 1108 #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ 1109 sizeof(struct tcphdr) - \ 1110 MAX_TCP_OPTION_SPACE - \ 1111 sizeof(struct ipv6hdr) - \ 1112 sizeof(struct frag_hdr)) 1113 1114 struct subflow_send_info { 1115 struct sock *ssk; 1116 u64 ratio; 1117 }; 1118 1119 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, 1120 u32 *sndbuf) 1121 { 1122 struct subflow_send_info send_info[2]; 1123 struct mptcp_subflow_context *subflow; 1124 int i, nr_active = 0; 1125 struct sock *ssk; 1126 u64 ratio; 1127 u32 pace; 1128 1129 sock_owned_by_me((struct sock *)msk); 1130 1131 *sndbuf = 0; 1132 if (!mptcp_ext_cache_refill(msk)) 1133 return NULL; 1134 1135 if (__mptcp_check_fallback(msk)) { 1136 if (!msk->first) 1137 return NULL; 1138 *sndbuf = msk->first->sk_sndbuf; 1139 return sk_stream_memory_free(msk->first) ? msk->first : NULL; 1140 } 1141 1142 /* re-use last subflow, if the burst allow that */ 1143 if (msk->last_snd && msk->snd_burst > 0 && 1144 sk_stream_memory_free(msk->last_snd) && 1145 mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { 1146 mptcp_for_each_subflow(msk, subflow) { 1147 ssk = mptcp_subflow_tcp_sock(subflow); 1148 *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); 1149 } 1150 return msk->last_snd; 1151 } 1152 1153 /* pick the subflow with the lower wmem/wspace ratio */ 1154 for (i = 0; i < 2; ++i) { 1155 send_info[i].ssk = NULL; 1156 send_info[i].ratio = -1; 1157 } 1158 mptcp_for_each_subflow(msk, subflow) { 1159 ssk = mptcp_subflow_tcp_sock(subflow); 1160 if (!mptcp_subflow_active(subflow)) 1161 continue; 1162 1163 nr_active += !subflow->backup; 1164 *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); 1165 if (!sk_stream_memory_free(subflow->tcp_sock)) 1166 continue; 1167 1168 pace = READ_ONCE(ssk->sk_pacing_rate); 1169 if (!pace) 1170 continue; 1171 1172 ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, 1173 pace); 1174 if (ratio < send_info[subflow->backup].ratio) { 1175 send_info[subflow->backup].ssk = ssk; 1176 send_info[subflow->backup].ratio = ratio; 1177 } 1178 } 1179 1180 pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld", 1181 msk, nr_active, send_info[0].ssk, send_info[0].ratio, 1182 send_info[1].ssk, send_info[1].ratio); 1183 1184 /* pick the best backup if no other subflow is active */ 1185 if (!nr_active) 1186 send_info[0].ssk = send_info[1].ssk; 1187 1188 if (send_info[0].ssk) { 1189 msk->last_snd = send_info[0].ssk; 1190 msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, 1191 sk_stream_wspace(msk->last_snd)); 1192 return msk->last_snd; 1193 } 1194 return NULL; 1195 } 1196 1197 static void mptcp_push_release(struct sock *sk, struct sock *ssk, 1198 struct mptcp_sendmsg_info *info) 1199 { 1200 mptcp_set_timeout(sk, ssk); 1201 tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); 1202 release_sock(ssk); 1203 } 1204 1205 static void mptcp_push_pending(struct sock *sk, unsigned int flags) 1206 { 1207 struct sock *prev_ssk = NULL, *ssk = NULL; 1208 struct mptcp_sock *msk = mptcp_sk(sk); 1209 struct mptcp_sendmsg_info info = { 1210 .flags = flags, 1211 }; 1212 struct mptcp_data_frag *dfrag; 1213 int len, copied = 0; 1214 u32 sndbuf; 1215 1216 while ((dfrag = mptcp_send_head(sk))) { 1217 info.sent = dfrag->already_sent; 1218 info.limit = dfrag->data_len; 1219 len = dfrag->data_len - 
dfrag->already_sent; 1220 while (len > 0) { 1221 int ret = 0; 1222 1223 prev_ssk = ssk; 1224 __mptcp_flush_join_list(msk); 1225 ssk = mptcp_subflow_get_send(msk, &sndbuf); 1226 1227 /* do auto tuning */ 1228 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 1229 sndbuf > READ_ONCE(sk->sk_sndbuf)) 1230 WRITE_ONCE(sk->sk_sndbuf, sndbuf); 1231 1232 /* try to keep the subflow socket lock across 1233 * consecutive xmit on the same socket 1234 */ 1235 if (ssk != prev_ssk && prev_ssk) 1236 mptcp_push_release(sk, prev_ssk, &info); 1237 if (!ssk) 1238 goto out; 1239 1240 if (ssk != prev_ssk || !prev_ssk) 1241 lock_sock(ssk); 1242 1243 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 1244 if (ret <= 0) { 1245 mptcp_push_release(sk, ssk, &info); 1246 goto out; 1247 } 1248 1249 info.sent += ret; 1250 dfrag->already_sent += ret; 1251 msk->snd_nxt += ret; 1252 msk->snd_burst -= ret; 1253 copied += ret; 1254 len -= ret; 1255 } 1256 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1257 } 1258 1259 /* at this point we held the socket lock for the last subflow we used */ 1260 if (ssk) 1261 mptcp_push_release(sk, ssk, &info); 1262 1263 out: 1264 /* start the timer, if it's not pending */ 1265 if (!mptcp_timer_pending(sk)) 1266 mptcp_reset_timer(sk); 1267 if (copied) 1268 __mptcp_check_send_data_fin(sk); 1269 } 1270 1271 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) 1272 { 1273 struct mptcp_sock *msk = mptcp_sk(sk); 1274 struct page_frag *pfrag; 1275 size_t copied = 0; 1276 int ret = 0; 1277 long timeo; 1278 1279 if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) 1280 return -EOPNOTSUPP; 1281 1282 lock_sock(sk); 1283 1284 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1285 1286 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { 1287 ret = sk_stream_wait_connect(sk, &timeo); 1288 if (ret) 1289 goto out; 1290 } 1291 1292 pfrag = sk_page_frag(sk); 1293 mptcp_clean_una(sk); 1294 1295 while (msg_data_left(msg)) { 1296 struct mptcp_data_frag *dfrag; 1297 int frag_truesize = 0; 1298 bool dfrag_collapsed; 1299 size_t psize, offset; 1300 1301 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { 1302 ret = -EPIPE; 1303 goto out; 1304 } 1305 1306 /* reuse tail pfrag, if possible, or carve a new one from the 1307 * page allocator 1308 */ 1309 dfrag = mptcp_pending_tail(sk); 1310 dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); 1311 if (!dfrag_collapsed) { 1312 if (!sk_stream_memory_free(sk)) { 1313 mptcp_push_pending(sk, msg->msg_flags); 1314 if (!sk_stream_memory_free(sk)) 1315 goto wait_for_memory; 1316 } 1317 if (!mptcp_page_frag_refill(sk, pfrag)) 1318 goto wait_for_memory; 1319 1320 dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); 1321 frag_truesize = dfrag->overhead; 1322 } 1323 1324 /* we do not bound vs wspace, to allow a single packet. 
		 * memory accounting will prevent excessive memory usage
		 * anyway
		 */
		offset = dfrag->offset + dfrag->data_len;
		psize = pfrag->size - offset;
		psize = min_t(size_t, psize, msg_data_left(msg));
		if (!sk_wmem_schedule(sk, psize + frag_truesize))
			goto wait_for_memory;

		if (copy_page_from_iter(dfrag->page, offset, psize,
					&msg->msg_iter) != psize) {
			ret = -EFAULT;
			goto out;
		}

		/* data successfully copied into the write queue */
		copied += psize;
		dfrag->data_len += psize;
		frag_truesize += psize;
		pfrag->offset += frag_truesize;
		WRITE_ONCE(msk->write_seq, msk->write_seq + psize);

		/* charge data on mptcp pending queue to the msk socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk_wmem_queued_add(sk, frag_truesize);
		sk->sk_forward_alloc -= frag_truesize;
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			if (!msk->first_pending)
				WRITE_ONCE(msk->first_pending, dfrag);
		}
		pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
			 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
			 !dfrag_collapsed);

		if (!mptcp_ext_cache_refill(msk))
			goto wait_for_memory;
		continue;

wait_for_memory:
		mptcp_nospace(msk);
		if (mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;
	}

	if (copied)
		mptcp_push_pending(sk, msg->msg_flags);

out:
	release_sock(sk);
	return copied ? : ret;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}

static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}

/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
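 *
 * (The MPTCP-level receive buffer has to accommodate the slowest path, so the
 * worst-case rtt estimate and the largest advmss across the subflows feed the
 * tcp_rcv_space_adjust-style computation below.)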
1439 */ 1440 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) 1441 { 1442 struct mptcp_subflow_context *subflow; 1443 struct sock *sk = (struct sock *)msk; 1444 u32 time, advmss = 1; 1445 u64 rtt_us, mstamp; 1446 1447 sock_owned_by_me(sk); 1448 1449 if (copied <= 0) 1450 return; 1451 1452 msk->rcvq_space.copied += copied; 1453 1454 mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); 1455 time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); 1456 1457 rtt_us = msk->rcvq_space.rtt_us; 1458 if (rtt_us && time < (rtt_us >> 3)) 1459 return; 1460 1461 rtt_us = 0; 1462 mptcp_for_each_subflow(msk, subflow) { 1463 const struct tcp_sock *tp; 1464 u64 sf_rtt_us; 1465 u32 sf_advmss; 1466 1467 tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); 1468 1469 sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); 1470 sf_advmss = READ_ONCE(tp->advmss); 1471 1472 rtt_us = max(sf_rtt_us, rtt_us); 1473 advmss = max(sf_advmss, advmss); 1474 } 1475 1476 msk->rcvq_space.rtt_us = rtt_us; 1477 if (time < (rtt_us >> 3) || rtt_us == 0) 1478 return; 1479 1480 if (msk->rcvq_space.copied <= msk->rcvq_space.space) 1481 goto new_measure; 1482 1483 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && 1484 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 1485 int rcvmem, rcvbuf; 1486 u64 rcvwin, grow; 1487 1488 rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; 1489 1490 grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); 1491 1492 do_div(grow, msk->rcvq_space.space); 1493 rcvwin += (grow << 1); 1494 1495 rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); 1496 while (tcp_win_from_space(sk, rcvmem) < advmss) 1497 rcvmem += 128; 1498 1499 do_div(rcvwin, advmss); 1500 rcvbuf = min_t(u64, rcvwin * rcvmem, 1501 sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); 1502 1503 if (rcvbuf > sk->sk_rcvbuf) { 1504 u32 window_clamp; 1505 1506 window_clamp = tcp_win_from_space(sk, rcvbuf); 1507 WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 1508 1509 /* Make subflows follow along. If we do not do this, we 1510 * get drops at subflow level if skbs can't be moved to 1511 * the mptcp rx queue fast enough (announced rcv_win can 1512 * exceed ssk->sk_rcvbuf). 
1513 */ 1514 mptcp_for_each_subflow(msk, subflow) { 1515 struct sock *ssk; 1516 bool slow; 1517 1518 ssk = mptcp_subflow_tcp_sock(subflow); 1519 slow = lock_sock_fast(ssk); 1520 WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); 1521 tcp_sk(ssk)->window_clamp = window_clamp; 1522 tcp_cleanup_rbuf(ssk, 1); 1523 unlock_sock_fast(ssk, slow); 1524 } 1525 } 1526 } 1527 1528 msk->rcvq_space.space = msk->rcvq_space.copied; 1529 new_measure: 1530 msk->rcvq_space.copied = 0; 1531 msk->rcvq_space.time = mstamp; 1532 } 1533 1534 static bool __mptcp_move_skbs(struct mptcp_sock *msk) 1535 { 1536 unsigned int moved = 0; 1537 bool done; 1538 1539 /* avoid looping forever below on racing close */ 1540 if (((struct sock *)msk)->sk_state == TCP_CLOSE) 1541 return false; 1542 1543 __mptcp_flush_join_list(msk); 1544 do { 1545 struct sock *ssk = mptcp_subflow_recv_lookup(msk); 1546 bool slowpath; 1547 1548 if (!ssk) 1549 break; 1550 1551 slowpath = lock_sock_fast(ssk); 1552 done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 1553 unlock_sock_fast(ssk, slowpath); 1554 } while (!done); 1555 1556 if (mptcp_ofo_queue(msk) || moved > 0) { 1557 if (!mptcp_check_data_fin((struct sock *)msk)) 1558 mptcp_send_ack(msk); 1559 return true; 1560 } 1561 return false; 1562 } 1563 1564 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 1565 int nonblock, int flags, int *addr_len) 1566 { 1567 struct mptcp_sock *msk = mptcp_sk(sk); 1568 int copied = 0; 1569 int target; 1570 long timeo; 1571 1572 if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) 1573 return -EOPNOTSUPP; 1574 1575 lock_sock(sk); 1576 timeo = sock_rcvtimeo(sk, nonblock); 1577 1578 len = min_t(size_t, len, INT_MAX); 1579 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1580 __mptcp_flush_join_list(msk); 1581 1582 while (len > (size_t)copied) { 1583 int bytes_read; 1584 1585 bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); 1586 if (unlikely(bytes_read < 0)) { 1587 if (!copied) 1588 copied = bytes_read; 1589 goto out_err; 1590 } 1591 1592 copied += bytes_read; 1593 1594 if (skb_queue_empty(&sk->sk_receive_queue) && 1595 __mptcp_move_skbs(msk)) 1596 continue; 1597 1598 /* only the master socket status is relevant here. The exit 1599 * conditions mirror closely tcp_recvmsg() 1600 */ 1601 if (copied >= target) 1602 break; 1603 1604 if (copied) { 1605 if (sk->sk_err || 1606 sk->sk_state == TCP_CLOSE || 1607 (sk->sk_shutdown & RCV_SHUTDOWN) || 1608 !timeo || 1609 signal_pending(current)) 1610 break; 1611 } else { 1612 if (sk->sk_err) { 1613 copied = sock_error(sk); 1614 break; 1615 } 1616 1617 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) 1618 mptcp_check_for_eof(msk); 1619 1620 if (sk->sk_shutdown & RCV_SHUTDOWN) 1621 break; 1622 1623 if (sk->sk_state == TCP_CLOSE) { 1624 copied = -ENOTCONN; 1625 break; 1626 } 1627 1628 if (!timeo) { 1629 copied = -EAGAIN; 1630 break; 1631 } 1632 1633 if (signal_pending(current)) { 1634 copied = sock_intr_errno(timeo); 1635 break; 1636 } 1637 } 1638 1639 pr_debug("block timeout %ld", timeo); 1640 mptcp_wait_data(sk, &timeo); 1641 } 1642 1643 if (skb_queue_empty(&sk->sk_receive_queue)) { 1644 /* entire backlog drained, clear DATA_READY. */ 1645 clear_bit(MPTCP_DATA_READY, &msk->flags); 1646 1647 /* .. race-breaker: ssk might have gotten new data 1648 * after last __mptcp_move_skbs() returned false. 
1649 */ 1650 if (unlikely(__mptcp_move_skbs(msk))) 1651 set_bit(MPTCP_DATA_READY, &msk->flags); 1652 } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { 1653 /* data to read but mptcp_wait_data() cleared DATA_READY */ 1654 set_bit(MPTCP_DATA_READY, &msk->flags); 1655 } 1656 out_err: 1657 pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", 1658 msk, test_bit(MPTCP_DATA_READY, &msk->flags), 1659 skb_queue_empty(&sk->sk_receive_queue), copied); 1660 mptcp_rcv_space_adjust(msk, copied); 1661 1662 release_sock(sk); 1663 return copied; 1664 } 1665 1666 static void mptcp_retransmit_handler(struct sock *sk) 1667 { 1668 struct mptcp_sock *msk = mptcp_sk(sk); 1669 1670 if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->snd_nxt)) { 1671 mptcp_stop_timer(sk); 1672 } else { 1673 set_bit(MPTCP_WORK_RTX, &msk->flags); 1674 mptcp_schedule_work(sk); 1675 } 1676 } 1677 1678 static void mptcp_retransmit_timer(struct timer_list *t) 1679 { 1680 struct inet_connection_sock *icsk = from_timer(icsk, t, 1681 icsk_retransmit_timer); 1682 struct sock *sk = &icsk->icsk_inet.sk; 1683 1684 bh_lock_sock(sk); 1685 if (!sock_owned_by_user(sk)) { 1686 mptcp_retransmit_handler(sk); 1687 } else { 1688 /* delegate our work to tcp_release_cb() */ 1689 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, 1690 &sk->sk_tsq_flags)) 1691 sock_hold(sk); 1692 } 1693 bh_unlock_sock(sk); 1694 sock_put(sk); 1695 } 1696 1697 static void mptcp_timeout_timer(struct timer_list *t) 1698 { 1699 struct sock *sk = from_timer(sk, t, sk_timer); 1700 1701 mptcp_schedule_work(sk); 1702 } 1703 1704 /* Find an idle subflow. Return NULL if there is unacked data at tcp 1705 * level. 1706 * 1707 * A backup subflow is returned only if that is the only kind available. 1708 */ 1709 static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) 1710 { 1711 struct mptcp_subflow_context *subflow; 1712 struct sock *backup = NULL; 1713 1714 sock_owned_by_me((const struct sock *)msk); 1715 1716 if (__mptcp_check_fallback(msk)) 1717 return NULL; 1718 1719 mptcp_for_each_subflow(msk, subflow) { 1720 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 1721 1722 if (!mptcp_subflow_active(subflow)) 1723 continue; 1724 1725 /* still data outstanding at TCP level? Don't retransmit. */ 1726 if (!tcp_write_queue_empty(ssk)) 1727 return NULL; 1728 1729 if (subflow->backup) { 1730 if (!backup) 1731 backup = ssk; 1732 continue; 1733 } 1734 1735 return ssk; 1736 } 1737 1738 return backup; 1739 } 1740 1741 /* subflow sockets can be either outgoing (connect) or incoming 1742 * (accept). 1743 * 1744 * Outgoing subflows use in-kernel sockets. 1745 * Incoming subflows do not have their own 'struct socket' allocated, 1746 * so we need to use tcp_close() after detaching them from the mptcp 1747 * parent socket. 
 */
void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		       struct mptcp_subflow_context *subflow)
{
	bool dispose_socket = false;
	struct socket *sock;

	list_del(&subflow->node);

	lock_sock(ssk);

	/* if we are invoked by the msk cleanup code, the subflow is
	 * already orphaned
	 */
	sock = ssk->sk_socket;
	if (sock) {
		dispose_socket = sock != sk->sk_socket;
		sock_orphan(ssk);
	}

	/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops;
	 * the ssk has already been destroyed, we just need to release the
	 * reference owned by msk
	 */
	if (!inet_csk(ssk)->icsk_ulp_ops) {
		kfree_rcu(subflow, rcu);
	} else {
		/* otherwise ask tcp to dispose of ssk and subflow ctx */
		subflow->disposable = 1;
		__tcp_close(ssk, 0);

		/* close acquired an extra ref */
		__sock_put(ssk);
	}
	release_sock(ssk);
	if (dispose_socket)
		iput(SOCK_INODE(sock));

	sock_put(ssk);
}

static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}

static void pm_work(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
		mptcp_pm_nl_add_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
		mptcp_pm_nl_rm_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
		mptcp_pm_nl_fully_established(msk);
	}
	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
		mptcp_pm_nl_subflow_established(msk);
	}

	spin_unlock_bh(&msk->pm.lock);
}

static void __mptcp_close_subflow(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *tmp;

	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (inet_sk_state_load(ssk) != TCP_CLOSE)
			continue;

		__mptcp_close_ssk((struct sock *)msk, ssk, subflow);
	}
}

static bool mptcp_check_close_timeout(const struct sock *sk)
{
	s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
	struct mptcp_subflow_context *subflow;

	if (delta >= TCP_TIMEWAIT_LEN)
		return true;

	/* if all subflows are in closed status don't bother with additional
	 * timeout
	 */
	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
		if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
		    TCP_CLOSE)
			return false;
	}
	return true;
}

static void mptcp_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
	struct mptcp_sendmsg_info info = {};
	struct mptcp_data_frag *dfrag;
	size_t copied = 0;
	int state, ret;

	lock_sock(sk);
	set_bit(MPTCP_WORKER_RUNNING, &msk->flags);
	state = sk->sk_state;
	if (unlikely(state == TCP_CLOSE))
		goto unlock;

	mptcp_clean_una_wakeup(sk);
	mptcp_check_data_fin_ack(sk);
	__mptcp_flush_join_list(msk);
	if
(test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 1873 __mptcp_close_subflow(msk); 1874 1875 __mptcp_move_skbs(msk); 1876 if (mptcp_send_head(sk)) 1877 mptcp_push_pending(sk, 0); 1878 1879 if (msk->pm.status) 1880 pm_work(msk); 1881 1882 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) 1883 mptcp_check_for_eof(msk); 1884 1885 mptcp_check_data_fin(sk); 1886 1887 /* if the msk data is completely acked, or the socket timedout, 1888 * there is no point in keeping around an orphaned sk 1889 */ 1890 if (sock_flag(sk, SOCK_DEAD) && 1891 (mptcp_check_close_timeout(sk) || 1892 (state != sk->sk_state && 1893 ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) { 1894 inet_sk_state_store(sk, TCP_CLOSE); 1895 __mptcp_destroy_sock(sk); 1896 goto unlock; 1897 } 1898 1899 if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) 1900 goto unlock; 1901 1902 dfrag = mptcp_rtx_head(sk); 1903 if (!dfrag) 1904 goto unlock; 1905 1906 if (!mptcp_ext_cache_refill(msk)) 1907 goto reset_unlock; 1908 1909 ssk = mptcp_subflow_get_retrans(msk); 1910 if (!ssk) 1911 goto reset_unlock; 1912 1913 lock_sock(ssk); 1914 1915 /* limit retransmission to the bytes already sent on some subflows */ 1916 info.sent = 0; 1917 info.limit = dfrag->already_sent; 1918 while (info.sent < dfrag->already_sent) { 1919 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 1920 if (ret <= 0) 1921 break; 1922 1923 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); 1924 copied += ret; 1925 info.sent += ret; 1926 1927 if (!mptcp_ext_cache_refill(msk)) 1928 break; 1929 } 1930 if (copied) 1931 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 1932 info.size_goal); 1933 1934 mptcp_set_timeout(sk, ssk); 1935 release_sock(ssk); 1936 1937 reset_unlock: 1938 if (!mptcp_timer_pending(sk)) 1939 mptcp_reset_timer(sk); 1940 1941 unlock: 1942 clear_bit(MPTCP_WORKER_RUNNING, &msk->flags); 1943 release_sock(sk); 1944 sock_put(sk); 1945 } 1946 1947 static int __mptcp_init_sock(struct sock *sk) 1948 { 1949 struct mptcp_sock *msk = mptcp_sk(sk); 1950 1951 spin_lock_init(&msk->join_list_lock); 1952 1953 INIT_LIST_HEAD(&msk->conn_list); 1954 INIT_LIST_HEAD(&msk->join_list); 1955 INIT_LIST_HEAD(&msk->rtx_queue); 1956 INIT_WORK(&msk->work, mptcp_worker); 1957 msk->out_of_order_queue = RB_ROOT; 1958 msk->first_pending = NULL; 1959 1960 msk->first = NULL; 1961 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; 1962 1963 mptcp_pm_data_init(msk); 1964 1965 /* re-use the csk retrans timer for MPTCP-level retrans */ 1966 timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); 1967 timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); 1968 return 0; 1969 } 1970 1971 static int mptcp_init_sock(struct sock *sk) 1972 { 1973 struct net *net = sock_net(sk); 1974 int ret; 1975 1976 ret = __mptcp_init_sock(sk); 1977 if (ret) 1978 return ret; 1979 1980 if (!mptcp_is_enabled(net)) 1981 return -ENOPROTOOPT; 1982 1983 if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) 1984 return -ENOMEM; 1985 1986 ret = __mptcp_socket_create(mptcp_sk(sk)); 1987 if (ret) 1988 return ret; 1989 1990 sk_sockets_allocated_inc(sk); 1991 sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; 1992 sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; 1993 1994 return 0; 1995 } 1996 1997 static void __mptcp_clear_xmit(struct sock *sk) 1998 { 1999 struct mptcp_sock *msk = mptcp_sk(sk); 2000 struct mptcp_data_frag *dtmp, *dfrag; 2001 2002 sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); 2003 2004 WRITE_ONCE(msk->first_pending, NULL); 2005 
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) 2006 dfrag_clear(sk, dfrag); 2007 } 2008 2009 static void mptcp_cancel_work(struct sock *sk) 2010 { 2011 struct mptcp_sock *msk = mptcp_sk(sk); 2012 2013 /* if called by the work itself, do not try to cancel the work, or 2014 * we will hang. 2015 */ 2016 if (!test_bit(MPTCP_WORKER_RUNNING, &msk->flags) && 2017 cancel_work_sync(&msk->work)) 2018 __sock_put(sk); 2019 } 2020 2021 void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) 2022 { 2023 lock_sock(ssk); 2024 2025 switch (ssk->sk_state) { 2026 case TCP_LISTEN: 2027 if (!(how & RCV_SHUTDOWN)) 2028 break; 2029 fallthrough; 2030 case TCP_SYN_SENT: 2031 tcp_disconnect(ssk, O_NONBLOCK); 2032 break; 2033 default: 2034 if (__mptcp_check_fallback(mptcp_sk(sk))) { 2035 pr_debug("Fallback"); 2036 ssk->sk_shutdown |= how; 2037 tcp_shutdown(ssk, how); 2038 } else { 2039 pr_debug("Sending DATA_FIN on subflow %p", ssk); 2040 mptcp_set_timeout(sk, ssk); 2041 tcp_send_ack(ssk); 2042 } 2043 break; 2044 } 2045 2046 release_sock(ssk); 2047 } 2048 2049 static const unsigned char new_state[16] = { 2050 /* current state: new state: action: */ 2051 [0 /* (Invalid) */] = TCP_CLOSE, 2052 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 2053 [TCP_SYN_SENT] = TCP_CLOSE, 2054 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 2055 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, 2056 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, 2057 [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ 2058 [TCP_CLOSE] = TCP_CLOSE, 2059 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, 2060 [TCP_LAST_ACK] = TCP_LAST_ACK, 2061 [TCP_LISTEN] = TCP_CLOSE, 2062 [TCP_CLOSING] = TCP_CLOSING, 2063 [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ 2064 }; 2065 2066 static int mptcp_close_state(struct sock *sk) 2067 { 2068 int next = (int)new_state[sk->sk_state]; 2069 int ns = next & TCP_STATE_MASK; 2070 2071 inet_sk_state_store(sk, ns); 2072 2073 return next & TCP_ACTION_FIN; 2074 } 2075 2076 static void __mptcp_check_send_data_fin(struct sock *sk) 2077 { 2078 struct mptcp_subflow_context *subflow; 2079 struct mptcp_sock *msk = mptcp_sk(sk); 2080 2081 pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", 2082 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), 2083 msk->snd_nxt, msk->write_seq); 2084 2085 /* we still need to enqueue subflows or not really shutting down, 2086 * skip this 2087 */ 2088 if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || 2089 mptcp_send_head(sk)) 2090 return; 2091 2092 WRITE_ONCE(msk->snd_nxt, msk->write_seq); 2093 2094 /* fallback socket will not get data_fin/ack, can move to close now */ 2095 if (__mptcp_check_fallback(msk) && sk->sk_state == TCP_LAST_ACK) { 2096 inet_sk_state_store(sk, TCP_CLOSE); 2097 mptcp_close_wake_up(sk); 2098 } 2099 2100 __mptcp_flush_join_list(msk); 2101 mptcp_for_each_subflow(msk, subflow) { 2102 struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); 2103 2104 mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); 2105 } 2106 } 2107 2108 static void __mptcp_wr_shutdown(struct sock *sk) 2109 { 2110 struct mptcp_sock *msk = mptcp_sk(sk); 2111 2112 pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", 2113 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, 2114 !!mptcp_send_head(sk)); 2115 2116 /* will be ignored by fallback sockets */ 2117 WRITE_ONCE(msk->write_seq, msk->write_seq + 1); 2118 WRITE_ONCE(msk->snd_data_fin_enable, 1); 2119 2120 __mptcp_check_send_data_fin(sk); 2121 } 2122 2123 static 
static void __mptcp_destroy_sock(struct sock *sk)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);

	pr_debug("msk=%p", msk);

	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
	 */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
	list_splice_init(&msk->conn_list, &conn_list);

	__mptcp_clear_xmit(sk);
	sk_stop_timer(sk, &sk->sk_timer);
	msk->pm.status = 0;

	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		__mptcp_close_ssk(sk, ssk, subflow);
	}

	sk->sk_prot->destroy(sk);

	sk_stream_kill_queues(sk);
	xfrm_sk_free_policy(sk);
	sk_refcnt_debug_release(sk);
	sock_put(sk);
}

static void mptcp_close(struct sock *sk, long timeout)
{
	struct mptcp_subflow_context *subflow;
	bool do_cancel_work = false;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
		inet_sk_state_store(sk, TCP_CLOSE);
		goto cleanup;
	}

	if (mptcp_close_state(sk))
		__mptcp_wr_shutdown(sk);

	sk_stream_wait_close(sk, timeout);

cleanup:
	/* orphan all the subflows */
	inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
	list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow, dispose_socket;
		struct socket *sock;

		slow = lock_sock_fast(ssk);
		sock = ssk->sk_socket;
		dispose_socket = sock && sock != sk->sk_socket;
		sock_orphan(ssk);
		unlock_sock_fast(ssk, slow);

		/* for the outgoing subflows we additionally need to free
		 * the associated socket
		 */
		if (dispose_socket)
			iput(SOCK_INODE(sock));
	}
	sock_orphan(sk);

	sock_hold(sk);
	pr_debug("msk=%p state=%d", sk, sk->sk_state);
	if (sk->sk_state == TCP_CLOSE) {
		__mptcp_destroy_sock(sk);
		do_cancel_work = true;
	} else {
		sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
	}
	release_sock(sk);
	if (do_cancel_work)
		mptcp_cancel_work(sk);
	sock_put(sk);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
	struct ipv6_pinfo *msk6 = inet6_sk(msk);

	msk->sk_v6_daddr = ssk->sk_v6_daddr;
	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

	if (msk6 && ssk6) {
		msk6->saddr = ssk6->saddr;
		msk6->flow_label = ssk6->flow_label;
	}
#endif

	inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
	inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
	inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
	inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
	inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

static int mptcp_disconnect(struct sock *sk, int flags)
{
	/* Should never be called.
	 * inet_stream_connect() calls ->disconnect, but that
	 * refers to the subflow socket, not the mptcp one.
	 */
	WARN_ON_ONCE(1);
	return 0;
}

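/* On IPv6 the ipv6_pinfo area lives right after the msk inside
 * struct mptcp6_sock; compute its address from the socket pointer,
 * much like plain TCP/IPv6 does for tcp6_sock.
 */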
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif

struct sock *mptcp_sk_clone(const struct sock *sk,
			    const struct mptcp_options_received *mp_opt,
			    struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
	struct mptcp_sock *msk;
	u64 ack_seq;

	if (!nsk)
		return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (nsk->sk_family == AF_INET6)
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;
	WRITE_ONCE(msk->fully_established, false);

	msk->write_seq = subflow_req->idsn + 1;
	msk->snd_nxt = msk->write_seq;
	atomic64_set(&msk->snd_una, msk->write_seq);
	atomic64_set(&msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd);

	if (mp_opt->mp_capable) {
		msk->can_ack = true;
		msk->remote_key = mp_opt->sndr_key;
		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
		ack_seq++;
		WRITE_ONCE(msk->ack_seq, ack_seq);
	}

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);
	bh_unlock_sock(nsk);

	/* keep a single reference */
	__sock_put(nsk);
	return nsk;
}

void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
	const struct tcp_sock *tp = tcp_sk(ssk);

	msk->rcvq_space.copied = 0;
	msk->rcvq_space.rtt_us = 0;

	msk->rcvq_space.time = tp->tcp_mstamp;

	/* initial rcv_space offering made to peer */
	msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
				      TCP_INIT_CWND * tp->advmss);
	if (msk->rcvq_space.space == 0)
		msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;

	atomic64_set(&msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}

static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
				 bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *listener;
	struct sock *newsk;

	listener = __mptcp_nmpc_socket(msk);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
	}

	pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
	newsk = inet_csk_accept(listener->sk, flags, err, kern);
	if (!newsk)
		return NULL;

	pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
	if (sk_is_mptcp(newsk)) {
		struct mptcp_subflow_context *subflow;
		struct sock *new_mptcp_sock;
		struct sock *ssk = newsk;

		subflow = mptcp_subflow_ctx(newsk);
		new_mptcp_sock = subflow->conn;

		/* is_mptcp should be false if subflow->conn is missing, see
		 * subflow_syn_recv_sock()
		 */
		if (WARN_ON_ONCE(!new_mptcp_sock)) {
			tcp_sk(newsk)->is_mptcp = 0;
			return newsk;
		}

		/* acquire the 2nd reference for the owning socket */
		sock_hold(new_mptcp_sock);

		local_bh_disable();
		bh_lock_sock(new_mptcp_sock);
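		/* hand the accepted TCP subflow over to the msk created by
		 * subflow_syn_recv_sock(): record it as the first subflow,
		 * inherit its addresses and link it on the connection list,
		 * all while holding the msk bh lock.
		 */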
		msk = mptcp_sk(new_mptcp_sock);
		msk->first = newsk;

		newsk = new_mptcp_sock;
		mptcp_copy_inaddrs(newsk, ssk);
		list_add(&subflow->node, &msk->conn_list);
		sock_hold(ssk);

		mptcp_rcv_space_init(msk, ssk);
		bh_unlock_sock(new_mptcp_sock);

		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
		local_bh_enable();
	} else {
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

	return newsk;
}

void mptcp_destroy_common(struct mptcp_sock *msk)
{
	skb_rbtree_purge(&msk->out_of_order_queue);
	mptcp_token_destroy(msk);
	mptcp_pm_free_anno_list(msk);
}

static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (msk->cached_ext)
		__skb_ext_put(msk->cached_ext);

	mptcp_destroy_common(msk);
	sk_sockets_allocated_dec(sk);
}

static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int ret;

	switch (optname) {
	case SO_REUSEPORT:
	case SO_REUSEADDR:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
		if (ret == 0) {
			if (optname == SO_REUSEPORT)
				sk->sk_reuseport = ssock->sk->sk_reuseport;
			else if (optname == SO_REUSEADDR)
				sk->sk_reuse = ssock->sk->sk_reuse;
		}
		release_sock(sk);
		return ret;
	}

	return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
}

static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	int ret = -EOPNOTSUPP;
	struct socket *ssock;

	switch (optname) {
	case IPV6_V6ONLY:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
		if (ret == 0)
			sk->sk_ipv6only = ssock->sk->sk_ipv6only;

		release_sock(sk);
		break;
	}

	return ret;
}

static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    sockptr_t optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
2471 */ 2472 lock_sock(sk); 2473 ssk = __mptcp_tcp_fallback(msk); 2474 release_sock(sk); 2475 if (ssk) 2476 return tcp_setsockopt(ssk, level, optname, optval, optlen); 2477 2478 if (level == SOL_IPV6) 2479 return mptcp_setsockopt_v6(msk, optname, optval, optlen); 2480 2481 return -EOPNOTSUPP; 2482 } 2483 2484 static int mptcp_getsockopt(struct sock *sk, int level, int optname, 2485 char __user *optval, int __user *option) 2486 { 2487 struct mptcp_sock *msk = mptcp_sk(sk); 2488 struct sock *ssk; 2489 2490 pr_debug("msk=%p", msk); 2491 2492 /* @@ the meaning of setsockopt() when the socket is connected and 2493 * there are multiple subflows is not yet defined. It is up to the 2494 * MPTCP-level socket to configure the subflows until the subflow 2495 * is in TCP fallback, when socket options are passed through 2496 * to the one remaining subflow. 2497 */ 2498 lock_sock(sk); 2499 ssk = __mptcp_tcp_fallback(msk); 2500 release_sock(sk); 2501 if (ssk) 2502 return tcp_getsockopt(ssk, level, optname, optval, option); 2503 2504 return -EOPNOTSUPP; 2505 } 2506 2507 #define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ 2508 TCPF_WRITE_TIMER_DEFERRED) 2509 2510 /* this is very alike tcp_release_cb() but we must handle differently a 2511 * different set of events 2512 */ 2513 static void mptcp_release_cb(struct sock *sk) 2514 { 2515 unsigned long flags, nflags; 2516 2517 do { 2518 flags = sk->sk_tsq_flags; 2519 if (!(flags & MPTCP_DEFERRED_ALL)) 2520 return; 2521 nflags = flags & ~MPTCP_DEFERRED_ALL; 2522 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); 2523 2524 sock_release_ownership(sk); 2525 2526 if (flags & TCPF_DELACK_TIMER_DEFERRED) { 2527 struct mptcp_sock *msk = mptcp_sk(sk); 2528 struct sock *ssk; 2529 2530 ssk = mptcp_subflow_recv_lookup(msk); 2531 if (!ssk || sk->sk_state == TCP_CLOSE || 2532 !schedule_work(&msk->work)) 2533 __sock_put(sk); 2534 } 2535 2536 if (flags & TCPF_WRITE_TIMER_DEFERRED) { 2537 mptcp_retransmit_handler(sk); 2538 __sock_put(sk); 2539 } 2540 } 2541 2542 static int mptcp_hash(struct sock *sk) 2543 { 2544 /* should never be called, 2545 * we hash the TCP subflows not the master socket 2546 */ 2547 WARN_ON_ONCE(1); 2548 return 0; 2549 } 2550 2551 static void mptcp_unhash(struct sock *sk) 2552 { 2553 /* called from sk_common_release(), but nothing to do here */ 2554 } 2555 2556 static int mptcp_get_port(struct sock *sk, unsigned short snum) 2557 { 2558 struct mptcp_sock *msk = mptcp_sk(sk); 2559 struct socket *ssock; 2560 2561 ssock = __mptcp_nmpc_socket(msk); 2562 pr_debug("msk=%p, subflow=%p", msk, ssock); 2563 if (WARN_ON_ONCE(!ssock)) 2564 return -EINVAL; 2565 2566 return inet_csk_get_port(ssock->sk, snum); 2567 } 2568 2569 void mptcp_finish_connect(struct sock *ssk) 2570 { 2571 struct mptcp_subflow_context *subflow; 2572 struct mptcp_sock *msk; 2573 struct sock *sk; 2574 u64 ack_seq; 2575 2576 subflow = mptcp_subflow_ctx(ssk); 2577 sk = subflow->conn; 2578 msk = mptcp_sk(sk); 2579 2580 pr_debug("msk=%p, token=%u", sk, subflow->token); 2581 2582 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); 2583 ack_seq++; 2584 subflow->map_seq = ack_seq; 2585 subflow->map_subflow_seq = 1; 2586 2587 /* the socket is not connected yet, no msk/subflow ops can access/race 2588 * accessing the field below 2589 */ 2590 WRITE_ONCE(msk->remote_key, subflow->remote_key); 2591 WRITE_ONCE(msk->local_key, subflow->local_key); 2592 WRITE_ONCE(msk->write_seq, subflow->idsn + 1); 2593 WRITE_ONCE(msk->snd_nxt, msk->write_seq); 2594 WRITE_ONCE(msk->ack_seq, ack_seq); 
void mptcp_finish_connect(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk;
	struct sock *sk;
	u64 ack_seq;

	subflow = mptcp_subflow_ctx(ssk);
	sk = subflow->conn;
	msk = mptcp_sk(sk);

	pr_debug("msk=%p, token=%u", sk, subflow->token);

	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
	ack_seq++;
	subflow->map_seq = ack_seq;
	subflow->map_subflow_seq = 1;

	/* the socket is not connected yet, no msk/subflow ops can access or
	 * race with the writes below
	 */
	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->local_key, subflow->local_key);
	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
	WRITE_ONCE(msk->snd_nxt, msk->write_seq);
	WRITE_ONCE(msk->ack_seq, ack_seq);
	WRITE_ONCE(msk->can_ack, 1);
	atomic64_set(&msk->snd_una, msk->write_seq);

	mptcp_pm_new_connection(msk, 0);

	mptcp_rcv_space_init(msk, ssk);
}

static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	rcu_assign_pointer(sk->sk_wq, &parent->wq);
	sk_set_socket(sk, parent);
	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	write_unlock_bh(&sk->sk_callback_lock);
}

bool mptcp_finish_join(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *parent = (void *)msk;
	struct socket *parent_sock;
	bool ret;

	pr_debug("msk=%p, subflow=%p", msk, subflow);

	/* mptcp socket already closing? */
	if (!mptcp_is_fully_established(parent))
		return false;

	if (!msk->pm.server_side)
		return true;

	if (!mptcp_pm_allow_new_subflow(msk))
		return false;

	/* active connections are already on conn_list, and we can't acquire
	 * msk lock here.
	 * use the join list lock as synchronization point and double-check
	 * msk status to avoid racing with __mptcp_destroy_sock()
	 */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
		list_add_tail(&subflow->node, &msk->join_list);
		sock_hold(ssk);
	}
	spin_unlock_bh(&msk->join_list_lock);
	if (!ret)
		return false;

	/* attach to msk socket only after we are sure it will deal with us
	 * at close time
	 */
	parent_sock = READ_ONCE(parent->sk_socket);
	if (parent_sock && !ssk->sk_socket)
		mptcp_sock_graft(ssk, parent_sock);
	subflow->map_seq = READ_ONCE(msk->ack_seq);
	return true;
}

static struct proto mptcp_prot = {
	.name		= "MPTCP",
	.owner		= THIS_MODULE,
	.init		= mptcp_init_sock,
	.disconnect	= mptcp_disconnect,
	.close		= mptcp_close,
	.accept		= mptcp_accept,
	.setsockopt	= mptcp_setsockopt,
	.getsockopt	= mptcp_getsockopt,
	.shutdown	= tcp_shutdown,
	.destroy	= mptcp_destroy,
	.sendmsg	= mptcp_sendmsg,
	.recvmsg	= mptcp_recvmsg,
	.release_cb	= mptcp_release_cb,
	.hash		= mptcp_hash,
	.unhash		= mptcp_unhash,
	.get_port	= mptcp_get_port,
	.sockets_allocated	= &mptcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.sysctl_mem	= sysctl_tcp_mem,
	.obj_size	= sizeof(struct mptcp_sock),
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
	.no_autobind	= true,
};

static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	err = ssock->ops->bind(ssock, uaddr, addr_len);
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

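/* give up on MPTCP before the SYN is even sent: clear the MP_CAPABLE
 * request on the subflow and mark the msk as fallen back to plain TCP
 */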
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
					 struct mptcp_subflow_context *subflow)
{
	subflow->request_mptcp = 0;
	__mptcp_do_fallback(msk);
}

static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
				int addr_len, int flags)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	if (sock->state != SS_UNCONNECTED && msk->subflow) {
		/* pending connection or invalid state, let existing subflow
		 * cope with that
		 */
		ssock = msk->subflow;
		goto do_connect;
	}

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_SYN_SENT);
	subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
		mptcp_subflow_early_fallback(msk, subflow);
#endif
	if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
		mptcp_subflow_early_fallback(msk, subflow);

do_connect:
	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
	sock->state = ssock->state;

	/* on successful connect, the msk state will be moved to established by
	 * subflow_finish_connect()
	 */
	if (!err || err == -EINPROGRESS)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);
	else
		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_listen(struct socket *sock, int backlog)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_LISTEN);
	sock_set_flag(sock->sk, SOCK_RCU_FREE);

	err = ssock->ops->listen(ssock, backlog);
	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	if (sock->sk->sk_state != TCP_LISTEN)
		goto unlock_fail;

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock)
		goto unlock_fail;

	clear_bit(MPTCP_DATA_READY, &msk->flags);
	sock_hold(ssock->sk);
	release_sock(sock->sk);

	err = ssock->ops->accept(sock, newsock, flags, kern);
	if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
		struct mptcp_sock *msk = mptcp_sk(newsock->sk);
		struct mptcp_subflow_context *subflow;

		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
2824 */ 2825 __mptcp_flush_join_list(msk); 2826 mptcp_for_each_subflow(msk, subflow) { 2827 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2828 2829 if (!ssk->sk_socket) 2830 mptcp_sock_graft(ssk, newsock); 2831 } 2832 } 2833 2834 if (inet_csk_listen_poll(ssock->sk)) 2835 set_bit(MPTCP_DATA_READY, &msk->flags); 2836 sock_put(ssock->sk); 2837 return err; 2838 2839 unlock_fail: 2840 release_sock(sock->sk); 2841 return -EINVAL; 2842 } 2843 2844 static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 2845 { 2846 return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 2847 0; 2848 } 2849 2850 static bool __mptcp_check_writeable(struct mptcp_sock *msk) 2851 { 2852 struct sock *sk = (struct sock *)msk; 2853 bool mptcp_writable; 2854 2855 mptcp_clean_una(sk); 2856 mptcp_writable = sk_stream_is_writeable(sk); 2857 if (!mptcp_writable) 2858 mptcp_nospace(msk); 2859 2860 return mptcp_writable; 2861 } 2862 2863 static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) 2864 { 2865 struct sock *sk = (struct sock *)msk; 2866 __poll_t ret = 0; 2867 bool slow; 2868 2869 if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) 2870 return 0; 2871 2872 if (sk_stream_is_writeable(sk)) 2873 return EPOLLOUT | EPOLLWRNORM; 2874 2875 slow = lock_sock_fast(sk); 2876 if (__mptcp_check_writeable(msk)) 2877 ret = EPOLLOUT | EPOLLWRNORM; 2878 2879 unlock_sock_fast(sk, slow); 2880 return ret; 2881 } 2882 2883 static __poll_t mptcp_poll(struct file *file, struct socket *sock, 2884 struct poll_table_struct *wait) 2885 { 2886 struct sock *sk = sock->sk; 2887 struct mptcp_sock *msk; 2888 __poll_t mask = 0; 2889 int state; 2890 2891 msk = mptcp_sk(sk); 2892 sock_poll_wait(file, sock, wait); 2893 2894 state = inet_sk_state_load(sk); 2895 pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); 2896 if (state == TCP_LISTEN) 2897 return mptcp_check_readable(msk); 2898 2899 if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { 2900 mask |= mptcp_check_readable(msk); 2901 mask |= mptcp_check_writeable(msk); 2902 } 2903 if (sk->sk_shutdown & RCV_SHUTDOWN) 2904 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 2905 2906 return mask; 2907 } 2908 2909 static int mptcp_shutdown(struct socket *sock, int how) 2910 { 2911 struct mptcp_sock *msk = mptcp_sk(sock->sk); 2912 struct sock *sk = sock->sk; 2913 int ret = 0; 2914 2915 pr_debug("sk=%p, how=%d", msk, how); 2916 2917 lock_sock(sk); 2918 2919 how++; 2920 if ((how & ~SHUTDOWN_MASK) || !how) { 2921 ret = -EINVAL; 2922 goto out_unlock; 2923 } 2924 2925 if (sock->state == SS_CONNECTING) { 2926 if ((1 << sk->sk_state) & 2927 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) 2928 sock->state = SS_DISCONNECTING; 2929 else 2930 sock->state = SS_CONNECTED; 2931 } 2932 2933 sk->sk_shutdown |= how; 2934 if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) 2935 __mptcp_wr_shutdown(sk); 2936 2937 /* Wake up anyone sleeping in poll. 
static const struct proto_ops mptcp_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
};

static struct inet_protosw mptcp_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_prot,
	.ops		= &mptcp_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

void __init mptcp_proto_init(void)
{
	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
		panic("Failed to allocate MPTCP pcpu counter\n");

	mptcp_subflow_init();
	mptcp_pm_init();
	mptcp_token_init();

	if (proto_register(&mptcp_prot, 1) != 0)
		panic("Failed to register MPTCP proto.\n");

	inet_register_protosw(&mptcp_protosw);

	BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static const struct proto_ops mptcp_v6_stream_ops = {
	.family		   = PF_INET6,
	.owner		   = THIS_MODULE,
	.release	   = inet6_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet6_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet6_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet6_sendmsg,
	.recvmsg	   = inet6_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
#endif
};

static struct proto mptcp_v6_prot;

static void mptcp_v6_destroy(struct sock *sk)
{
	mptcp_destroy(sk);
	inet6_destroy_sock(sk);
}

static struct inet_protosw mptcp_v6_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_v6_prot,
	.ops		= &mptcp_v6_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

int __init mptcp_proto_v6_init(void)
{
	int err;

	mptcp_v6_prot = mptcp_prot;
	strcpy(mptcp_v6_prot.name, "MPTCPv6");
	mptcp_v6_prot.slab = NULL;
	mptcp_v6_prot.destroy = mptcp_v6_destroy;
	mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);

	err = proto_register(&mptcp_v6_prot, 1);
	if (err)
		return err;

	err = inet6_register_protosw(&mptcp_v6_protosw);
	if (err)
		proto_unregister(&mptcp_v6_prot);

	return err;
}
#endif