// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

#define MPTCP_SAME_STATE TCP_MAX_STATES

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

struct mptcp_skb_cb {
	u32 offset;
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))

static struct percpu_counter mptcp_sockets_allocated;

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
	if (!msk->subflow || READ_ONCE(msk->can_ack))
		return NULL;

	return msk->subflow;
}

static bool mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		sock->ops = &inet6_stream_ops;
		return true;
#endif
	}

	return false;
}

static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);

	if (likely(!__mptcp_check_fallback(msk)))
		return NULL;

	return msk->first;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return err;

	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	subflow->request_mptcp = 1;

	/* accept() will wait on the first subflow sk_wq, and we always wake up
	 * via msk->sk_socket
	 */
	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

	return 0;
}

static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb,
			     unsigned int offset, size_t copy_len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_orphan(skb);
	msk->ack_seq += copy_len;

	tail = skb_peek_tail(&sk->sk_receive_queue);
	if (offset == 0 && tail) {
		bool fragstolen;
		int delta;

		if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
			kfree_skb_partial(skb, fragstolen);
			atomic_add(delta, &sk->sk_rmem_alloc);
			sk_mem_charge(sk, delta);
			return;
		}
	}

	skb_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	MPTCP_SKB_CB(skb)->offset = offset;
}
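
/* Receive path overview: in-sequence skbs are detached from the subflow
 * receive queue and appended to the msk-level receive queue, coalescing into
 * the tail skb whenever possible; otherwise the start of the not-yet-read
 * data inside the skb is tracked via MPTCP_SKB_CB(skb)->offset and the
 * memory accounting is transferred to the msk.
 */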

/* both sockets must be locked */
static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
				    struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);

	/* revalidate data sequence number.
	 *
	 * mptcp_subflow_data_available() is usually called
	 * without msk lock.  It's unlikely (but possible)
	 * that msk->ack_seq has been advanced since the last
	 * call found in-sequence data.
	 */
	if (likely(dsn == msk->ack_seq))
		return true;

	subflow->data_avail = 0;
	return mptcp_subflow_data_available(ssk);
}

static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	bool done = false;

	if (!mptcp_subflow_dsn_valid(msk, ssk)) {
		*bytes = 0;
		return false;
	}

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);

		if (rcvbuf > sk->sk_rcvbuf)
			sk->sk_rcvbuf = rcvbuf;
	}

	tp = tcp_sk(ssk);
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb)
			break;

		if (__mptcp_check_fallback(msk)) {
			/* if we are running under the workqueue, TCP could have
			 * collapsed skbs between dummy map creation and now,
			 * so be sure to adjust the map size.
			 */
			map_remaining = skb->len;
			subflow->map_data_len = skb->len;
		}

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			__mptcp_move_skb(msk, ssk, skb, offset, len);
			seq += len;
			moved += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
			done = true;
			break;
		}
	} while (more_data_avail);

	*bytes = moved;

	return done;
}
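
/* "done" above means the caller should stop pulling from this subflow for
 * now: either a TCP-level FIN was consumed, urgent data was found, or the
 * msk receive buffer limit has been reached; otherwise the loop keeps
 * draining while the subflow reports more in-sequence data.
 */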

/* In most cases we will be able to lock the mptcp socket.  If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	if (READ_ONCE(sk->sk_lock.owned))
		return false;

	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
		return false;

	/* must re-check after taking the lock */
	if (!READ_ONCE(sk->sk_lock.owned))
		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);

	spin_unlock_bh(&sk->sk_lock.slock);

	return moved > 0;
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	set_bit(MPTCP_DATA_READY, &msk->flags);

	if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
	    move_skbs_to_msk(msk, ssk))
		goto wake;

	/* don't schedule if mptcp sk is (still) over limit */
	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
		goto wake;

	/* mptcp socket is owned, release_cb should retry */
	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
			      &sk->sk_tsq_flags)) {
		sock_hold(sk);

		/* need to try again, it's possible release_cb() has already
		 * been called after the test_and_set_bit() above.
		 */
		move_skbs_to_msk(msk, ssk);
	}
wake:
	sk->sk_data_ready(sk);
}

static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}

static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
	long tout = ssk && inet_csk(ssk)->icsk_pending ?
				      inet_csk(ssk)->icsk_timeout - jiffies : 0;

	if (tout <= 0)
		tout = mptcp_sk(sk)->timer_ival;
	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
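
/* Example of the timeout selection above: if the subflow has a pending icsk
 * timer due to fire in, say, 40ms, the MPTCP-level timer mirrors that
 * remaining time; if no subflow timer is pending, the previous timer_ival is
 * kept, and TCP_RTO_MIN is used when that too is still unset.
 */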

static bool mptcp_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;

	/* should never be called with mptcp level timer cleared */
	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
	if (WARN_ON_ONCE(!tout))
		tout = TCP_RTO_MIN;
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}

void mptcp_data_acked(struct sock *sk)
{
	mptcp_reset_timer(sk);

	if (!sk_stream_is_writeable(sk) &&
	    schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
}

void mptcp_subflow_eof(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
	    schedule_work(&msk->work))
		sock_hold(sk);
}

static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int receivers = 0;

	mptcp_for_each_subflow(msk, subflow)
		receivers += !subflow->rx_eof;

	if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* hopefully temporary hack: propagate shutdown status
		 * to msk, when all subflows agree on it
		 */
		sk->sk_shutdown |= RCV_SHUTDOWN;

		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);
		sk->sk_data_ready(sk);
	}
}

static void mptcp_stop_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	mptcp_sk(sk)->timer_ival = 0;
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
	const struct sock *sk = (const struct sock *)msk;

	if (!msk->cached_ext)
		msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);

	return !!msk->cached_ext;
}

static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;

	sock_owned_by_me(sk);

	mptcp_for_each_subflow(msk, subflow) {
		if (subflow->data_avail)
			return mptcp_subflow_tcp_sock(subflow);
	}

	return NULL;
}

static bool mptcp_skb_can_collapse_to(u64 write_seq,
				      const struct sk_buff *skb,
				      const struct mptcp_ext *mpext)
{
	if (!tcp_skb_can_collapse_to(skb))
		return false;

	/* can collapse only if MPTCP level sequence is in order */
	return mpext && mpext->data_seq + mpext->data_len == write_seq;
}

static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
				       const struct page_frag *pfrag,
				       const struct mptcp_data_frag *df)
{
	return df && pfrag->page == df->page &&
		df->data_seq + df->data_len == msk->write_seq;
}

static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_mem_uncharge(sk, len);
	sk_wmem_queued_add(sk, -len);
}

static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
	int len = dfrag->data_len + dfrag->overhead;

	list_del(&dfrag->list);
	dfrag_uncharge(sk, len);
	put_page(dfrag->page);
}
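
/* Data queued at the MPTCP level is tracked via struct mptcp_data_frag
 * entries on msk->rtx_queue, carved out of the socket page frag directly in
 * front of the data they describe:
 *
 *	[ pad to sizeof(long) | struct mptcp_data_frag | data... ]
 *
 * dfrag->offset points at the first data byte and dfrag->overhead accounts
 * for the pad plus the header (see mptcp_carve_data_frag() below).  A dfrag
 * is released via dfrag_clear() once the peer has acked it at the MPTCP
 * level; a partially acked head dfrag is trimmed in place by
 * mptcp_clean_una().
 */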

static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	bool cleaned = false;
	u64 snd_una;

	/* on fallback we just need to ignore snd_una, as this is really
	 * plain TCP
	 */
	if (__mptcp_check_fallback(msk))
		atomic64_set(&msk->snd_una, msk->write_seq);
	snd_una = atomic64_read(&msk->snd_una);

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		u64 delta = dfrag->data_seq + dfrag->data_len - snd_una;

		dfrag->data_seq += delta;
		dfrag->data_len -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

	if (cleaned) {
		sk_mem_reclaim_partial(sk);

		/* Only wake up writers if a subflow is ready */
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
			sk_stream_write_space(sk);
	}
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
					pfrag, sk->sk_allocation)))
		return true;

	sk->sk_prot->enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}

static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
		      int orig_offset)
{
	int offset = ALIGN(orig_offset, sizeof(long));
	struct mptcp_data_frag *dfrag;

	dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
	dfrag->data_len = 0;
	dfrag->data_seq = msk->write_seq;
	dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
	dfrag->page = pfrag->page;

	return dfrag;
}

static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct msghdr *msg, struct mptcp_data_frag *dfrag,
			      long *timeo, int *pmss_now,
			      int *ps_goal)
{
	int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
	bool dfrag_collapsed, can_collapse = false;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_ext *mpext = NULL;
	bool retransmission = !!dfrag;
	struct sk_buff *skb, *tail;
	struct page_frag *pfrag;
	struct page *page;
	u64 *write_seq;
	size_t psize;

	/* use the mptcp page cache so that we can easily move the data
	 * from one substream to another, but do per-subflow memory accounting.
	 * Note: pfrag is used only when !retransmission, but the compiler is
	 * fooled into a warning if we don't init it here.
	 */
	pfrag = sk_page_frag(sk);
	if (!retransmission) {
		write_seq = &msk->write_seq;
		page = pfrag->page;
	} else {
		write_seq = &dfrag->data_seq;
		page = dfrag->page;
	}

	/* compute copy limit */
	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
	*pmss_now = mss_now;
	*ps_goal = size_goal;
	avail_size = size_goal;
	skb = tcp_write_queue_tail(ssk);
	if (skb) {
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);

		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most one new skb.
		 * Explicitly tell TCP internals to avoid collapsing on later
		 * queue management operations, to avoid breaking the ext <->
		 * SSN association set here.
		 */
		can_collapse = (size_goal - skb->len > 0) &&
			       mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
		if (!can_collapse)
			TCP_SKB_CB(skb)->eor = 1;
		else
			avail_size = size_goal - skb->len;
	}

	if (!retransmission) {
		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_rtx_tail(sk);
		offset = pfrag->offset;
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
			offset = dfrag->offset;
			frag_truesize = dfrag->overhead;
		}
		psize = min_t(size_t, pfrag->size - offset, avail_size);

		/* Copy to page */
		pr_debug("left=%zu", msg_data_left(msg));
		psize = copy_page_from_iter(pfrag->page, offset,
					    min_t(size_t, msg_data_left(msg),
						  psize),
					    &msg->msg_iter);
		pr_debug("left=%zu", msg_data_left(msg));
		if (!psize)
			return -EINVAL;

		if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
			return -ENOMEM;
	} else {
		offset = dfrag->offset;
		psize = min_t(size_t, dfrag->data_len, avail_size);
	}

	/* tell the TCP stack to delay the push so that we can safely
	 * access the skb after the sendpages call
	 */
	ret = do_tcp_sendpages(ssk, page, offset, psize,
			       msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
	if (ret <= 0)
		return ret;

	frag_truesize += ret;
	if (!retransmission) {
		if (unlikely(ret < psize))
			iov_iter_revert(&msg->msg_iter, psize - ret);

		/* send successful, keep track of sent data for mptcp-level
		 * retransmission
		 */
		dfrag->data_len += ret;
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			sk_wmem_queued_add(sk, frag_truesize);
		} else {
			sk_wmem_queued_add(sk, ret);
		}

		/* charge data on mptcp rtx queue to the master socket.
		 * Note: we charge such data both to sk and ssk
		 */
		sk->sk_forward_alloc -= frag_truesize;
	}

	/* if the tail skb extension is still the cached one, collapsing
	 * really happened.
	 * Note: we can't check for 'same skb' as the sk_buff hdr on the tail
	 * can be transmitted, freed and re-allocated by the
	 * do_tcp_sendpages() call
	 */
	tail = tcp_write_queue_tail(ssk);
	if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
		WARN_ON_ONCE(!can_collapse);
		mpext->data_len += ret;
		goto out;
	}

	skb = tcp_write_queue_tail(ssk);
	mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
	msk->cached_ext = NULL;

	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = *write_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = ret;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

out:
	if (!retransmission)
		pfrag->offset += frag_truesize;
	*write_seq += ret;
	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

	return ret;
}

static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
{
	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

	/* enables sk->write_space() callbacks */
	set_bit(SOCK_NOSPACE, &sock->flags);
}

static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	if (!mptcp_ext_cache_refill(msk))
		return NULL;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!sk_stream_memory_free(ssk)) {
			struct socket *sock = ssk->sk_socket;

			if (sock)
				mptcp_nospace(msk, sock);

			return NULL;
		}

		if (subflow->backup) {
			if (!backup)
				backup = ssk;

			continue;
		}

		return ssk;
	}

	return backup;
}

static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
	struct socket *sock;

	if (likely(sk_stream_is_writeable(ssk)))
		return;

	sock = READ_ONCE(ssk->sk_socket);
	if (sock)
		mptcp_nospace(msk, sock);
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int mss_now = 0, size_goal = 0, ret = 0;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	size_t copied = 0;
	struct sock *ssk;
	bool tx_ok;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	lock_sock(sk);

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto out;
	}

	pfrag = sk_page_frag(sk);
restart:
	mptcp_clean_una(sk);

wait_for_sndbuf:
	__mptcp_flush_join_list(msk);
	ssk = mptcp_subflow_get_send(msk);
	while (!sk_stream_memory_free(sk) ||
	       !ssk ||
	       !mptcp_page_frag_refill(ssk, pfrag)) {
		if (ssk) {
			/* make sure retransmit timer is
			 * running before we wait for memory.
			 *
			 * The retransmit timer might be needed
			 * to make the peer send an up-to-date
			 * MPTCP Ack.
			 */
			mptcp_set_timeout(sk, ssk);
			if (!mptcp_timer_pending(sk))
				mptcp_reset_timer(sk);
		}

		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;

		mptcp_clean_una(sk);

		ssk = mptcp_subflow_get_send(msk);
		if (list_empty(&msk->conn_list)) {
			ret = -ENOTCONN;
			goto out;
		}
	}

	pr_debug("conn_list->subflow=%p", ssk);

	lock_sock(ssk);
	tx_ok = msg_data_left(msg);
	while (tx_ok) {
		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
					 &size_goal);
		if (ret < 0) {
			if (ret == -EAGAIN && timeo > 0) {
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto restart;
			}
			break;
		}

		copied += ret;

		tx_ok = msg_data_left(msg);
		if (!tx_ok)
			break;

		if (!sk_stream_memory_free(ssk) ||
		    !mptcp_page_frag_refill(ssk, pfrag) ||
		    !mptcp_ext_cache_refill(msk)) {
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_set_timeout(sk, ssk);
			release_sock(ssk);
			goto restart;
		}

		/* memory is charged to the mptcp level socket as well, i.e.
		 * if msg is very large, the mptcp socket may run out of buffer
		 * space.  mptcp_clean_una() will release data that has
		 * been acked at the mptcp level in the meantime, so there is
		 * a good chance we can continue sending data right away.
		 *
		 * Normally, when the tcp subflow can accept more data, then
		 * so can the MPTCP socket.  However, we need to cope with
		 * peers that might lag behind in their MPTCP-level
		 * acknowledgements, i.e. data might have been acked at
		 * tcp level only.  So, we must also check the MPTCP socket
		 * limits before we send more data.
		 */
		if (unlikely(!sk_stream_memory_free(sk))) {
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_clean_una(sk);
			if (!sk_stream_memory_free(sk)) {
				/* can't send more for now, need to wait for
				 * MPTCP-level ACKs from the peer.
				 *
				 * Wakeup will happen via mptcp_clean_una().
				 */
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto wait_for_sndbuf;
			}
		}
	}

	mptcp_set_timeout(sk, ssk);
	if (copied) {
		ret = copied;
		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

		/* start the timer, if it's not pending */
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
	}

	ssk_check_wmem(msk, ssk);
	release_sock(ssk);
out:
	release_sock(sk);
	return ret;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}

static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}

static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
	unsigned int moved = 0;
	bool done;

	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);

		if (!ssk)
			break;

		lock_sock(ssk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		release_sock(ssk);
	} while (!done);

	return moved > 0;
}

static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int copied = 0;
	int target;
	long timeo;

	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	lock_sock(sk);
	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	__mptcp_flush_join_list(msk);

	while (len > (size_t)copied) {
		int bytes_read;

		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
		if (unlikely(bytes_read < 0)) {
			if (!copied)
				copied = bytes_read;
			goto out_err;
		}

		copied += bytes_read;

		if (skb_queue_empty(&sk->sk_receive_queue) &&
		    __mptcp_move_skbs(msk))
			continue;

		/* only the master socket status is relevant here.  The exit
		 * conditions closely mirror tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
				mptcp_check_for_eof(msk);

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		mptcp_wait_data(sk, &timeo);
	}

	if (skb_queue_empty(&sk->sk_receive_queue)) {
		/* entire backlog drained, clear DATA_READY. */
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might have gotten new data
		 * after last __mptcp_move_skbs() returned false.
		 */
		if (unlikely(__mptcp_move_skbs(msk)))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
		/* data to read but mptcp_wait_data() cleared DATA_READY */
		set_bit(MPTCP_DATA_READY, &msk->flags);
	}
out_err:
	release_sock(sk);
	return copied;
}

static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (atomic64_read(&msk->snd_una) == msk->write_seq) {
		mptcp_stop_timer(sk);
	} else {
		set_bit(MPTCP_WORK_RTX, &msk->flags);
		if (schedule_work(&msk->work))
			sock_hold(sk);
	}
}

static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		mptcp_retransmit_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Find an idle subflow.  Return NULL if there is unacked data at tcp
 * level.
 *
 * A backup subflow is returned only if that is the only kind available.
 */
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		/* still data outstanding at TCP level?  Don't retransmit. */
		if (!tcp_write_queue_empty(ssk))
			return NULL;

		if (subflow->backup) {
			if (!backup)
				backup = ssk;
			continue;
		}

		return ssk;
	}

	return backup;
}
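
/* MPTCP-level retransmission works by re-injecting the dfrags still sitting
 * on msk->rtx_queue: when the MPTCP retransmit timer fires, mptcp_worker()
 * (below) picks an idle subflow via mptcp_subflow_get_retrans() above and
 * pushes the head dfrag through mptcp_sendmsg_frag() again, preferring
 * non-backup subflows.
 */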

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 */
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
			      struct mptcp_subflow_context *subflow,
			      long timeout)
{
	struct socket *sock = READ_ONCE(ssk->sk_socket);

	list_del(&subflow->node);

	if (sock && sock != sk->sk_socket) {
		/* outgoing subflow */
		sock_release(sock);
	} else {
		/* incoming subflow */
		tcp_close(ssk, timeout);
	}
}

static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}

static void mptcp_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
	int orig_len, orig_offset, mss_now = 0, size_goal = 0;
	struct mptcp_data_frag *dfrag;
	u64 orig_write_seq;
	size_t copied = 0;
	struct msghdr msg;
	long timeo = 0;

	lock_sock(sk);
	mptcp_clean_una(sk);
	__mptcp_flush_join_list(msk);
	__mptcp_move_skbs(msk);

	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
		mptcp_check_for_eof(msk);

	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		goto unlock;

	dfrag = mptcp_rtx_head(sk);
	if (!dfrag)
		goto unlock;

	if (!mptcp_ext_cache_refill(msk))
		goto reset_unlock;

	ssk = mptcp_subflow_get_retrans(msk);
	if (!ssk)
		goto reset_unlock;

	lock_sock(ssk);

	msg.msg_flags = MSG_DONTWAIT;
	orig_len = dfrag->data_len;
	orig_offset = dfrag->offset;
	orig_write_seq = dfrag->data_seq;
	while (dfrag->data_len > 0) {
		int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
					     &mss_now, &size_goal);
		if (ret < 0)
			break;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
		copied += ret;
		dfrag->data_len -= ret;
		dfrag->offset += ret;

		if (!mptcp_ext_cache_refill(msk))
			break;
	}
	if (copied)
		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

	dfrag->data_seq = orig_write_seq;
	dfrag->offset = orig_offset;
	dfrag->data_len = orig_len;

	mptcp_set_timeout(sk, ssk);
	release_sock(ssk);

reset_unlock:
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);

unlock:
	release_sock(sk);
	sock_put(sk);
}

static int __mptcp_init_sock(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	spin_lock_init(&msk->join_list_lock);

	INIT_LIST_HEAD(&msk->conn_list);
	INIT_LIST_HEAD(&msk->join_list);
	INIT_LIST_HEAD(&msk->rtx_queue);
	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
	INIT_WORK(&msk->work, mptcp_worker);

	msk->first = NULL;
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;

	mptcp_pm_data_init(msk);

	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);

	return 0;
}

static int mptcp_init_sock(struct sock *sk)
{
	struct net *net = sock_net(sk);
	int ret;

	if (!mptcp_is_enabled(net))
		return -ENOPROTOOPT;

	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
		return -ENOMEM;

	ret = __mptcp_init_sock(sk);
	if (ret)
		return ret;

	ret = __mptcp_socket_create(mptcp_sk(sk));
	if (ret)
		return ret;

	sk_sockets_allocated_inc(sk);
	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];

	return 0;
}
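
/* Every MPTCP socket starts out with a single in-kernel "first" TCP subflow,
 * created by __mptcp_socket_create() above; bind()/connect()/listen() are
 * forwarded to it (via __mptcp_nmpc_socket()) until the MP_CAPABLE handshake
 * settles the connection as MPTCP or plain TCP.
 */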

static void __mptcp_clear_xmit(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;

	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
		dfrag_clear(sk, dfrag);
}

static void mptcp_cancel_work(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (cancel_work_sync(&msk->work))
		sock_put(sk);
}

static void mptcp_subflow_shutdown(struct sock *ssk, int how,
				   bool data_fin_tx_enable, u64 data_fin_tx_seq)
{
	lock_sock(ssk);

	switch (ssk->sk_state) {
	case TCP_LISTEN:
		if (!(how & RCV_SHUTDOWN))
			break;
		/* fall through */
	case TCP_SYN_SENT:
		tcp_disconnect(ssk, O_NONBLOCK);
		break;
	default:
		if (data_fin_tx_enable) {
			struct mptcp_subflow_context *subflow;

			subflow = mptcp_subflow_ctx(ssk);
			subflow->data_fin_tx_seq = data_fin_tx_seq;
			subflow->data_fin_tx_enable = 1;
		}

		ssk->sk_shutdown |= how;
		tcp_shutdown(ssk, how);
		break;
	}

	release_sock(ssk);
}

/* Acquires the msk socket lock and releases it before returning;
 * the socket is then freed via sk_common_release().
 */
static void mptcp_close(struct sock *sk, long timeout)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);
	u64 data_fin_tx_seq;

	lock_sock(sk);

	inet_sk_state_store(sk, TCP_CLOSE);

	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
	 */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
	list_splice_init(&msk->conn_list, &conn_list);

	data_fin_tx_seq = msk->write_seq;

	__mptcp_clear_xmit(sk);

	release_sock(sk);

	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		subflow->data_fin_tx_seq = data_fin_tx_seq;
		subflow->data_fin_tx_enable = 1;
		__mptcp_close_ssk(sk, ssk, subflow, timeout);
	}

	mptcp_cancel_work(sk);
	mptcp_pm_close(msk);

	__skb_queue_purge(&sk->sk_receive_queue);

	sk_common_release(sk);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
	struct ipv6_pinfo *msk6 = inet6_sk(msk);

	msk->sk_v6_daddr = ssk->sk_v6_daddr;
	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

	if (msk6 && ssk6) {
		msk6->saddr = ssk6->saddr;
		msk6->flow_label = ssk6->flow_label;
	}
#endif

	inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
	inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
	inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
	inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
	inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

static int mptcp_disconnect(struct sock *sk, int flags)
{
	/* Should never be called.
	 * inet_stream_connect() calls ->disconnect, but that
	 * refers to the subflow socket, not the mptcp one.
	 */
	WARN_ON_ONCE(1);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif

struct sock *mptcp_sk_clone(const struct sock *sk,
			    const struct mptcp_options_received *mp_opt,
			    struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
	struct mptcp_sock *msk;
	u64 ack_seq;

	if (!nsk)
		return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (nsk->sk_family == AF_INET6)
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;

	msk->write_seq = subflow_req->idsn + 1;
	atomic64_set(&msk->snd_una, msk->write_seq);
	if (mp_opt->mp_capable) {
		msk->can_ack = true;
		msk->remote_key = mp_opt->sndr_key;
		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
		ack_seq++;
		msk->ack_seq = ack_seq;
	}

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);
	bh_unlock_sock(nsk);

	/* keep a single reference */
	__sock_put(nsk);
	return nsk;
}

static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
				 bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *listener;
	struct sock *newsk;

	listener = __mptcp_nmpc_socket(msk);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
	}

	pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
	newsk = inet_csk_accept(listener->sk, flags, err, kern);
	if (!newsk)
		return NULL;

	pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
	if (sk_is_mptcp(newsk)) {
		struct mptcp_subflow_context *subflow;
		struct sock *new_mptcp_sock;
		struct sock *ssk = newsk;

		subflow = mptcp_subflow_ctx(newsk);
		new_mptcp_sock = subflow->conn;

		/* is_mptcp should be false if subflow->conn is missing, see
		 * subflow_syn_recv_sock()
		 */
		if (WARN_ON_ONCE(!new_mptcp_sock)) {
			tcp_sk(newsk)->is_mptcp = 0;
			return newsk;
		}

		/* acquire the 2nd reference for the owning socket */
		sock_hold(new_mptcp_sock);

		local_bh_disable();
		bh_lock_sock(new_mptcp_sock);
		msk = mptcp_sk(new_mptcp_sock);
		msk->first = newsk;

		newsk = new_mptcp_sock;
		mptcp_copy_inaddrs(newsk, ssk);
		list_add(&subflow->node, &msk->conn_list);
		inet_sk_state_store(newsk, TCP_ESTABLISHED);

		bh_unlock_sock(new_mptcp_sock);

		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
		local_bh_enable();
	} else {
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

	return newsk;
}

static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_token_destroy(msk);
	if (msk->cached_ext)
		__skb_ext_put(msk->cached_ext);

	sk_sockets_allocated_dec(sk);
}

static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined.  It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_setsockopt(ssk, level, optname, optval, optlen);

	return -EOPNOTSUPP;
}

static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of getsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined.  It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_getsockopt(ssk, level, optname, optval, option);

	return -EOPNOTSUPP;
}

#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
			    TCPF_WRITE_TIMER_DEFERRED)

/* this closely mirrors tcp_release_cb(), but we must handle a different
 * set of events
 */
static void mptcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & MPTCP_DEFERRED_ALL))
			return;
		nflags = flags & ~MPTCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	sock_release_ownership(sk);

	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		struct mptcp_sock *msk = mptcp_sk(sk);
		struct sock *ssk;

		ssk = mptcp_subflow_recv_lookup(msk);
		if (!ssk || !schedule_work(&msk->work))
			__sock_put(sk);
	}

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		mptcp_retransmit_handler(sk);
		__sock_put(sk);
	}
}

static int mptcp_hash(struct sock *sk)
{
	/* should never be called,
	 * we hash the TCP subflows not the master socket
	 */
	WARN_ON_ONCE(1);
	return 0;
}

static void mptcp_unhash(struct sock *sk)
{
	/* called from sk_common_release(), but nothing to do here */
}

static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	ssock = __mptcp_nmpc_socket(msk);
	pr_debug("msk=%p, subflow=%p", msk, ssock);
	if (WARN_ON_ONCE(!ssock))
		return -EINVAL;

	return inet_csk_get_port(ssock->sk, snum);
}
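
/* Initial data sequence numbers are not carried on the wire: each side
 * derives them from the peer's key via mptcp_crypto_key_sha(), and the first
 * data byte uses IDSN + 1 (hence the ack_seq++ below), as described in
 * RFC 8684.
 */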

void mptcp_finish_connect(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk;
	struct sock *sk;
	u64 ack_seq;

	subflow = mptcp_subflow_ctx(ssk);
	sk = subflow->conn;
	msk = mptcp_sk(sk);

	pr_debug("msk=%p, token=%u", sk, subflow->token);

	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
	ack_seq++;
	subflow->map_seq = ack_seq;
	subflow->map_subflow_seq = 1;
	subflow->rel_write_seq = 1;

	/* the socket is not connected yet, no msk/subflow ops can access or
	 * race on the fields below
	 */
	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->local_key, subflow->local_key);
	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
	WRITE_ONCE(msk->ack_seq, ack_seq);
	WRITE_ONCE(msk->can_ack, 1);
	atomic64_set(&msk->snd_una, msk->write_seq);

	mptcp_pm_new_connection(msk, 0);
}

static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	rcu_assign_pointer(sk->sk_wq, &parent->wq);
	sk_set_socket(sk, parent);
	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	write_unlock_bh(&sk->sk_callback_lock);
}

bool mptcp_finish_join(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *parent = (void *)msk;
	struct socket *parent_sock;
	bool ret;

	pr_debug("msk=%p, subflow=%p", msk, subflow);

	/* mptcp socket already closing? */
	if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
		return false;

	if (!msk->pm.server_side)
		return true;

	if (!mptcp_pm_allow_new_subflow(msk))
		return false;

	/* active connections are already on conn_list, and we can't acquire
	 * the msk lock here.
	 * use the join list lock as synchronization point and double-check
	 * msk status to avoid racing with mptcp_close()
	 */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
		list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);
	if (!ret)
		return false;

	/* attach to the msk socket only after we are sure it will deal with
	 * us at close time
	 */
	parent_sock = READ_ONCE(parent->sk_socket);
	if (parent_sock && !sk->sk_socket)
		mptcp_sock_graft(sk, parent_sock);
	subflow->map_seq = msk->ack_seq;
	return true;
}

static bool mptcp_memory_free(const struct sock *sk, int wake)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
}
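
/* ->stream_memory_free() is what sk_stream_is_writeable() and hence poll()
 * consult: for the msk it is driven by the MPTCP_SEND_SPACE flag (cleared in
 * mptcp_nospace() when a subflow runs out of send buffer) rather than by the
 * msk's own write queue.
 */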

static struct proto mptcp_prot = {
	.name		= "MPTCP",
	.owner		= THIS_MODULE,
	.init		= mptcp_init_sock,
	.disconnect	= mptcp_disconnect,
	.close		= mptcp_close,
	.accept		= mptcp_accept,
	.setsockopt	= mptcp_setsockopt,
	.getsockopt	= mptcp_getsockopt,
	.shutdown	= tcp_shutdown,
	.destroy	= mptcp_destroy,
	.sendmsg	= mptcp_sendmsg,
	.recvmsg	= mptcp_recvmsg,
	.release_cb	= mptcp_release_cb,
	.hash		= mptcp_hash,
	.unhash		= mptcp_unhash,
	.get_port	= mptcp_get_port,
	.sockets_allocated	= &mptcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.stream_memory_free	= mptcp_memory_free,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_mem	= sysctl_tcp_mem,
	.obj_size	= sizeof(struct mptcp_sock),
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
	.no_autobind	= true,
};

static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	err = ssock->ops->bind(ssock, uaddr, addr_len);
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
				int addr_len, int flags)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	if (sock->state != SS_UNCONNECTED && msk->subflow) {
		/* pending connection or invalid state, let existing subflow
		 * cope with that
		 */
		ssock = msk->subflow;
		goto do_connect;
	}

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_SYN_SENT);
	subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
		subflow->request_mptcp = 0;
#endif
	if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
		subflow->request_mptcp = 0;

do_connect:
	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
	sock->state = ssock->state;

	/* on successful connect, the msk state will be moved to established by
	 * subflow_finish_connect()
	 */
	if (!err || err == -EINPROGRESS)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);
	else
		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_listen(struct socket *sock, int backlog)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_LISTEN);
	sock_set_flag(sock->sk, SOCK_RCU_FREE);

	err = ssock->ops->listen(ssock, backlog);
	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	if (sock->sk->sk_state != TCP_LISTEN)
		goto unlock_fail;

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock)
		goto unlock_fail;

	clear_bit(MPTCP_DATA_READY, &msk->flags);
	sock_hold(ssock->sk);
	release_sock(sock->sk);

	err = ssock->ops->accept(sock, newsock, flags, kern);
	if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
		struct mptcp_sock *msk = mptcp_sk(newsock->sk);
		struct mptcp_subflow_context *subflow;

		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
		 */
		__mptcp_flush_join_list(msk);
		list_for_each_entry(subflow, &msk->conn_list, node) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			if (!ssk->sk_socket)
				mptcp_sock_graft(ssk, newsock);
		}
	}

	if (inet_csk_listen_poll(ssock->sk))
		set_bit(MPTCP_DATA_READY, &msk->flags);
	sock_put(ssock->sk);
	return err;

unlock_fail:
	release_sock(sock->sk);
	return -EINVAL;
}

static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
	return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
	       0;
}
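
/* poll() on the msk never looks at subflow receive queues directly: EPOLLIN
 * is driven purely by the MPTCP_DATA_READY flag, which mptcp_data_ready()
 * sets from the subflow data_ready callback and recvmsg() clears once the
 * msk-level queue is fully drained.
 */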

static __poll_t mptcp_poll(struct file *file, struct socket *sock,
			   struct poll_table_struct *wait)
{
	struct sock *sk = sock->sk;
	struct mptcp_sock *msk;
	__poll_t mask = 0;
	int state;

	msk = mptcp_sk(sk);
	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return mptcp_check_readable(msk);

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		if (sk_stream_is_writeable(sk) &&
		    test_bit(MPTCP_SEND_SPACE, &msk->flags))
			mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	return mask;
}

static int mptcp_shutdown(struct socket *sock, int how)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	int ret = 0;

	pr_debug("sk=%p, how=%d", msk, how);

	lock_sock(sock->sk);
	if (how == SHUT_WR || how == SHUT_RDWR)
		inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);

	how++;

	if ((how & ~SHUTDOWN_MASK) || !how) {
		ret = -EINVAL;
		goto out_unlock;
	}

	if (sock->state == SS_CONNECTING) {
		if ((1 << sock->sk->sk_state) &
		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
			sock->state = SS_DISCONNECTING;
		else
			sock->state = SS_CONNECTED;
	}

	__mptcp_flush_join_list(msk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

		mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq);
	}

	/* Wake up anyone sleeping in poll. */
	sock->sk->sk_state_change(sock->sk);

out_unlock:
	release_sock(sock->sk);

	return ret;
}

static const struct proto_ops mptcp_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};

static struct inet_protosw mptcp_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_prot,
	.ops		= &mptcp_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

void __init mptcp_proto_init(void)
{
	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
		panic("Failed to allocate MPTCP pcpu counter\n");

	mptcp_subflow_init();
	mptcp_pm_init();
	mptcp_token_init();

	if (proto_register(&mptcp_prot, 1) != 0)
		panic("Failed to register MPTCP proto.\n");

	inet_register_protosw(&mptcp_protosw);

	BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}
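
/* Usage sketch (illustrative only, not part of this file): userspace opts in
 * to MPTCP explicitly by requesting the protocol registered above, e.g.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
 *
 * falling back to plain TCP transparently if the peer does not support it.
 */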

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static const struct proto_ops mptcp_v6_stream_ops = {
	.family		   = PF_INET6,
	.owner		   = THIS_MODULE,
	.release	   = inet6_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet6_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet6_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet6_sendmsg,
	.recvmsg	   = inet6_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};

static struct proto mptcp_v6_prot;

static void mptcp_v6_destroy(struct sock *sk)
{
	mptcp_destroy(sk);
	inet6_destroy_sock(sk);
}

static struct inet_protosw mptcp_v6_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_v6_prot,
	.ops		= &mptcp_v6_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

int __init mptcp_proto_v6_init(void)
{
	int err;

	mptcp_v6_prot = mptcp_prot;
	strcpy(mptcp_v6_prot.name, "MPTCPv6");
	mptcp_v6_prot.slab = NULL;
	mptcp_v6_prot.destroy = mptcp_v6_destroy;
	mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);

	err = proto_register(&mptcp_v6_prot, 1);
	if (err)
		return err;

	err = inet6_register_protosw(&mptcp_v6_protosw);
	if (err)
		proto_unregister(&mptcp_v6_prot);

	return err;
}
#endif