// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}
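
/* Note on argument ordering: callers generating their own HMAC pass their
 * local key and nonce first, callers validating the peer's HMAC pass the
 * remote key and nonce first, so both ends compute the digest with the HMAC
 * owner's key/nonce leading, as required for MP_JOIN (see RFC 8684). The
 * SYN/ACK carries only the leftmost 64 bits of the digest (thmac), while the
 * third ACK carries the leftmost MPTCPOPT_HMAC_LEN bytes.
 */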

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       READ_ONCE(msk->pm.accept_subflow);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}

static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	return 0;
}

/* Init mptcp request socket.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent.
 */
static int subflow_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int ret;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	ret = __subflow_init_req(req, sk_listener);
	if (ret)
		return 0;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return 0;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err, retries = 4;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;

	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);

		/* Can't fall back to TCP in this case. */
		if (!subflow_req->msk)
			return -EPERM;

		if (unlikely(req->syncookie)) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}

int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int err;

	err = __subflow_init_req(req, sk_listener);
	if (err)
		return err;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable && mp_opt.mp_join)
		return -EINVAL;

	if (mp_opt.mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		if (mptcp_can_accept_new_subflow(subflow_req->msk))
			subflow_req->mp_join = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb);
	return NULL;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb);
	return NULL;
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
	    schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
}
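
/* Note: subflow_finish_connect() below is installed as the subflow's
 * ->sk_rx_dst_set callback (see mptcp_subflow_init()), so it runs while the
 * incoming SYN-ACK is processed; it first chains to the original af_ops
 * callback and then completes the MP_CAPABLE or MP_JOIN handshake for the
 * active-open side.
 */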

static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!mp_opt.mp_capable) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!mp_opt.mp_join)
			goto do_reset;

		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
	}
	return;

do_reset:
	mptcp_subflow_reset(sk);
}

struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will be in
	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_ESTABLISHED) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_destroy_common(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}
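
/* For subflow_syn_recv_sock() below: on an MP_CAPABLE request, a missing ULP
 * context or a failed msk allocation simply degrades the child to plain TCP,
 * while for an MP_JOIN request fallback is fatal and the child is reset, as
 * a joining subflow cannot exist outside an established MPTCP connection.
 */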

static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* record the newly created socket as the first msk
			 * subflow, but don't link it yet into conn_list
			 */
			WRITE_ONCE(mptcp_sk(new_msk)->first, child);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};

static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}
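
/* Example (illustrative): with a previous map at old_seq 0x00000001fffffff0
 * and old_data_len 0x100, a new 32-bit data sequence of 0x000000f1 expands
 * to 0x00000002000000f1: the upper 32 bits are taken from the end of the old
 * mapping, under the assumption above that the new map follows it.
 */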

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
			     subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}
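
/* Subflow-relative sequence numbers: ssn is the TCP sequence of the next
 * byte to read minus ssn_offset, which stores the peer's initial sequence
 * number on this subflow. A DSS mapping with map_subflow_seq=S and
 * map_data_len=L covers ssn values in [S, S + L); e.g. the very first
 * mapping of a subflow typically starts at S == 1, right after the SYN.
 */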

static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkts to the receive
			 * queue, those are the only 0 len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				if (updated && schedule_work(&msk->work))
					sock_hold((struct sock *)msk);

				return MAPPING_DATA_FIN;
			}
		} else {
			u64 data_fin_seq = mpext->data_seq + data_len - 1;

			/* If mpext->data_seq is a 32-bit value, data_fin_seq
			 * must also be limited to 32 bits.
			 */
			if (!mpext->dsn64)
				data_fin_seq &= GENMASK_ULL(31, 0);

			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
				 data_fin_seq, mpext->dsn64);
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}
	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}
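
/* Drop data that is duplicate at the MPTCP level: @limit bytes starting at
 * the current subflow read position are already covered by the msk-level
 * ack_seq. copied_seq is advanced past them and, when that consumes the
 * whole skb, past the FIN's one byte of sequence space as well, so the skb
 * can be freed.
 */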

static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       u64 limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	u32 incr;

	incr = limit >= skb->len ? skb->len + fin : limit;

	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
		 subflow->map_subflow_seq);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
	tcp_sk(ssk)->copied_seq += incr;
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}

static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (!skb_peek(&ssk->sk_receive_queue))
		subflow->data_avail = 0;
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack) {
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			break;
		} else if (after64(ack_seq, old_ack)) {
			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
			break;
		}

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission
		 */
		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	subflow->data_avail = 0;
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}
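
/* subflow_check_data_avail() above leaves data_avail set to
 * MPTCP_SUBFLOW_DATA_AVAIL when the current mapping starts exactly at the
 * msk-level ack_seq, and to MPTCP_SUBFLOW_OOO_DATA when the mapping lies
 * beyond it; data below ack_seq is a duplicate at the MPTCP level and gets
 * dropped via mptcp_subflow_discard_data().
 */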

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = __mptcp_space(sk);
	*full_space = tcp_full_space(sk);
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *ssk)
{
	/* we take action in __mptcp_clean_una() */
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}
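
/* Note: subflow_v6m_specific (set up in mptcp_subflow_init()) handles IPv4
 * traffic carried over an AF_INET6 subflow socket (v4-mapped addresses): it
 * reuses the IPv6 ops but redirects transmit, checksum and header-size
 * handling to the IPv4 helpers. mptcpv6_handle_mapped() below switches a
 * subflow between the two sets of ops, keeping the ops it replaces in
 * subflow->icsk_af_ops, which subflow_finish_connect() chains to.
 */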

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = loc->ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->remote_id = remote_id;
	subflow->request_join = 1;
	subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	mptcp_info2sockaddr(remote, &addr);

	mptcp_add_pending_subflow(msk, subflow);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed_unlink;

	return err;

failed_unlink:
	spin_lock_bh(&msk->join_list_lock);
	list_del(&subflow->node);
	spin_unlock_bh(&msk->join_list_lock);

failed:
	subflow->disposable = 1;
	sock_release(sf);
	return err;
}
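
/* The in-kernel path manager uses __mptcp_subflow_connect() above to open an
 * additional MP_JOIN subflow from @loc toward @remote: the new kernel socket
 * is bound to the local address, queued as a pending subflow of the msk and
 * connected in non-blocking mode; the handshake then completes
 * asynchronously via subflow_finish_connect().
 */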

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);
		sock_put(sk);
	}

	if (release)
		kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->remote_id = subflow_req->remote_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

void __init mptcp_subflow_init(void)
{
	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}