1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #define pr_fmt(fmt) "MPTCP: " fmt 8 9 #include <linux/kernel.h> 10 #include <linux/module.h> 11 #include <linux/netdevice.h> 12 #include <crypto/sha2.h> 13 #include <crypto/utils.h> 14 #include <net/sock.h> 15 #include <net/inet_common.h> 16 #include <net/inet_hashtables.h> 17 #include <net/protocol.h> 18 #include <net/tcp.h> 19 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 20 #include <net/ip6_route.h> 21 #include <net/transp_v6.h> 22 #endif 23 #include <net/mptcp.h> 24 #include <uapi/linux/mptcp.h> 25 #include "protocol.h" 26 #include "mib.h" 27 28 #include <trace/events/mptcp.h> 29 #include <trace/events/sock.h> 30 31 static void mptcp_subflow_ops_undo_override(struct sock *ssk); 32 33 static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, 34 enum linux_mptcp_mib_field field) 35 { 36 MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); 37 } 38 39 static void subflow_req_destructor(struct request_sock *req) 40 { 41 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 42 43 pr_debug("subflow_req=%p", subflow_req); 44 45 if (subflow_req->msk) 46 sock_put((struct sock *)subflow_req->msk); 47 48 mptcp_token_destroy_request(req); 49 } 50 51 static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, 52 void *hmac) 53 { 54 u8 msg[8]; 55 56 put_unaligned_be32(nonce1, &msg[0]); 57 put_unaligned_be32(nonce2, &msg[4]); 58 59 mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); 60 } 61 62 static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) 63 { 64 return mptcp_is_fully_established((void *)msk) && 65 ((mptcp_pm_is_userspace(msk) && 66 mptcp_userspace_pm_active(msk)) || 67 READ_ONCE(msk->pm.accept_subflow)); 68 } 69 70 /* validate received token and create truncated hmac and nonce for SYN-ACK */ 71 static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) 72 { 73 struct mptcp_sock *msk = subflow_req->msk; 74 u8 hmac[SHA256_DIGEST_SIZE]; 75 76 get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); 77 78 subflow_generate_hmac(READ_ONCE(msk->local_key), 79 READ_ONCE(msk->remote_key), 80 subflow_req->local_nonce, 81 subflow_req->remote_nonce, hmac); 82 83 subflow_req->thmac = get_unaligned_be64(hmac); 84 } 85 86 static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) 87 { 88 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 89 struct mptcp_sock *msk; 90 int local_id; 91 92 msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); 93 if (!msk) { 94 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); 95 return NULL; 96 } 97 98 local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); 99 if (local_id < 0) { 100 sock_put((struct sock *)msk); 101 return NULL; 102 } 103 subflow_req->local_id = local_id; 104 105 return msk; 106 } 107 108 static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) 109 { 110 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 111 112 subflow_req->mp_capable = 0; 113 subflow_req->mp_join = 0; 114 subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); 115 subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); 116 subflow_req->msk = NULL; 117 mptcp_token_init_request(req); 118 } 119 120 static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) 121 { 122 return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; 123 } 124 125 static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) 126 { 127 struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); 128 129 if (mpext) { 130 memset(mpext, 0, sizeof(*mpext)); 131 mpext->reset_reason = reason; 132 } 133 } 134 135 /* Init mptcp request socket. 136 * 137 * Returns an error code if a JOIN has failed and a TCP reset 138 * should be sent. 139 */ 140 static int subflow_check_req(struct request_sock *req, 141 const struct sock *sk_listener, 142 struct sk_buff *skb) 143 { 144 struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); 145 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 146 struct mptcp_options_received mp_opt; 147 bool opt_mp_capable, opt_mp_join; 148 149 pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); 150 151 #ifdef CONFIG_TCP_MD5SIG 152 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 153 * TCP option space. 154 */ 155 if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) 156 return -EINVAL; 157 #endif 158 159 mptcp_get_options(skb, &mp_opt); 160 161 opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN); 162 opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); 163 if (opt_mp_capable) { 164 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); 165 166 if (opt_mp_join) 167 return 0; 168 } else if (opt_mp_join) { 169 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); 170 } 171 172 if (opt_mp_capable && listener->request_mptcp) { 173 int err, retries = MPTCP_TOKEN_MAX_RETRIES; 174 175 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; 176 again: 177 do { 178 get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); 179 } while (subflow_req->local_key == 0); 180 181 if (unlikely(req->syncookie)) { 182 mptcp_crypto_key_sha(subflow_req->local_key, 183 &subflow_req->token, 184 &subflow_req->idsn); 185 if (mptcp_token_exists(subflow_req->token)) { 186 if (retries-- > 0) 187 goto again; 188 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); 189 } else { 190 subflow_req->mp_capable = 1; 191 } 192 return 0; 193 } 194 195 err = mptcp_token_new_request(req); 196 if (err == 0) 197 subflow_req->mp_capable = 1; 198 else if (retries-- > 0) 199 goto again; 200 else 201 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); 202 203 } else if (opt_mp_join && listener->request_mptcp) { 204 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; 205 subflow_req->mp_join = 1; 206 subflow_req->backup = mp_opt.backup; 207 subflow_req->remote_id = mp_opt.join_id; 208 subflow_req->token = mp_opt.token; 209 subflow_req->remote_nonce = mp_opt.nonce; 210 subflow_req->msk = subflow_token_join_request(req); 211 212 /* Can't fall back to TCP in this case. */ 213 if (!subflow_req->msk) { 214 subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); 215 return -EPERM; 216 } 217 218 if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { 219 pr_debug("syn inet_sport=%d %d", 220 ntohs(inet_sk(sk_listener)->inet_sport), 221 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); 222 if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { 223 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); 224 return -EPERM; 225 } 226 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); 227 } 228 229 subflow_req_create_thmac(subflow_req); 230 231 if (unlikely(req->syncookie)) { 232 if (mptcp_can_accept_new_subflow(subflow_req->msk)) 233 subflow_init_req_cookie_join_save(subflow_req, skb); 234 else 235 return -EPERM; 236 } 237 238 pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, 239 subflow_req->remote_nonce, subflow_req->msk); 240 } 241 242 return 0; 243 } 244 245 int mptcp_subflow_init_cookie_req(struct request_sock *req, 246 const struct sock *sk_listener, 247 struct sk_buff *skb) 248 { 249 struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); 250 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 251 struct mptcp_options_received mp_opt; 252 bool opt_mp_capable, opt_mp_join; 253 int err; 254 255 subflow_init_req(req, sk_listener); 256 mptcp_get_options(skb, &mp_opt); 257 258 opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK); 259 opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); 260 if (opt_mp_capable && opt_mp_join) 261 return -EINVAL; 262 263 if (opt_mp_capable && listener->request_mptcp) { 264 if (mp_opt.sndr_key == 0) 265 return -EINVAL; 266 267 subflow_req->local_key = mp_opt.rcvr_key; 268 err = mptcp_token_new_request(req); 269 if (err) 270 return err; 271 272 subflow_req->mp_capable = 1; 273 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; 274 } else if (opt_mp_join && listener->request_mptcp) { 275 if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) 276 return -EINVAL; 277 278 subflow_req->mp_join = 1; 279 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; 280 } 281 282 return 0; 283 } 284 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); 285 286 static struct dst_entry *subflow_v4_route_req(const struct sock *sk, 287 struct sk_buff *skb, 288 struct flowi *fl, 289 struct request_sock *req) 290 { 291 struct dst_entry *dst; 292 int err; 293 294 tcp_rsk(req)->is_mptcp = 1; 295 subflow_init_req(req, sk); 296 297 dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); 298 if (!dst) 299 return NULL; 300 301 err = subflow_check_req(req, sk, skb); 302 if (err == 0) 303 return dst; 304 305 dst_release(dst); 306 if (!req->syncookie) 307 tcp_request_sock_ops.send_reset(sk, skb); 308 return NULL; 309 } 310 311 static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, 312 struct tcp_fastopen_cookie *foc, 313 enum tcp_synack_type synack_type) 314 { 315 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 316 struct inet_request_sock *ireq = inet_rsk(req); 317 318 /* clear tstamp_ok, as needed depending on cookie */ 319 if (foc && foc->len > -1) 320 ireq->tstamp_ok = 0; 321 322 if (synack_type == TCP_SYNACK_FASTOPEN) 323 mptcp_fastopen_subflow_synack_set_params(subflow, req); 324 } 325 326 static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 327 struct flowi *fl, 328 struct request_sock *req, 329 struct tcp_fastopen_cookie *foc, 330 enum tcp_synack_type synack_type, 331 struct sk_buff *syn_skb) 332 { 333 subflow_prep_synack(sk, req, foc, synack_type); 334 335 return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, 336 synack_type, syn_skb); 337 } 338 339 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 340 static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, 341 struct flowi *fl, 342 struct request_sock *req, 343 struct tcp_fastopen_cookie *foc, 344 enum tcp_synack_type synack_type, 345 struct sk_buff *syn_skb) 346 { 347 subflow_prep_synack(sk, req, foc, synack_type); 348 349 return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, 350 synack_type, syn_skb); 351 } 352 353 static struct dst_entry *subflow_v6_route_req(const struct sock *sk, 354 struct sk_buff *skb, 355 struct flowi *fl, 356 struct request_sock *req) 357 { 358 struct dst_entry *dst; 359 int err; 360 361 tcp_rsk(req)->is_mptcp = 1; 362 subflow_init_req(req, sk); 363 364 dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); 365 if (!dst) 366 return NULL; 367 368 err = subflow_check_req(req, sk, skb); 369 if (err == 0) 370 return dst; 371 372 dst_release(dst); 373 if (!req->syncookie) 374 tcp6_request_sock_ops.send_reset(sk, skb); 375 return NULL; 376 } 377 #endif 378 379 /* validate received truncated hmac and create hmac for third ACK */ 380 static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) 381 { 382 u8 hmac[SHA256_DIGEST_SIZE]; 383 u64 thmac; 384 385 subflow_generate_hmac(subflow->remote_key, subflow->local_key, 386 subflow->remote_nonce, subflow->local_nonce, 387 hmac); 388 389 thmac = get_unaligned_be64(hmac); 390 pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", 391 subflow, subflow->token, thmac, subflow->thmac); 392 393 return thmac == subflow->thmac; 394 } 395 396 void mptcp_subflow_reset(struct sock *ssk) 397 { 398 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 399 struct sock *sk = subflow->conn; 400 401 /* mptcp_mp_fail_no_response() can reach here on an already closed 402 * socket 403 */ 404 if (ssk->sk_state == TCP_CLOSE) 405 return; 406 407 /* must hold: tcp_done() could drop last reference on parent */ 408 sock_hold(sk); 409 410 tcp_send_active_reset(ssk, GFP_ATOMIC); 411 tcp_done(ssk); 412 if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) 413 mptcp_schedule_work(sk); 414 415 sock_put(sk); 416 } 417 418 static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) 419 { 420 return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; 421 } 422 423 void __mptcp_sync_state(struct sock *sk, int state) 424 { 425 struct mptcp_subflow_context *subflow; 426 struct mptcp_sock *msk = mptcp_sk(sk); 427 struct sock *ssk = msk->first; 428 429 subflow = mptcp_subflow_ctx(ssk); 430 __mptcp_propagate_sndbuf(sk, ssk); 431 if (!msk->rcvspace_init) 432 mptcp_rcv_space_init(msk, ssk); 433 434 if (sk->sk_state == TCP_SYN_SENT) { 435 /* subflow->idsn is always available is TCP_SYN_SENT state, 436 * even for the FASTOPEN scenarios 437 */ 438 WRITE_ONCE(msk->write_seq, subflow->idsn + 1); 439 WRITE_ONCE(msk->snd_nxt, msk->write_seq); 440 mptcp_set_state(sk, state); 441 sk->sk_state_change(sk); 442 } 443 } 444 445 static void subflow_set_remote_key(struct mptcp_sock *msk, 446 struct mptcp_subflow_context *subflow, 447 const struct mptcp_options_received *mp_opt) 448 { 449 /* active MPC subflow will reach here multiple times: 450 * at subflow_finish_connect() time and at 4th ack time 451 */ 452 if (subflow->remote_key_valid) 453 return; 454 455 subflow->remote_key_valid = 1; 456 subflow->remote_key = mp_opt->sndr_key; 457 mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); 458 subflow->iasn++; 459 460 WRITE_ONCE(msk->remote_key, subflow->remote_key); 461 WRITE_ONCE(msk->ack_seq, subflow->iasn); 462 WRITE_ONCE(msk->can_ack, true); 463 atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); 464 } 465 466 static void mptcp_propagate_state(struct sock *sk, struct sock *ssk, 467 struct mptcp_subflow_context *subflow, 468 const struct mptcp_options_received *mp_opt) 469 { 470 struct mptcp_sock *msk = mptcp_sk(sk); 471 472 mptcp_data_lock(sk); 473 if (mp_opt) { 474 /* Options are available only in the non fallback cases 475 * avoid updating rx path fields otherwise 476 */ 477 WRITE_ONCE(msk->snd_una, subflow->idsn + 1); 478 WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); 479 subflow_set_remote_key(msk, subflow, mp_opt); 480 } 481 482 if (!sock_owned_by_user(sk)) { 483 __mptcp_sync_state(sk, ssk->sk_state); 484 } else { 485 msk->pending_state = ssk->sk_state; 486 __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); 487 } 488 mptcp_data_unlock(sk); 489 } 490 491 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) 492 { 493 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 494 struct mptcp_options_received mp_opt; 495 struct sock *parent = subflow->conn; 496 struct mptcp_sock *msk; 497 498 subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); 499 500 /* be sure no special action on any packet other than syn-ack */ 501 if (subflow->conn_finished) 502 return; 503 504 msk = mptcp_sk(parent); 505 subflow->rel_write_seq = 1; 506 subflow->conn_finished = 1; 507 subflow->ssn_offset = TCP_SKB_CB(skb)->seq; 508 pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); 509 510 mptcp_get_options(skb, &mp_opt); 511 if (subflow->request_mptcp) { 512 if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { 513 MPTCP_INC_STATS(sock_net(sk), 514 MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); 515 mptcp_do_fallback(sk); 516 pr_fallback(msk); 517 goto fallback; 518 } 519 520 if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) 521 WRITE_ONCE(msk->csum_enabled, true); 522 if (mp_opt.deny_join_id0) 523 WRITE_ONCE(msk->pm.remote_deny_join_id0, true); 524 subflow->mp_capable = 1; 525 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); 526 mptcp_finish_connect(sk); 527 mptcp_propagate_state(parent, sk, subflow, &mp_opt); 528 } else if (subflow->request_join) { 529 u8 hmac[SHA256_DIGEST_SIZE]; 530 531 if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) { 532 subflow->reset_reason = MPTCP_RST_EMPTCP; 533 goto do_reset; 534 } 535 536 subflow->backup = mp_opt.backup; 537 subflow->thmac = mp_opt.thmac; 538 subflow->remote_nonce = mp_opt.nonce; 539 subflow->remote_id = mp_opt.join_id; 540 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", 541 subflow, subflow->thmac, subflow->remote_nonce, 542 subflow->backup); 543 544 if (!subflow_thmac_valid(subflow)) { 545 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); 546 subflow->reset_reason = MPTCP_RST_EMPTCP; 547 goto do_reset; 548 } 549 550 if (!mptcp_finish_join(sk)) 551 goto do_reset; 552 553 subflow_generate_hmac(subflow->local_key, subflow->remote_key, 554 subflow->local_nonce, 555 subflow->remote_nonce, 556 hmac); 557 memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); 558 559 subflow->mp_join = 1; 560 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); 561 562 if (subflow_use_different_dport(msk, sk)) { 563 pr_debug("synack inet_dport=%d %d", 564 ntohs(inet_sk(sk)->inet_dport), 565 ntohs(inet_sk(parent)->inet_dport)); 566 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); 567 } 568 } else if (mptcp_check_fallback(sk)) { 569 fallback: 570 mptcp_propagate_state(parent, sk, subflow, NULL); 571 } 572 return; 573 574 do_reset: 575 subflow->reset_transient = 0; 576 mptcp_subflow_reset(sk); 577 } 578 579 static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) 580 { 581 subflow->local_id = local_id; 582 subflow->local_id_valid = 1; 583 } 584 585 static int subflow_chk_local_id(struct sock *sk) 586 { 587 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 588 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 589 int err; 590 591 if (likely(subflow->local_id_valid)) 592 return 0; 593 594 err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); 595 if (err < 0) 596 return err; 597 598 subflow_set_local_id(subflow, err); 599 return 0; 600 } 601 602 static int subflow_rebuild_header(struct sock *sk) 603 { 604 int err = subflow_chk_local_id(sk); 605 606 if (unlikely(err < 0)) 607 return err; 608 609 return inet_sk_rebuild_header(sk); 610 } 611 612 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 613 static int subflow_v6_rebuild_header(struct sock *sk) 614 { 615 int err = subflow_chk_local_id(sk); 616 617 if (unlikely(err < 0)) 618 return err; 619 620 return inet6_sk_rebuild_header(sk); 621 } 622 #endif 623 624 static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; 625 static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; 626 627 static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) 628 { 629 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 630 631 pr_debug("subflow=%p", subflow); 632 633 /* Never answer to SYNs sent to broadcast or multicast */ 634 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 635 goto drop; 636 637 return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, 638 &subflow_request_sock_ipv4_ops, 639 sk, skb); 640 drop: 641 tcp_listendrop(sk); 642 return 0; 643 } 644 645 static void subflow_v4_req_destructor(struct request_sock *req) 646 { 647 subflow_req_destructor(req); 648 tcp_request_sock_ops.destructor(req); 649 } 650 651 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 652 static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; 653 static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; 654 static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; 655 static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; 656 static struct proto tcpv6_prot_override __ro_after_init; 657 658 static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) 659 { 660 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 661 662 pr_debug("subflow=%p", subflow); 663 664 if (skb->protocol == htons(ETH_P_IP)) 665 return subflow_v4_conn_request(sk, skb); 666 667 if (!ipv6_unicast_destination(skb)) 668 goto drop; 669 670 if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { 671 __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); 672 return 0; 673 } 674 675 return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, 676 &subflow_request_sock_ipv6_ops, sk, skb); 677 678 drop: 679 tcp_listendrop(sk); 680 return 0; /* don't send reset */ 681 } 682 683 static void subflow_v6_req_destructor(struct request_sock *req) 684 { 685 subflow_req_destructor(req); 686 tcp6_request_sock_ops.destructor(req); 687 } 688 #endif 689 690 struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, 691 struct sock *sk_listener, 692 bool attach_listener) 693 { 694 if (ops->family == AF_INET) 695 ops = &mptcp_subflow_v4_request_sock_ops; 696 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 697 else if (ops->family == AF_INET6) 698 ops = &mptcp_subflow_v6_request_sock_ops; 699 #endif 700 701 return inet_reqsk_alloc(ops, sk_listener, attach_listener); 702 } 703 EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); 704 705 /* validate hmac received in third ACK */ 706 static bool subflow_hmac_valid(const struct request_sock *req, 707 const struct mptcp_options_received *mp_opt) 708 { 709 const struct mptcp_subflow_request_sock *subflow_req; 710 u8 hmac[SHA256_DIGEST_SIZE]; 711 struct mptcp_sock *msk; 712 713 subflow_req = mptcp_subflow_rsk(req); 714 msk = subflow_req->msk; 715 if (!msk) 716 return false; 717 718 subflow_generate_hmac(READ_ONCE(msk->remote_key), 719 READ_ONCE(msk->local_key), 720 subflow_req->remote_nonce, 721 subflow_req->local_nonce, hmac); 722 723 return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); 724 } 725 726 static void subflow_ulp_fallback(struct sock *sk, 727 struct mptcp_subflow_context *old_ctx) 728 { 729 struct inet_connection_sock *icsk = inet_csk(sk); 730 731 mptcp_subflow_tcp_fallback(sk, old_ctx); 732 icsk->icsk_ulp_ops = NULL; 733 rcu_assign_pointer(icsk->icsk_ulp_data, NULL); 734 tcp_sk(sk)->is_mptcp = 0; 735 736 mptcp_subflow_ops_undo_override(sk); 737 } 738 739 void mptcp_subflow_drop_ctx(struct sock *ssk) 740 { 741 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); 742 743 if (!ctx) 744 return; 745 746 list_del(&mptcp_subflow_ctx(ssk)->node); 747 if (inet_csk(ssk)->icsk_ulp_ops) { 748 subflow_ulp_fallback(ssk, ctx); 749 if (ctx->conn) 750 sock_put(ctx->conn); 751 } 752 753 kfree_rcu(ctx, rcu); 754 } 755 756 void __mptcp_subflow_fully_established(struct mptcp_sock *msk, 757 struct mptcp_subflow_context *subflow, 758 const struct mptcp_options_received *mp_opt) 759 { 760 subflow_set_remote_key(msk, subflow, mp_opt); 761 subflow->fully_established = 1; 762 WRITE_ONCE(msk->fully_established, true); 763 764 if (subflow->is_mptfo) 765 __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); 766 } 767 768 static struct sock *subflow_syn_recv_sock(const struct sock *sk, 769 struct sk_buff *skb, 770 struct request_sock *req, 771 struct dst_entry *dst, 772 struct request_sock *req_unhash, 773 bool *own_req) 774 { 775 struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); 776 struct mptcp_subflow_request_sock *subflow_req; 777 struct mptcp_options_received mp_opt; 778 bool fallback, fallback_is_fatal; 779 struct mptcp_sock *owner; 780 struct sock *child; 781 782 pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); 783 784 /* After child creation we must look for MPC even when options 785 * are not parsed 786 */ 787 mp_opt.suboptions = 0; 788 789 /* hopefully temporary handling for MP_JOIN+syncookie */ 790 subflow_req = mptcp_subflow_rsk(req); 791 fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; 792 fallback = !tcp_rsk(req)->is_mptcp; 793 if (fallback) 794 goto create_child; 795 796 /* if the sk is MP_CAPABLE, we try to fetch the client key */ 797 if (subflow_req->mp_capable) { 798 /* we can receive and accept an in-window, out-of-order pkt, 799 * which may not carry the MP_CAPABLE opt even on mptcp enabled 800 * paths: always try to extract the peer key, and fallback 801 * for packets missing it. 802 * Even OoO DSS packets coming legitly after dropped or 803 * reordered MPC will cause fallback, but we don't have other 804 * options. 805 */ 806 mptcp_get_options(skb, &mp_opt); 807 if (!(mp_opt.suboptions & 808 (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) 809 fallback = true; 810 811 } else if (subflow_req->mp_join) { 812 mptcp_get_options(skb, &mp_opt); 813 if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || 814 !subflow_hmac_valid(req, &mp_opt) || 815 !mptcp_can_accept_new_subflow(subflow_req->msk)) { 816 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); 817 fallback = true; 818 } 819 } 820 821 create_child: 822 child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, 823 req_unhash, own_req); 824 825 if (child && *own_req) { 826 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); 827 828 tcp_rsk(req)->drop_req = false; 829 830 /* we need to fallback on ctx allocation failure and on pre-reqs 831 * checking above. In the latter scenario we additionally need 832 * to reset the context to non MPTCP status. 833 */ 834 if (!ctx || fallback) { 835 if (fallback_is_fatal) { 836 subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); 837 goto dispose_child; 838 } 839 goto fallback; 840 } 841 842 /* ssk inherits options of listener sk */ 843 ctx->setsockopt_seq = listener->setsockopt_seq; 844 845 if (ctx->mp_capable) { 846 ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); 847 if (!ctx->conn) 848 goto fallback; 849 850 ctx->subflow_id = 1; 851 owner = mptcp_sk(ctx->conn); 852 mptcp_pm_new_connection(owner, child, 1); 853 854 /* with OoO packets we can reach here without ingress 855 * mpc option 856 */ 857 if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { 858 mptcp_pm_fully_established(owner, child); 859 ctx->pm_notified = 1; 860 } 861 } else if (ctx->mp_join) { 862 owner = subflow_req->msk; 863 if (!owner) { 864 subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); 865 goto dispose_child; 866 } 867 868 /* move the msk reference ownership to the subflow */ 869 subflow_req->msk = NULL; 870 ctx->conn = (struct sock *)owner; 871 872 if (subflow_use_different_sport(owner, sk)) { 873 pr_debug("ack inet_sport=%d %d", 874 ntohs(inet_sk(sk)->inet_sport), 875 ntohs(inet_sk((struct sock *)owner)->inet_sport)); 876 if (!mptcp_pm_sport_in_anno_list(owner, sk)) { 877 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); 878 goto dispose_child; 879 } 880 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); 881 } 882 883 if (!mptcp_finish_join(child)) 884 goto dispose_child; 885 886 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); 887 tcp_rsk(req)->drop_req = true; 888 } 889 } 890 891 /* check for expected invariant - should never trigger, just help 892 * catching eariler subtle bugs 893 */ 894 WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && 895 (!mptcp_subflow_ctx(child) || 896 !mptcp_subflow_ctx(child)->conn)); 897 return child; 898 899 dispose_child: 900 mptcp_subflow_drop_ctx(child); 901 tcp_rsk(req)->drop_req = true; 902 inet_csk_prepare_for_destroy_sock(child); 903 tcp_done(child); 904 req->rsk_ops->send_reset(sk, skb); 905 906 /* The last child reference will be released by the caller */ 907 return child; 908 909 fallback: 910 mptcp_subflow_drop_ctx(child); 911 return child; 912 } 913 914 static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; 915 static struct proto tcp_prot_override __ro_after_init; 916 917 enum mapping_status { 918 MAPPING_OK, 919 MAPPING_INVALID, 920 MAPPING_EMPTY, 921 MAPPING_DATA_FIN, 922 MAPPING_DUMMY, 923 MAPPING_BAD_CSUM 924 }; 925 926 static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) 927 { 928 pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", 929 ssn, subflow->map_subflow_seq, subflow->map_data_len); 930 } 931 932 static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) 933 { 934 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 935 unsigned int skb_consumed; 936 937 skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; 938 if (WARN_ON_ONCE(skb_consumed >= skb->len)) 939 return true; 940 941 return skb->len - skb_consumed <= subflow->map_data_len - 942 mptcp_subflow_get_map_offset(subflow); 943 } 944 945 static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) 946 { 947 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 948 u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; 949 950 if (unlikely(before(ssn, subflow->map_subflow_seq))) { 951 /* Mapping covers data later in the subflow stream, 952 * currently unsupported. 953 */ 954 dbg_bad_map(subflow, ssn); 955 return false; 956 } 957 if (unlikely(!before(ssn, subflow->map_subflow_seq + 958 subflow->map_data_len))) { 959 /* Mapping does covers past subflow data, invalid */ 960 dbg_bad_map(subflow, ssn); 961 return false; 962 } 963 return true; 964 } 965 966 static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, 967 bool csum_reqd) 968 { 969 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 970 u32 offset, seq, delta; 971 __sum16 csum; 972 int len; 973 974 if (!csum_reqd) 975 return MAPPING_OK; 976 977 /* mapping already validated on previous traversal */ 978 if (subflow->map_csum_len == subflow->map_data_len) 979 return MAPPING_OK; 980 981 /* traverse the receive queue, ensuring it contains a full 982 * DSS mapping and accumulating the related csum. 983 * Preserve the accoumlate csum across multiple calls, to compute 984 * the csum only once 985 */ 986 delta = subflow->map_data_len - subflow->map_csum_len; 987 for (;;) { 988 seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; 989 offset = seq - TCP_SKB_CB(skb)->seq; 990 991 /* if the current skb has not been accounted yet, csum its contents 992 * up to the amount covered by the current DSS 993 */ 994 if (offset < skb->len) { 995 __wsum csum; 996 997 len = min(skb->len - offset, delta); 998 csum = skb_checksum(skb, offset, len, 0); 999 subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, 1000 subflow->map_csum_len); 1001 1002 delta -= len; 1003 subflow->map_csum_len += len; 1004 } 1005 if (delta == 0) 1006 break; 1007 1008 if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { 1009 /* if this subflow is closed, the partial mapping 1010 * will be never completed; flush the pending skbs, so 1011 * that subflow_sched_work_if_closed() can kick in 1012 */ 1013 if (unlikely(ssk->sk_state == TCP_CLOSE)) 1014 while ((skb = skb_peek(&ssk->sk_receive_queue))) 1015 sk_eat_skb(ssk, skb); 1016 1017 /* not enough data to validate the csum */ 1018 return MAPPING_EMPTY; 1019 } 1020 1021 /* the DSS mapping for next skbs will be validated later, 1022 * when a get_mapping_status call will process such skb 1023 */ 1024 skb = skb->next; 1025 } 1026 1027 /* note that 'map_data_len' accounts only for the carried data, does 1028 * not include the eventual seq increment due to the data fin, 1029 * while the pseudo header requires the original DSS data len, 1030 * including that 1031 */ 1032 csum = __mptcp_make_csum(subflow->map_seq, 1033 subflow->map_subflow_seq, 1034 subflow->map_data_len + subflow->map_data_fin, 1035 subflow->map_data_csum); 1036 if (unlikely(csum)) { 1037 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); 1038 return MAPPING_BAD_CSUM; 1039 } 1040 1041 subflow->valid_csum_seen = 1; 1042 return MAPPING_OK; 1043 } 1044 1045 static enum mapping_status get_mapping_status(struct sock *ssk, 1046 struct mptcp_sock *msk) 1047 { 1048 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1049 bool csum_reqd = READ_ONCE(msk->csum_enabled); 1050 struct mptcp_ext *mpext; 1051 struct sk_buff *skb; 1052 u16 data_len; 1053 u64 map_seq; 1054 1055 skb = skb_peek(&ssk->sk_receive_queue); 1056 if (!skb) 1057 return MAPPING_EMPTY; 1058 1059 if (mptcp_check_fallback(ssk)) 1060 return MAPPING_DUMMY; 1061 1062 mpext = mptcp_get_ext(skb); 1063 if (!mpext || !mpext->use_map) { 1064 if (!subflow->map_valid && !skb->len) { 1065 /* the TCP stack deliver 0 len FIN pkt to the receive 1066 * queue, that is the only 0len pkts ever expected here, 1067 * and we can admit no mapping only for 0 len pkts 1068 */ 1069 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) 1070 WARN_ONCE(1, "0len seq %d:%d flags %x", 1071 TCP_SKB_CB(skb)->seq, 1072 TCP_SKB_CB(skb)->end_seq, 1073 TCP_SKB_CB(skb)->tcp_flags); 1074 sk_eat_skb(ssk, skb); 1075 return MAPPING_EMPTY; 1076 } 1077 1078 if (!subflow->map_valid) 1079 return MAPPING_INVALID; 1080 1081 goto validate_seq; 1082 } 1083 1084 trace_get_mapping_status(mpext); 1085 1086 data_len = mpext->data_len; 1087 if (data_len == 0) { 1088 pr_debug("infinite mapping received"); 1089 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); 1090 subflow->map_data_len = 0; 1091 return MAPPING_INVALID; 1092 } 1093 1094 if (mpext->data_fin == 1) { 1095 if (data_len == 1) { 1096 bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, 1097 mpext->dsn64); 1098 pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); 1099 if (subflow->map_valid) { 1100 /* A DATA_FIN might arrive in a DSS 1101 * option before the previous mapping 1102 * has been fully consumed. Continue 1103 * handling the existing mapping. 1104 */ 1105 skb_ext_del(skb, SKB_EXT_MPTCP); 1106 return MAPPING_OK; 1107 } else { 1108 if (updated) 1109 mptcp_schedule_work((struct sock *)msk); 1110 1111 return MAPPING_DATA_FIN; 1112 } 1113 } else { 1114 u64 data_fin_seq = mpext->data_seq + data_len - 1; 1115 1116 /* If mpext->data_seq is a 32-bit value, data_fin_seq 1117 * must also be limited to 32 bits. 1118 */ 1119 if (!mpext->dsn64) 1120 data_fin_seq &= GENMASK_ULL(31, 0); 1121 1122 mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); 1123 pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", 1124 data_fin_seq, mpext->dsn64); 1125 } 1126 1127 /* Adjust for DATA_FIN using 1 byte of sequence space */ 1128 data_len--; 1129 } 1130 1131 map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); 1132 WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); 1133 1134 if (subflow->map_valid) { 1135 /* Allow replacing only with an identical map */ 1136 if (subflow->map_seq == map_seq && 1137 subflow->map_subflow_seq == mpext->subflow_seq && 1138 subflow->map_data_len == data_len && 1139 subflow->map_csum_reqd == mpext->csum_reqd) { 1140 skb_ext_del(skb, SKB_EXT_MPTCP); 1141 goto validate_csum; 1142 } 1143 1144 /* If this skb data are fully covered by the current mapping, 1145 * the new map would need caching, which is not supported 1146 */ 1147 if (skb_is_fully_mapped(ssk, skb)) { 1148 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); 1149 return MAPPING_INVALID; 1150 } 1151 1152 /* will validate the next map after consuming the current one */ 1153 goto validate_csum; 1154 } 1155 1156 subflow->map_seq = map_seq; 1157 subflow->map_subflow_seq = mpext->subflow_seq; 1158 subflow->map_data_len = data_len; 1159 subflow->map_valid = 1; 1160 subflow->map_data_fin = mpext->data_fin; 1161 subflow->mpc_map = mpext->mpc_map; 1162 subflow->map_csum_reqd = mpext->csum_reqd; 1163 subflow->map_csum_len = 0; 1164 subflow->map_data_csum = csum_unfold(mpext->csum); 1165 1166 /* Cfr RFC 8684 Section 3.3.0 */ 1167 if (unlikely(subflow->map_csum_reqd != csum_reqd)) 1168 return MAPPING_INVALID; 1169 1170 pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", 1171 subflow->map_seq, subflow->map_subflow_seq, 1172 subflow->map_data_len, subflow->map_csum_reqd, 1173 subflow->map_data_csum); 1174 1175 validate_seq: 1176 /* we revalidate valid mapping on new skb, because we must ensure 1177 * the current skb is completely covered by the available mapping 1178 */ 1179 if (!validate_mapping(ssk, skb)) { 1180 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); 1181 return MAPPING_INVALID; 1182 } 1183 1184 skb_ext_del(skb, SKB_EXT_MPTCP); 1185 1186 validate_csum: 1187 return validate_data_csum(ssk, skb, csum_reqd); 1188 } 1189 1190 static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, 1191 u64 limit) 1192 { 1193 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1194 bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 1195 u32 incr; 1196 1197 incr = limit >= skb->len ? skb->len + fin : limit; 1198 1199 pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, 1200 subflow->map_subflow_seq); 1201 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); 1202 tcp_sk(ssk)->copied_seq += incr; 1203 if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) 1204 sk_eat_skb(ssk, skb); 1205 if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) 1206 subflow->map_valid = 0; 1207 } 1208 1209 /* sched mptcp worker to remove the subflow if no more data is pending */ 1210 static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) 1211 { 1212 if (likely(ssk->sk_state != TCP_CLOSE)) 1213 return; 1214 1215 if (skb_queue_empty(&ssk->sk_receive_queue) && 1216 !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 1217 mptcp_schedule_work((struct sock *)msk); 1218 } 1219 1220 static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) 1221 { 1222 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 1223 1224 if (subflow->mp_join) 1225 return false; 1226 else if (READ_ONCE(msk->csum_enabled)) 1227 return !subflow->valid_csum_seen; 1228 else 1229 return !subflow->fully_established; 1230 } 1231 1232 static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) 1233 { 1234 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1235 unsigned long fail_tout; 1236 1237 /* greceful failure can happen only on the MPC subflow */ 1238 if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) 1239 return; 1240 1241 /* since the close timeout take precedence on the fail one, 1242 * no need to start the latter when the first is already set 1243 */ 1244 if (sock_flag((struct sock *)msk, SOCK_DEAD)) 1245 return; 1246 1247 /* we don't need extreme accuracy here, use a zero fail_tout as special 1248 * value meaning no fail timeout at all; 1249 */ 1250 fail_tout = jiffies + TCP_RTO_MAX; 1251 if (!fail_tout) 1252 fail_tout = 1; 1253 WRITE_ONCE(subflow->fail_tout, fail_tout); 1254 tcp_send_ack(ssk); 1255 1256 mptcp_reset_tout_timer(msk, subflow->fail_tout); 1257 } 1258 1259 static bool subflow_check_data_avail(struct sock *ssk) 1260 { 1261 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1262 enum mapping_status status; 1263 struct mptcp_sock *msk; 1264 struct sk_buff *skb; 1265 1266 if (!skb_peek(&ssk->sk_receive_queue)) 1267 WRITE_ONCE(subflow->data_avail, false); 1268 if (subflow->data_avail) 1269 return true; 1270 1271 msk = mptcp_sk(subflow->conn); 1272 for (;;) { 1273 u64 ack_seq; 1274 u64 old_ack; 1275 1276 status = get_mapping_status(ssk, msk); 1277 trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); 1278 if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || 1279 status == MAPPING_BAD_CSUM)) 1280 goto fallback; 1281 1282 if (status != MAPPING_OK) 1283 goto no_data; 1284 1285 skb = skb_peek(&ssk->sk_receive_queue); 1286 if (WARN_ON_ONCE(!skb)) 1287 goto no_data; 1288 1289 if (unlikely(!READ_ONCE(msk->can_ack))) 1290 goto fallback; 1291 1292 old_ack = READ_ONCE(msk->ack_seq); 1293 ack_seq = mptcp_subflow_get_mapped_dsn(subflow); 1294 pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, 1295 ack_seq); 1296 if (unlikely(before64(ack_seq, old_ack))) { 1297 mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); 1298 continue; 1299 } 1300 1301 WRITE_ONCE(subflow->data_avail, true); 1302 break; 1303 } 1304 return true; 1305 1306 no_data: 1307 subflow_sched_work_if_closed(msk, ssk); 1308 return false; 1309 1310 fallback: 1311 if (!__mptcp_check_fallback(msk)) { 1312 /* RFC 8684 section 3.7. */ 1313 if (status == MAPPING_BAD_CSUM && 1314 (subflow->mp_join || subflow->valid_csum_seen)) { 1315 subflow->send_mp_fail = 1; 1316 1317 if (!READ_ONCE(msk->allow_infinite_fallback)) { 1318 subflow->reset_transient = 0; 1319 subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; 1320 goto reset; 1321 } 1322 mptcp_subflow_fail(msk, ssk); 1323 WRITE_ONCE(subflow->data_avail, true); 1324 return true; 1325 } 1326 1327 if (!subflow_can_fallback(subflow) && subflow->map_data_len) { 1328 /* fatal protocol error, close the socket. 1329 * subflow_error_report() will introduce the appropriate barriers 1330 */ 1331 subflow->reset_transient = 0; 1332 subflow->reset_reason = MPTCP_RST_EMPTCP; 1333 1334 reset: 1335 WRITE_ONCE(ssk->sk_err, EBADMSG); 1336 tcp_set_state(ssk, TCP_CLOSE); 1337 while ((skb = skb_peek(&ssk->sk_receive_queue))) 1338 sk_eat_skb(ssk, skb); 1339 tcp_send_active_reset(ssk, GFP_ATOMIC); 1340 WRITE_ONCE(subflow->data_avail, false); 1341 return false; 1342 } 1343 1344 mptcp_do_fallback(ssk); 1345 } 1346 1347 skb = skb_peek(&ssk->sk_receive_queue); 1348 subflow->map_valid = 1; 1349 subflow->map_seq = READ_ONCE(msk->ack_seq); 1350 subflow->map_data_len = skb->len; 1351 subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; 1352 WRITE_ONCE(subflow->data_avail, true); 1353 return true; 1354 } 1355 1356 bool mptcp_subflow_data_available(struct sock *sk) 1357 { 1358 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1359 1360 /* check if current mapping is still valid */ 1361 if (subflow->map_valid && 1362 mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { 1363 subflow->map_valid = 0; 1364 WRITE_ONCE(subflow->data_avail, false); 1365 1366 pr_debug("Done with mapping: seq=%u data_len=%u", 1367 subflow->map_subflow_seq, 1368 subflow->map_data_len); 1369 } 1370 1371 return subflow_check_data_avail(sk); 1372 } 1373 1374 /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, 1375 * not the ssk one. 1376 * 1377 * In mptcp, rwin is about the mptcp-level connection data. 1378 * 1379 * Data that is still on the ssk rx queue can thus be ignored, 1380 * as far as mptcp peer is concerned that data is still inflight. 1381 * DSS ACK is updated when skb is moved to the mptcp rx queue. 1382 */ 1383 void mptcp_space(const struct sock *ssk, int *space, int *full_space) 1384 { 1385 const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1386 const struct sock *sk = subflow->conn; 1387 1388 *space = __mptcp_space(sk); 1389 *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); 1390 } 1391 1392 static void subflow_error_report(struct sock *ssk) 1393 { 1394 struct sock *sk = mptcp_subflow_ctx(ssk)->conn; 1395 1396 /* bail early if this is a no-op, so that we avoid introducing a 1397 * problematic lockdep dependency between TCP accept queue lock 1398 * and msk socket spinlock 1399 */ 1400 if (!sk->sk_socket) 1401 return; 1402 1403 mptcp_data_lock(sk); 1404 if (!sock_owned_by_user(sk)) 1405 __mptcp_error_report(sk); 1406 else 1407 __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); 1408 mptcp_data_unlock(sk); 1409 } 1410 1411 static void subflow_data_ready(struct sock *sk) 1412 { 1413 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1414 u16 state = 1 << inet_sk_state_load(sk); 1415 struct sock *parent = subflow->conn; 1416 struct mptcp_sock *msk; 1417 1418 trace_sk_data_ready(sk); 1419 1420 msk = mptcp_sk(parent); 1421 if (state & TCPF_LISTEN) { 1422 /* MPJ subflow are removed from accept queue before reaching here, 1423 * avoid stray wakeups 1424 */ 1425 if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) 1426 return; 1427 1428 parent->sk_data_ready(parent); 1429 return; 1430 } 1431 1432 WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && 1433 !subflow->mp_join && !(state & TCPF_CLOSE)); 1434 1435 if (mptcp_subflow_data_available(sk)) { 1436 mptcp_data_ready(parent, sk); 1437 1438 /* subflow-level lowat test are not relevant. 1439 * respect the msk-level threshold eventually mandating an immediate ack 1440 */ 1441 if (mptcp_data_avail(msk) < parent->sk_rcvlowat && 1442 (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss) 1443 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; 1444 } else if (unlikely(sk->sk_err)) { 1445 subflow_error_report(sk); 1446 } 1447 } 1448 1449 static void subflow_write_space(struct sock *ssk) 1450 { 1451 struct sock *sk = mptcp_subflow_ctx(ssk)->conn; 1452 1453 mptcp_propagate_sndbuf(sk, ssk); 1454 mptcp_write_space(sk); 1455 } 1456 1457 static const struct inet_connection_sock_af_ops * 1458 subflow_default_af_ops(struct sock *sk) 1459 { 1460 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1461 if (sk->sk_family == AF_INET6) 1462 return &subflow_v6_specific; 1463 #endif 1464 return &subflow_specific; 1465 } 1466 1467 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1468 void mptcpv6_handle_mapped(struct sock *sk, bool mapped) 1469 { 1470 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1471 struct inet_connection_sock *icsk = inet_csk(sk); 1472 const struct inet_connection_sock_af_ops *target; 1473 1474 target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); 1475 1476 pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", 1477 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); 1478 1479 if (likely(icsk->icsk_af_ops == target)) 1480 return; 1481 1482 subflow->icsk_af_ops = icsk->icsk_af_ops; 1483 icsk->icsk_af_ops = target; 1484 } 1485 #endif 1486 1487 void mptcp_info2sockaddr(const struct mptcp_addr_info *info, 1488 struct sockaddr_storage *addr, 1489 unsigned short family) 1490 { 1491 memset(addr, 0, sizeof(*addr)); 1492 addr->ss_family = family; 1493 if (addr->ss_family == AF_INET) { 1494 struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; 1495 1496 if (info->family == AF_INET) 1497 in_addr->sin_addr = info->addr; 1498 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1499 else if (ipv6_addr_v4mapped(&info->addr6)) 1500 in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; 1501 #endif 1502 in_addr->sin_port = info->port; 1503 } 1504 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1505 else if (addr->ss_family == AF_INET6) { 1506 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; 1507 1508 if (info->family == AF_INET) 1509 ipv6_addr_set_v4mapped(info->addr.s_addr, 1510 &in6_addr->sin6_addr); 1511 else 1512 in6_addr->sin6_addr = info->addr6; 1513 in6_addr->sin6_port = info->port; 1514 } 1515 #endif 1516 } 1517 1518 int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, 1519 const struct mptcp_addr_info *remote) 1520 { 1521 struct mptcp_sock *msk = mptcp_sk(sk); 1522 struct mptcp_subflow_context *subflow; 1523 struct sockaddr_storage addr; 1524 int remote_id = remote->id; 1525 int local_id = loc->id; 1526 int err = -ENOTCONN; 1527 struct socket *sf; 1528 struct sock *ssk; 1529 u32 remote_token; 1530 int addrlen; 1531 int ifindex; 1532 u8 flags; 1533 1534 if (!mptcp_is_fully_established(sk)) 1535 goto err_out; 1536 1537 err = mptcp_subflow_create_socket(sk, loc->family, &sf); 1538 if (err) 1539 goto err_out; 1540 1541 ssk = sf->sk; 1542 subflow = mptcp_subflow_ctx(ssk); 1543 do { 1544 get_random_bytes(&subflow->local_nonce, sizeof(u32)); 1545 } while (!subflow->local_nonce); 1546 1547 if (local_id) 1548 subflow_set_local_id(subflow, local_id); 1549 1550 mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, 1551 &flags, &ifindex); 1552 subflow->remote_key_valid = 1; 1553 subflow->remote_key = READ_ONCE(msk->remote_key); 1554 subflow->local_key = READ_ONCE(msk->local_key); 1555 subflow->token = msk->token; 1556 mptcp_info2sockaddr(loc, &addr, ssk->sk_family); 1557 1558 addrlen = sizeof(struct sockaddr_in); 1559 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1560 if (addr.ss_family == AF_INET6) 1561 addrlen = sizeof(struct sockaddr_in6); 1562 #endif 1563 ssk->sk_bound_dev_if = ifindex; 1564 err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); 1565 if (err) 1566 goto failed; 1567 1568 mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); 1569 pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, 1570 remote_token, local_id, remote_id); 1571 subflow->remote_token = remote_token; 1572 subflow->remote_id = remote_id; 1573 subflow->request_join = 1; 1574 subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); 1575 subflow->subflow_id = msk->subflow_id++; 1576 mptcp_info2sockaddr(remote, &addr, ssk->sk_family); 1577 1578 sock_hold(ssk); 1579 list_add_tail(&subflow->node, &msk->conn_list); 1580 err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); 1581 if (err && err != -EINPROGRESS) 1582 goto failed_unlink; 1583 1584 /* discard the subflow socket */ 1585 mptcp_sock_graft(ssk, sk->sk_socket); 1586 iput(SOCK_INODE(sf)); 1587 WRITE_ONCE(msk->allow_infinite_fallback, false); 1588 mptcp_stop_tout_timer(sk); 1589 return 0; 1590 1591 failed_unlink: 1592 list_del(&subflow->node); 1593 sock_put(mptcp_subflow_tcp_sock(subflow)); 1594 1595 failed: 1596 subflow->disposable = 1; 1597 sock_release(sf); 1598 1599 err_out: 1600 /* we account subflows before the creation, and this failures will not 1601 * be caught by sk_state_change() 1602 */ 1603 mptcp_pm_close_subflow(msk); 1604 return err; 1605 } 1606 1607 static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) 1608 { 1609 #ifdef CONFIG_SOCK_CGROUP_DATA 1610 struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, 1611 *child_skcd = &child->sk_cgrp_data; 1612 1613 /* only the additional subflows created by kworkers have to be modified */ 1614 if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != 1615 cgroup_id(sock_cgroup_ptr(child_skcd))) { 1616 #ifdef CONFIG_MEMCG 1617 struct mem_cgroup *memcg = parent->sk_memcg; 1618 1619 mem_cgroup_sk_free(child); 1620 if (memcg && css_tryget(&memcg->css)) 1621 child->sk_memcg = memcg; 1622 #endif /* CONFIG_MEMCG */ 1623 1624 cgroup_sk_free(child_skcd); 1625 *child_skcd = *parent_skcd; 1626 cgroup_sk_clone(child_skcd); 1627 } 1628 #endif /* CONFIG_SOCK_CGROUP_DATA */ 1629 } 1630 1631 static void mptcp_subflow_ops_override(struct sock *ssk) 1632 { 1633 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1634 if (ssk->sk_prot == &tcpv6_prot) 1635 ssk->sk_prot = &tcpv6_prot_override; 1636 else 1637 #endif 1638 ssk->sk_prot = &tcp_prot_override; 1639 } 1640 1641 static void mptcp_subflow_ops_undo_override(struct sock *ssk) 1642 { 1643 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1644 if (ssk->sk_prot == &tcpv6_prot_override) 1645 ssk->sk_prot = &tcpv6_prot; 1646 else 1647 #endif 1648 ssk->sk_prot = &tcp_prot; 1649 } 1650 1651 int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, 1652 struct socket **new_sock) 1653 { 1654 struct mptcp_subflow_context *subflow; 1655 struct net *net = sock_net(sk); 1656 struct socket *sf; 1657 int err; 1658 1659 /* un-accepted server sockets can reach here - on bad configuration 1660 * bail early to avoid greater trouble later 1661 */ 1662 if (unlikely(!sk->sk_socket)) 1663 return -EINVAL; 1664 1665 err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); 1666 if (err) 1667 return err; 1668 1669 lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); 1670 1671 err = security_mptcp_add_subflow(sk, sf->sk); 1672 if (err) 1673 goto err_free; 1674 1675 /* the newly created socket has to be in the same cgroup as its parent */ 1676 mptcp_attach_cgroup(sk, sf->sk); 1677 1678 /* kernel sockets do not by default acquire net ref, but TCP timer 1679 * needs it. 1680 * Update ns_tracker to current stack trace and refcounted tracker. 1681 */ 1682 __netns_tracker_free(net, &sf->sk->ns_tracker, false); 1683 sf->sk->sk_net_refcnt = 1; 1684 get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); 1685 sock_inuse_add(net, 1); 1686 err = tcp_set_ulp(sf->sk, "mptcp"); 1687 if (err) 1688 goto err_free; 1689 1690 mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); 1691 release_sock(sf->sk); 1692 1693 /* the newly created socket really belongs to the owning MPTCP master 1694 * socket, even if for additional subflows the allocation is performed 1695 * by a kernel workqueue. Adjust inode references, so that the 1696 * procfs/diag interfaces really show this one belonging to the correct 1697 * user. 1698 */ 1699 SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; 1700 SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; 1701 SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; 1702 1703 subflow = mptcp_subflow_ctx(sf->sk); 1704 pr_debug("subflow=%p", subflow); 1705 1706 *new_sock = sf; 1707 sock_hold(sk); 1708 subflow->conn = sk; 1709 mptcp_subflow_ops_override(sf->sk); 1710 1711 return 0; 1712 1713 err_free: 1714 release_sock(sf->sk); 1715 sock_release(sf); 1716 return err; 1717 } 1718 1719 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, 1720 gfp_t priority) 1721 { 1722 struct inet_connection_sock *icsk = inet_csk(sk); 1723 struct mptcp_subflow_context *ctx; 1724 1725 ctx = kzalloc(sizeof(*ctx), priority); 1726 if (!ctx) 1727 return NULL; 1728 1729 rcu_assign_pointer(icsk->icsk_ulp_data, ctx); 1730 INIT_LIST_HEAD(&ctx->node); 1731 INIT_LIST_HEAD(&ctx->delegated_node); 1732 1733 pr_debug("subflow=%p", ctx); 1734 1735 ctx->tcp_sock = sk; 1736 1737 return ctx; 1738 } 1739 1740 static void __subflow_state_change(struct sock *sk) 1741 { 1742 struct socket_wq *wq; 1743 1744 rcu_read_lock(); 1745 wq = rcu_dereference(sk->sk_wq); 1746 if (skwq_has_sleeper(wq)) 1747 wake_up_interruptible_all(&wq->wait); 1748 rcu_read_unlock(); 1749 } 1750 1751 static bool subflow_is_done(const struct sock *sk) 1752 { 1753 return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; 1754 } 1755 1756 static void subflow_state_change(struct sock *sk) 1757 { 1758 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1759 struct sock *parent = subflow->conn; 1760 struct mptcp_sock *msk; 1761 1762 __subflow_state_change(sk); 1763 1764 msk = mptcp_sk(parent); 1765 if (subflow_simultaneous_connect(sk)) { 1766 mptcp_do_fallback(sk); 1767 pr_fallback(msk); 1768 subflow->conn_finished = 1; 1769 mptcp_propagate_state(parent, sk, subflow, NULL); 1770 } 1771 1772 /* as recvmsg() does not acquire the subflow socket for ssk selection 1773 * a fin packet carrying a DSS can be unnoticed if we don't trigger 1774 * the data available machinery here. 1775 */ 1776 if (mptcp_subflow_data_available(sk)) 1777 mptcp_data_ready(parent, sk); 1778 else if (unlikely(sk->sk_err)) 1779 subflow_error_report(sk); 1780 1781 subflow_sched_work_if_closed(mptcp_sk(parent), sk); 1782 1783 /* when the fallback subflow closes the rx side, trigger a 'dummy' 1784 * ingress data fin, so that the msk state will follow along 1785 */ 1786 if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && 1787 mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) 1788 mptcp_schedule_work(parent); 1789 } 1790 1791 void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) 1792 { 1793 struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; 1794 struct request_sock *req, *head, *tail; 1795 struct mptcp_subflow_context *subflow; 1796 struct sock *sk, *ssk; 1797 1798 /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. 1799 * Splice the req list, so that accept() can not reach the pending ssk after 1800 * the listener socket is released below. 1801 */ 1802 spin_lock_bh(&queue->rskq_lock); 1803 head = queue->rskq_accept_head; 1804 tail = queue->rskq_accept_tail; 1805 queue->rskq_accept_head = NULL; 1806 queue->rskq_accept_tail = NULL; 1807 spin_unlock_bh(&queue->rskq_lock); 1808 1809 if (!head) 1810 return; 1811 1812 /* can't acquire the msk socket lock under the subflow one, 1813 * or will cause ABBA deadlock 1814 */ 1815 release_sock(listener_ssk); 1816 1817 for (req = head; req; req = req->dl_next) { 1818 ssk = req->sk; 1819 if (!sk_is_mptcp(ssk)) 1820 continue; 1821 1822 subflow = mptcp_subflow_ctx(ssk); 1823 if (!subflow || !subflow->conn) 1824 continue; 1825 1826 sk = subflow->conn; 1827 sock_hold(sk); 1828 1829 lock_sock_nested(sk, SINGLE_DEPTH_NESTING); 1830 __mptcp_unaccepted_force_close(sk); 1831 release_sock(sk); 1832 1833 /* lockdep will report a false positive ABBA deadlock 1834 * between cancel_work_sync and the listener socket. 1835 * The involved locks belong to different sockets WRT 1836 * the existing AB chain. 1837 * Using a per socket key is problematic as key 1838 * deregistration requires process context and must be 1839 * performed at socket disposal time, in atomic 1840 * context. 1841 * Just tell lockdep to consider the listener socket 1842 * released here. 1843 */ 1844 mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); 1845 mptcp_cancel_work(sk); 1846 mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); 1847 1848 sock_put(sk); 1849 } 1850 1851 /* we are still under the listener msk socket lock */ 1852 lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); 1853 1854 /* restore the listener queue, to let the TCP code clean it up */ 1855 spin_lock_bh(&queue->rskq_lock); 1856 WARN_ON_ONCE(queue->rskq_accept_head); 1857 queue->rskq_accept_head = head; 1858 queue->rskq_accept_tail = tail; 1859 spin_unlock_bh(&queue->rskq_lock); 1860 } 1861 1862 static int subflow_ulp_init(struct sock *sk) 1863 { 1864 struct inet_connection_sock *icsk = inet_csk(sk); 1865 struct mptcp_subflow_context *ctx; 1866 struct tcp_sock *tp = tcp_sk(sk); 1867 int err = 0; 1868 1869 /* disallow attaching ULP to a socket unless it has been 1870 * created with sock_create_kern() 1871 */ 1872 if (!sk->sk_kern_sock) { 1873 err = -EOPNOTSUPP; 1874 goto out; 1875 } 1876 1877 ctx = subflow_create_ctx(sk, GFP_KERNEL); 1878 if (!ctx) { 1879 err = -ENOMEM; 1880 goto out; 1881 } 1882 1883 pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); 1884 1885 tp->is_mptcp = 1; 1886 ctx->icsk_af_ops = icsk->icsk_af_ops; 1887 icsk->icsk_af_ops = subflow_default_af_ops(sk); 1888 ctx->tcp_state_change = sk->sk_state_change; 1889 ctx->tcp_error_report = sk->sk_error_report; 1890 1891 WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); 1892 WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); 1893 1894 sk->sk_data_ready = subflow_data_ready; 1895 sk->sk_write_space = subflow_write_space; 1896 sk->sk_state_change = subflow_state_change; 1897 sk->sk_error_report = subflow_error_report; 1898 out: 1899 return err; 1900 } 1901 1902 static void subflow_ulp_release(struct sock *ssk) 1903 { 1904 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); 1905 bool release = true; 1906 struct sock *sk; 1907 1908 if (!ctx) 1909 return; 1910 1911 sk = ctx->conn; 1912 if (sk) { 1913 /* if the msk has been orphaned, keep the ctx 1914 * alive, will be freed by __mptcp_close_ssk(), 1915 * when the subflow is still unaccepted 1916 */ 1917 release = ctx->disposable || list_empty(&ctx->node); 1918 1919 /* inet_child_forget() does not call sk_state_change(), 1920 * explicitly trigger the socket close machinery 1921 */ 1922 if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, 1923 &mptcp_sk(sk)->flags)) 1924 mptcp_schedule_work(sk); 1925 sock_put(sk); 1926 } 1927 1928 mptcp_subflow_ops_undo_override(ssk); 1929 if (release) 1930 kfree_rcu(ctx, rcu); 1931 } 1932 1933 static void subflow_ulp_clone(const struct request_sock *req, 1934 struct sock *newsk, 1935 const gfp_t priority) 1936 { 1937 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 1938 struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); 1939 struct mptcp_subflow_context *new_ctx; 1940 1941 if (!tcp_rsk(req)->is_mptcp || 1942 (!subflow_req->mp_capable && !subflow_req->mp_join)) { 1943 subflow_ulp_fallback(newsk, old_ctx); 1944 return; 1945 } 1946 1947 new_ctx = subflow_create_ctx(newsk, priority); 1948 if (!new_ctx) { 1949 subflow_ulp_fallback(newsk, old_ctx); 1950 return; 1951 } 1952 1953 new_ctx->conn_finished = 1; 1954 new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; 1955 new_ctx->tcp_state_change = old_ctx->tcp_state_change; 1956 new_ctx->tcp_error_report = old_ctx->tcp_error_report; 1957 new_ctx->rel_write_seq = 1; 1958 new_ctx->tcp_sock = newsk; 1959 1960 if (subflow_req->mp_capable) { 1961 /* see comments in subflow_syn_recv_sock(), MPTCP connection 1962 * is fully established only after we receive the remote key 1963 */ 1964 new_ctx->mp_capable = 1; 1965 new_ctx->local_key = subflow_req->local_key; 1966 new_ctx->token = subflow_req->token; 1967 new_ctx->ssn_offset = subflow_req->ssn_offset; 1968 new_ctx->idsn = subflow_req->idsn; 1969 1970 /* this is the first subflow, id is always 0 */ 1971 new_ctx->local_id_valid = 1; 1972 } else if (subflow_req->mp_join) { 1973 new_ctx->ssn_offset = subflow_req->ssn_offset; 1974 new_ctx->mp_join = 1; 1975 new_ctx->fully_established = 1; 1976 new_ctx->remote_key_valid = 1; 1977 new_ctx->backup = subflow_req->backup; 1978 new_ctx->remote_id = subflow_req->remote_id; 1979 new_ctx->token = subflow_req->token; 1980 new_ctx->thmac = subflow_req->thmac; 1981 1982 /* the subflow req id is valid, fetched via subflow_check_req() 1983 * and subflow_token_join_request() 1984 */ 1985 subflow_set_local_id(new_ctx, subflow_req->local_id); 1986 } 1987 } 1988 1989 static void tcp_release_cb_override(struct sock *ssk) 1990 { 1991 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1992 long status; 1993 1994 /* process and clear all the pending actions, but leave the subflow into 1995 * the napi queue. To respect locking, only the same CPU that originated 1996 * the action can touch the list. mptcp_napi_poll will take care of it. 1997 */ 1998 status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); 1999 if (status) 2000 mptcp_subflow_process_delegated(ssk, status); 2001 2002 tcp_release_cb(ssk); 2003 } 2004 2005 static int tcp_abort_override(struct sock *ssk, int err) 2006 { 2007 /* closing a listener subflow requires a great deal of care. 2008 * keep it simple and just prevent such operation 2009 */ 2010 if (inet_sk_state_load(ssk) == TCP_LISTEN) 2011 return -EINVAL; 2012 2013 return tcp_abort(ssk, err); 2014 } 2015 2016 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { 2017 .name = "mptcp", 2018 .owner = THIS_MODULE, 2019 .init = subflow_ulp_init, 2020 .release = subflow_ulp_release, 2021 .clone = subflow_ulp_clone, 2022 }; 2023 2024 static int subflow_ops_init(struct request_sock_ops *subflow_ops) 2025 { 2026 subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); 2027 2028 subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, 2029 subflow_ops->obj_size, 0, 2030 SLAB_ACCOUNT | 2031 SLAB_TYPESAFE_BY_RCU, 2032 NULL); 2033 if (!subflow_ops->slab) 2034 return -ENOMEM; 2035 2036 return 0; 2037 } 2038 2039 void __init mptcp_subflow_init(void) 2040 { 2041 mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; 2042 mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; 2043 mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; 2044 2045 if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) 2046 panic("MPTCP: failed to init subflow v4 request sock ops\n"); 2047 2048 subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; 2049 subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; 2050 subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; 2051 2052 subflow_specific = ipv4_specific; 2053 subflow_specific.conn_request = subflow_v4_conn_request; 2054 subflow_specific.syn_recv_sock = subflow_syn_recv_sock; 2055 subflow_specific.sk_rx_dst_set = subflow_finish_connect; 2056 subflow_specific.rebuild_header = subflow_rebuild_header; 2057 2058 tcp_prot_override = tcp_prot; 2059 tcp_prot_override.release_cb = tcp_release_cb_override; 2060 tcp_prot_override.diag_destroy = tcp_abort_override; 2061 2062 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2063 /* In struct mptcp_subflow_request_sock, we assume the TCP request sock 2064 * structures for v4 and v6 have the same size. It should not changed in 2065 * the future but better to make sure to be warned if it is no longer 2066 * the case. 2067 */ 2068 BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); 2069 2070 mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; 2071 mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; 2072 mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; 2073 2074 if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) 2075 panic("MPTCP: failed to init subflow v6 request sock ops\n"); 2076 2077 subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; 2078 subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; 2079 subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; 2080 2081 subflow_v6_specific = ipv6_specific; 2082 subflow_v6_specific.conn_request = subflow_v6_conn_request; 2083 subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; 2084 subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; 2085 subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; 2086 2087 subflow_v6m_specific = subflow_v6_specific; 2088 subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; 2089 subflow_v6m_specific.send_check = ipv4_specific.send_check; 2090 subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; 2091 subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; 2092 subflow_v6m_specific.rebuild_header = subflow_rebuild_header; 2093 2094 tcpv6_prot_override = tcpv6_prot; 2095 tcpv6_prot_override.release_cb = tcp_release_cb_override; 2096 tcpv6_prot_override.diag_destroy = tcp_abort_override; 2097 #endif 2098 2099 mptcp_diag_subflow_init(&subflow_ulp_ops); 2100 2101 if (tcp_register_ulp(&subflow_ulp_ops) != 0) 2102 panic("MPTCP: failed to register subflows to ULP\n"); 2103 } 2104